Merge commit 'origin/master' into gallium-sw-api-2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
/**
 * Record the mapping of a Mesa register to a hardware register.
 * \param file       Mesa register file (one of PROGRAM_x)
 * \param index      register index within that file
 * \param component  channel (X=0, Y=1, Z=2, W=3)
 * \param reg        the hardware register now backing that channel
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current position in the temp-register stack; pass it to
 *         release_tmps() to free everything allocated after this point.
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
166
/**
 * Return the vec8 GRF backing the temp at position \p index in the
 * temp-register stack (see mark_tmps()/alloc_tmp()).
 */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
    return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
    c->tmp_index = mark;
}
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Also decides whether constants live in the CURBE (a GRF section) or in
 * a real constant buffer fetched via dataport reads, fills in
 * c->prog_data layout fields, and reserves the payload/mask/stack regs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
    int i, j;
    struct brw_reg reg;
    int urb_read_length = 0;
    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
    GLuint reg_index = 0;

    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
    c->first_free_grf = 0;

    /* Map the four channels of the depth payload.  Each depth reg is a
     * register pair; channels beyond nr_depth_regs alias GRF 0.
     */
    for (i = 0; i < 4; i++) {
        if (i < c->key.nr_depth_regs)
            reg = brw_vec8_grf(i * 2, 0);
        else
            reg = brw_vec8_grf(0, 0);
        set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
    }
    reg_index += 2 * c->key.nr_depth_regs;

    /* constants */
    {
        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

        /* use a real constant buffer, or just use a section of the GRF? */
        /* XXX this heuristic may need adjustment... */
        if ((nr_params + nr_temps) * 4 + reg_index > 80)
            c->fp->use_const_buffer = GL_TRUE;
        else
            c->fp->use_const_buffer = GL_FALSE;
        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

        if (c->fp->use_const_buffer) {
            /* We'll use a real constant buffer and fetch constants from
             * it with a dataport read message.
             */

            /* number of float constants in CURBE */
            c->prog_data.nr_params = 0;
        }
        else {
            const struct gl_program_parameter_list *plist =
                c->fp->program.Base.Parameters;
            int index = 0;

            /* number of float constants in CURBE */
            c->prog_data.nr_params = 4 * nr_params;

            /* loop over program constants (float[4]) */
            for (i = 0; i < nr_params; i++) {
                /* loop over XYZW channels */
                for (j = 0; j < 4; j++, index++) {
                    /* 8 floats fit in one GRF; pick reg and subreg */
                    reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                    /* Save pointer to parameter/constant value.
                     * Constants will be copied in prepare_constant_buffer()
                     */
                    c->prog_data.param[index] = &plist->ParameterValues[i][j];
                    set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
                }
            }
            /* number of constant regs used (each reg is float[8]) */
            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
            reg_index += c->nr_creg;
        }
    }

    /* fragment shader inputs: map vertex results onto FP attributes */
    for (i = 0; i < VERT_RESULT_MAX; i++) {
        int fp_input;

        if (i >= VERT_RESULT_VAR0)
            fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
        else if (i <= VERT_RESULT_TEX7)
            fp_input = i;
        else
            fp_input = -1;  /* vertex result with no FP counterpart */

        if (fp_input >= 0 && inputs & (1 << fp_input)) {
            urb_read_length = reg_index;
            reg = brw_vec8_grf(reg_index, 0);
            for (j = 0; j < 4; j++)
                set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
        /* each written vertex output occupies a register pair in the URB,
         * whether or not the FP reads it
         */
        if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            reg_index += 2;
        }
    }

    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
    c->prog_data.urb_read_length = urb_read_length;
    c->prog_data.curb_read_length = c->nr_creg;
    /* one reg for the execution-mask scratch, two for the call stack */
    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
    reg_index++;
    c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
    reg_index += 2;

    /* mark GRF regs [0..reg_index-1] as in-use */
    for (i = 0; i < reg_index; i++)
        prealloc_grf(c, i);

    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
    prealloc_grf(c, 126);
    prealloc_grf(c, 127);

    for (i = 0; i < c->nr_fp_insns; i++) {
        const struct prog_instruction *inst = &c->prog_instructions[i];
        struct brw_reg dst[4];

        switch (inst->Opcode) {
        case OPCODE_TEX:
        case OPCODE_TXB:
            /* Allocate the channels of texture results contiguously,
             * since they are written out that way by the sampler unit.
             */
            for (j = 0; j < 4; j++) {
                dst[j] = get_dst_reg(c, inst, j);
                if (j != 0)
                    assert(dst[j].nr == dst[j - 1].nr + 1);
            }
            break;
        default:
            break;
        }
    }

    /* An instruction may reference up to three constants.
     * They'll be found in these registers.
     * XXX alloc these on demand!
     */
    if (c->fp->use_const_buffer) {
        for (i = 0; i < 3; i++) {
            c->current_const[i].index = -1;
            c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
        }
    }
#if 0
    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * \param subroutine  which subroutine slot to use (one body per slot)
 * \param emit        callback that emits the subroutine body on first use
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );

    if( c->subroutines[ subroutine ] ) {
	/* subroutine previously emitted: reuse existing instructions */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	int here = p->nr_insn;

	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return address = IP + 2 instructions (the "<< 4" converts an
	 * instruction count to bytes: each instruction is 16 bytes),
	 * i.e. the instruction right after the jump below
	 */
	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

	/* jump to the previously-emitted body by adjusting IP */
	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
		 brw_imm_d( ( c->subroutines[ subroutine ] -
			      here - 1 ) << 4 ) );
	brw_pop_insn_state(p);

	release_tmps( c, mark );
    } else {
	/* previously unused subroutine: emit, and mark for later reuse */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	struct brw_instruction *calc;
	int base = p->nr_insn;

	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* placeholder: the immediate 0 is patched below, once we know
	 * how long the subroutine body turned out to be
	 */
	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
	brw_pop_insn_state(p);

	/* record where the body starts so later calls can jump to it */
	c->subroutines[ subroutine ] = p->nr_insn;

	emit( c );

	/* return: restore IP from the saved return address */
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	brw_MOV( p, brw_ip_reg(), return_address );
	brw_pop_insn_state(p);

	/* back-patch the placeholder so return_address = IP + body length,
	 * i.e. the instruction following the inline body
	 */
	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

	release_tmps( c, mark );
    }
}
616
617 static void emit_arl(struct brw_wm_compile *c,
618 const struct prog_instruction *inst)
619 {
620 struct brw_compile *p = &c->func;
621 struct brw_reg src0, addr_reg;
622 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
623 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
624 BRW_ARF_ADDRESS, 0);
625 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
626 brw_MOV(p, addr_reg, src0);
627 brw_set_saturate(p, 0);
628 }
629
630 /**
631 * For GLSL shaders, this KIL will be unconditional.
632 * It may be contained inside an IF/ENDIF structure of course.
633 */
634 static void emit_kil(struct brw_wm_compile *c)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
638 brw_push_insn_state(p);
639 brw_set_mask_control(p, BRW_MASK_DISABLE);
640 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
641 brw_AND(p, depth, c->emit_mask_reg, depth);
642 brw_pop_insn_state(p);
643 }
644
645 static INLINE struct brw_reg high_words( struct brw_reg reg )
646 {
647 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
648 0, 8, 2 );
649 }
650
651 static INLINE struct brw_reg low_words( struct brw_reg reg )
652 {
653 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
654 }
655
656 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
657 {
658 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
659 }
660
661 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
662 {
663 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
664 0, 16, 2 );
665 }
666
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/**
 * Emit the body of the 1D noise subroutine.  The input coordinate is
 * found in the caller's temp (see emit_noise1); the result is written
 * back into that same temp for the caller to read.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
	x0, x1, /* gradients at each end */
	t, tmp[ 2 ], /* float temporaries */
	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* mark - 2 is the caller's input temp: the caller allocated it just
       before invoke_subroutine() allocated the return-address temp */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					   pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
759
760 static void emit_noise1( struct brw_wm_compile *c,
761 const struct prog_instruction *inst )
762 {
763 struct brw_compile *p = &c->func;
764 struct brw_reg src, param, dst;
765 GLuint mask = inst->DstReg.WriteMask;
766 int i;
767 int mark = mark_tmps( c );
768
769 assert( mark == 0 );
770
771 src = get_src_reg( c, inst, 0, 0 );
772
773 param = alloc_tmp( c );
774
775 brw_MOV( p, param, src );
776
777 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
778
779 /* Fill in the result: */
780 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
781 for (i = 0 ; i < 4; i++) {
782 if (mask & (1<<i)) {
783 dst = get_dst_reg(c, inst, i);
784 brw_MOV( p, dst, param );
785 }
786 }
787 if( inst->SaturateMode == SATURATE_ZERO_ONE )
788 brw_set_saturate( p, 0 );
789
790 release_tmps( c, mark );
791 }
792
/**
 * Emit the body of the 2D noise subroutine.  The two input coordinates
 * are found in the caller's temps (see emit_noise2); the result is
 * written back into the first of those temps for the caller to read.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
	t, tmp[ 4 ], /* float temporaries */
	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* mark - 3 / mark - 2 are the caller's two input temps: the caller
       allocated them just before invoke_subroutine() allocated the
       return-address temp */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	     low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
929
930 static void emit_noise2( struct brw_wm_compile *c,
931 const struct prog_instruction *inst )
932 {
933 struct brw_compile *p = &c->func;
934 struct brw_reg src0, src1, param0, param1, dst;
935 GLuint mask = inst->DstReg.WriteMask;
936 int i;
937 int mark = mark_tmps( c );
938
939 assert( mark == 0 );
940
941 src0 = get_src_reg( c, inst, 0, 0 );
942 src1 = get_src_reg( c, inst, 0, 1 );
943
944 param0 = alloc_tmp( c );
945 param1 = alloc_tmp( c );
946
947 brw_MOV( p, param0, src0 );
948 brw_MOV( p, param1, src1 );
949
950 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
951
952 /* Fill in the result: */
953 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
954 for (i = 0 ; i < 4; i++) {
955 if (mask & (1<<i)) {
956 dst = get_dst_reg(c, inst, i);
957 brw_MOV( p, dst, param0 );
958 }
959 }
960 if( inst->SaturateMode == SATURATE_ZERO_ONE )
961 brw_set_saturate( p, 0 );
962
963 release_tmps( c, mark );
964 }
965
966 /**
967 * The three-dimensional case is much like the one- and two- versions above,
968 * but since the number of corners is rapidly growing we now pack 16 16-bit
969 * hashes into each register to extract more parallelism from the EUs.
970 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) allocated the three input coordinates
      immediately before invoking this subroutine, so they sit at fixed
      offsets below our temporary mark.  The result is returned in param0
      (see the final MUL at the end of this function). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift each hash along by 5 bits so the y component below sees a
      different slice of the hash than the x component did. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* prepare t = y-1 for the y component below */
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* NOTE: not a stray assignment -- this prepares t = x-1 ahead of time
      for the front-face x component further below (interleaved to hide
      latency). */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   /* These are the rear (z=0) corners, so the z offset is simply z. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three more multiplies by t supply the t^3 factor:
      6t^5 - 15t^4 + 10t^3 = ( 6t^2 - 15t + 10 ) * t^3. */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* prepare t = y-1 for the y component below */
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); /* prepare t = z-1 for the front z component */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* These are the front (z=1) corners, so the z offset is z-1 (in t). */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1232
1233 static void emit_noise3( struct brw_wm_compile *c,
1234 const struct prog_instruction *inst )
1235 {
1236 struct brw_compile *p = &c->func;
1237 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1238 GLuint mask = inst->DstReg.WriteMask;
1239 int i;
1240 int mark = mark_tmps( c );
1241
1242 assert( mark == 0 );
1243
1244 src0 = get_src_reg( c, inst, 0, 0 );
1245 src1 = get_src_reg( c, inst, 0, 1 );
1246 src2 = get_src_reg( c, inst, 0, 2 );
1247
1248 param0 = alloc_tmp( c );
1249 param1 = alloc_tmp( c );
1250 param2 = alloc_tmp( c );
1251
1252 brw_MOV( p, param0, src0 );
1253 brw_MOV( p, param1, src1 );
1254 brw_MOV( p, param2, src2 );
1255
1256 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1257
1258 /* Fill in the result: */
1259 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1260 for (i = 0 ; i < 4; i++) {
1261 if (mask & (1<<i)) {
1262 dst = get_dst_reg(c, inst, i);
1263 brw_MOV( p, dst, param0 );
1264 }
1265 }
1266 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1267 brw_set_saturate( p, 0 );
1268
1269 release_tmps( c, mark );
1270 }
1271
1272 /**
1273 * For the four-dimensional case, the little micro-optimisation benefits
1274 * we obtain by unrolling all the loops aren't worth the massive bloat it
1275 * now causes. Instead, we loop twice around performing a similar operation
1276 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1277 * code to glue it all together.
1278 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) allocated the four input coordinates
      immediately before invoking this subroutine, so they sit at fixed
      offsets below our temporary mark.  The result is returned in
      param[ 0 ] (see the final MUL at the end). */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   /* Pack the x/y floors into floors[ 0 ] and the z/w floors into
      floors[ 1 ] as low/high 16-bit word pairs. */
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   /* Three more multiplies by t supply the t^3 factor:
      6t^5 - 15t^4 + 10t^3 = ( 6t^2 - 15t + 10 ) * t^3. */
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Advance the hashes by 4 bits (four components now share each
      16-bit hash, versus 5 bits per component in the 3D case). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) -- the flag register set up before
      the loop (clear on pass one, set on pass two) selects which. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); /* prepare t = x-1 for the front face below */

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) ); /* prepare t = z-1 for the front z component */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  This branch is
      predicated on the flag register: clear (pass one) falls through,
      set (pass two) takes the jump.  The jump distance is patched in
      below once the target address is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address (each instruction is 16 bytes, hence the
      << 4 on the instruction-count delta). */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1655
1656 static void emit_noise4( struct brw_wm_compile *c,
1657 const struct prog_instruction *inst )
1658 {
1659 struct brw_compile *p = &c->func;
1660 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1661 GLuint mask = inst->DstReg.WriteMask;
1662 int i;
1663 int mark = mark_tmps( c );
1664
1665 assert( mark == 0 );
1666
1667 src0 = get_src_reg( c, inst, 0, 0 );
1668 src1 = get_src_reg( c, inst, 0, 1 );
1669 src2 = get_src_reg( c, inst, 0, 2 );
1670 src3 = get_src_reg( c, inst, 0, 3 );
1671
1672 param0 = alloc_tmp( c );
1673 param1 = alloc_tmp( c );
1674 param2 = alloc_tmp( c );
1675 param3 = alloc_tmp( c );
1676
1677 brw_MOV( p, param0, src0 );
1678 brw_MOV( p, param1, src1 );
1679 brw_MOV( p, param2, src2 );
1680 brw_MOV( p, param3, src3 );
1681
1682 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1683
1684 /* Fill in the result: */
1685 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1686 for (i = 0 ; i < 4; i++) {
1687 if (mask & (1<<i)) {
1688 dst = get_dst_reg(c, inst, i);
1689 brw_MOV( p, dst, param0 );
1690 }
1691 }
1692 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1693 brw_set_saturate( p, 0 );
1694
1695 release_tmps( c, mark );
1696 }
1697
1698 /**
1699 * Resolve subroutine calls after code emit is done.
1700 */
1701 static void post_wm_emit( struct brw_wm_compile *c )
1702 {
1703 brw_resolve_cals(&c->func);
1704 }
1705
1706 static void
1707 get_argument_regs(struct brw_wm_compile *c,
1708 const struct prog_instruction *inst,
1709 int index,
1710 struct brw_reg *dst,
1711 struct brw_reg *regs,
1712 int mask)
1713 {
1714 struct brw_compile *p = &c->func;
1715 int i, j;
1716
1717 for (i = 0; i < 4; i++) {
1718 if (mask & (1 << i)) {
1719 regs[i] = get_src_reg(c, inst, index, i);
1720
1721 /* Unalias destination registers from our sources. */
1722 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1723 for (j = 0; j < 4; j++) {
1724 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1725 struct brw_reg tmp = alloc_tmp(c);
1726 brw_MOV(p, tmp, regs[i]);
1727 regs[i] = tmp;
1728 break;
1729 }
1730 }
1731 }
1732 }
1733 }
1734 }
1735
1736 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1737 {
1738 struct intel_context *intel = &brw->intel;
1739 #define MAX_IF_DEPTH 32
1740 #define MAX_LOOP_DEPTH 32
1741 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1742 GLuint i, if_depth = 0, loop_depth = 0;
1743 struct brw_compile *p = &c->func;
1744 struct brw_indirect stack_index = brw_indirect(0, 0);
1745
1746 c->out_of_regs = GL_FALSE;
1747
1748 prealloc_reg(c);
1749 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1750 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1751
1752 for (i = 0; i < c->nr_fp_insns; i++) {
1753 const struct prog_instruction *inst = &c->prog_instructions[i];
1754 int dst_flags;
1755 struct brw_reg args[3][4], dst[4];
1756 int j;
1757 int mark = mark_tmps( c );
1758
1759 c->cur_inst = i;
1760
1761 #if 0
1762 printf("Inst %d: ", i);
1763 _mesa_print_instruction(inst);
1764 #endif
1765
1766 /* fetch any constants that this instruction needs */
1767 if (c->fp->use_const_buffer)
1768 fetch_constants(c, inst);
1769
1770 if (inst->Opcode != OPCODE_ARL) {
1771 for (j = 0; j < 4; j++) {
1772 if (inst->DstReg.WriteMask & (1 << j))
1773 dst[j] = get_dst_reg(c, inst, j);
1774 else
1775 dst[j] = brw_null_reg();
1776 }
1777 }
1778 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1779 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1780
1781 dst_flags = inst->DstReg.WriteMask;
1782 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1783 dst_flags |= SATURATE;
1784
1785 if (inst->CondUpdate)
1786 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1787 else
1788 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1789
1790 switch (inst->Opcode) {
1791 case WM_PIXELXY:
1792 emit_pixel_xy(c, dst, dst_flags);
1793 break;
1794 case WM_DELTAXY:
1795 emit_delta_xy(p, dst, dst_flags, args[0]);
1796 break;
1797 case WM_PIXELW:
1798 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1799 break;
1800 case WM_LINTERP:
1801 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1802 break;
1803 case WM_PINTERP:
1804 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1805 break;
1806 case WM_CINTERP:
1807 emit_cinterp(p, dst, dst_flags, args[0]);
1808 break;
1809 case WM_WPOSXY:
1810 emit_wpos_xy(c, dst, dst_flags, args[0]);
1811 break;
1812 case WM_FB_WRITE:
1813 emit_fb_write(c, args[0], args[1], args[2],
1814 INST_AUX_GET_TARGET(inst->Aux),
1815 inst->Aux & INST_AUX_EOT);
1816 break;
1817 case WM_FRONTFACING:
1818 emit_frontfacing(p, dst, dst_flags);
1819 break;
1820 case OPCODE_ADD:
1821 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1822 break;
1823 case OPCODE_ARL:
1824 emit_arl(c, inst);
1825 break;
1826 case OPCODE_FRC:
1827 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1828 break;
1829 case OPCODE_FLR:
1830 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1831 break;
1832 case OPCODE_LRP:
1833 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1834 break;
1835 case OPCODE_TRUNC:
1836 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1837 break;
1838 case OPCODE_MOV:
1839 case OPCODE_SWZ:
1840 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1841 break;
1842 case OPCODE_DP3:
1843 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1844 break;
1845 case OPCODE_DP4:
1846 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1847 break;
1848 case OPCODE_XPD:
1849 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1850 break;
1851 case OPCODE_DPH:
1852 emit_dph(p, dst, dst_flags, args[0], args[1]);
1853 break;
1854 case OPCODE_RCP:
1855 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1856 break;
1857 case OPCODE_RSQ:
1858 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1859 break;
1860 case OPCODE_SIN:
1861 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1862 break;
1863 case OPCODE_COS:
1864 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1865 break;
1866 case OPCODE_EX2:
1867 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1868 break;
1869 case OPCODE_LG2:
1870 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1871 break;
1872 case OPCODE_MIN:
1873 emit_min(p, dst, dst_flags, args[0], args[1]);
1874 break;
1875 case OPCODE_MAX:
1876 emit_max(p, dst, dst_flags, args[0], args[1]);
1877 break;
1878 case OPCODE_DDX:
1879 case OPCODE_DDY:
1880 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1881 args[0]);
1882 break;
1883 case OPCODE_SLT:
1884 emit_sop(p, dst, dst_flags,
1885 BRW_CONDITIONAL_L, args[0], args[1]);
1886 break;
1887 case OPCODE_SLE:
1888 emit_sop(p, dst, dst_flags,
1889 BRW_CONDITIONAL_LE, args[0], args[1]);
1890 break;
1891 case OPCODE_SGT:
1892 emit_sop(p, dst, dst_flags,
1893 BRW_CONDITIONAL_G, args[0], args[1]);
1894 break;
1895 case OPCODE_SGE:
1896 emit_sop(p, dst, dst_flags,
1897 BRW_CONDITIONAL_GE, args[0], args[1]);
1898 break;
1899 case OPCODE_SEQ:
1900 emit_sop(p, dst, dst_flags,
1901 BRW_CONDITIONAL_EQ, args[0], args[1]);
1902 break;
1903 case OPCODE_SNE:
1904 emit_sop(p, dst, dst_flags,
1905 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1906 break;
1907 case OPCODE_MUL:
1908 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1909 break;
1910 case OPCODE_POW:
1911 emit_math2(c, BRW_MATH_FUNCTION_POW,
1912 dst, dst_flags, args[0], args[1]);
1913 break;
1914 case OPCODE_MAD:
1915 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1916 break;
1917 case OPCODE_NOISE1:
1918 emit_noise1(c, inst);
1919 break;
1920 case OPCODE_NOISE2:
1921 emit_noise2(c, inst);
1922 break;
1923 case OPCODE_NOISE3:
1924 emit_noise3(c, inst);
1925 break;
1926 case OPCODE_NOISE4:
1927 emit_noise4(c, inst);
1928 break;
1929 case OPCODE_TEX:
1930 emit_tex(c, dst, dst_flags, args[0],
1931 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1932 0, 1, 0, 0),
1933 inst->TexSrcTarget,
1934 inst->TexSrcUnit,
1935 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1936 break;
1937 case OPCODE_TXB:
1938 emit_txb(c, dst, dst_flags, args[0],
1939 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1940 0, 1, 0, 0),
1941 inst->TexSrcTarget,
1942 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1943 break;
1944 case OPCODE_KIL_NV:
1945 emit_kil(c);
1946 break;
1947 case OPCODE_IF:
1948 assert(if_depth < MAX_IF_DEPTH);
1949 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1950 break;
1951 case OPCODE_ELSE:
1952 assert(if_depth > 0);
1953 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1954 break;
1955 case OPCODE_ENDIF:
1956 assert(if_depth > 0);
1957 brw_ENDIF(p, if_inst[--if_depth]);
1958 break;
1959 case OPCODE_BGNSUB:
1960 brw_save_label(p, inst->Comment, p->nr_insn);
1961 break;
1962 case OPCODE_ENDSUB:
1963 /* no-op */
1964 break;
1965 case OPCODE_CAL:
1966 brw_push_insn_state(p);
1967 brw_set_mask_control(p, BRW_MASK_DISABLE);
1968 brw_set_access_mode(p, BRW_ALIGN_1);
1969 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1970 brw_set_access_mode(p, BRW_ALIGN_16);
1971 brw_ADD(p, get_addr_reg(stack_index),
1972 get_addr_reg(stack_index), brw_imm_d(4));
1973 brw_save_call(&c->func, inst->Comment, p->nr_insn);
1974 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1975 brw_pop_insn_state(p);
1976 break;
1977
1978 case OPCODE_RET:
1979 brw_push_insn_state(p);
1980 brw_set_mask_control(p, BRW_MASK_DISABLE);
1981 brw_ADD(p, get_addr_reg(stack_index),
1982 get_addr_reg(stack_index), brw_imm_d(-4));
1983 brw_set_access_mode(p, BRW_ALIGN_1);
1984 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
1985 brw_set_access_mode(p, BRW_ALIGN_16);
1986 brw_pop_insn_state(p);
1987
1988 break;
1989 case OPCODE_BGNLOOP:
1990 /* XXX may need to invalidate the current_constant regs */
1991 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1992 break;
1993 case OPCODE_BRK:
1994 brw_BREAK(p);
1995 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1996 break;
1997 case OPCODE_CONT:
1998 brw_CONT(p);
1999 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2000 break;
2001 case OPCODE_ENDLOOP:
2002 {
2003 struct brw_instruction *inst0, *inst1;
2004 GLuint br = 1;
2005
2006 if (intel->is_ironlake)
2007 br = 2;
2008
2009 assert(loop_depth > 0);
2010 loop_depth--;
2011 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2012 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2013 while (inst0 > loop_inst[loop_depth]) {
2014 inst0--;
2015 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2016 inst0->bits3.if_else.jump_count == 0) {
2017 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2018 inst0->bits3.if_else.pop_count = 0;
2019 }
2020 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2021 inst0->bits3.if_else.jump_count == 0) {
2022 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2023 inst0->bits3.if_else.pop_count = 0;
2024 }
2025 }
2026 }
2027 break;
2028 default:
2029 printf("unsupported IR in fragment shader %d\n",
2030 inst->Opcode);
2031 }
2032
2033 /* Release temporaries containing any unaliased source regs. */
2034 release_tmps( c, mark );
2035
2036 if (inst->CondUpdate)
2037 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2038 else
2039 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2040 }
2041 post_wm_emit(c);
2042
2043 if (INTEL_DEBUG & DEBUG_WM) {
2044 printf("wm-native:\n");
2045 for (i = 0; i < p->nr_insn; i++)
2046 brw_disasm(stderr, &p->store[i]);
2047 printf("\n");
2048 }
2049 }
2050
2051 /**
2052 * Do GPU code generation for shaders that use GLSL features such as
2053 * flow control. Other shaders will be compiled with the
2054 */
2055 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2056 {
2057 if (INTEL_DEBUG & DEBUG_WM) {
2058 printf("brw_wm_glsl_emit:\n");
2059 }
2060
2061 /* initial instruction translation/simplification */
2062 brw_wm_pass_fp(c);
2063
2064 /* actual code generation */
2065 brw_wm_emit_glsl(brw, c);
2066
2067 if (INTEL_DEBUG & DEBUG_WM) {
2068 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2069 }
2070
2071 c->prog_data.total_grf = num_grf_used(c);
2072 c->prog_data.total_scratch = 0;
2073 }