mesa: rename src/mesa/shader/ to src/mesa/program/
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_ADD] = 2,
48 [OPCODE_CMP] = 3,
49 [OPCODE_DP3] = 2,
50 [OPCODE_DP4] = 2,
51 [OPCODE_DPH] = 2,
52 [OPCODE_MAX] = 2,
53 [OPCODE_MIN] = 2,
54 [OPCODE_MUL] = 2,
55 [OPCODE_SEQ] = 2,
56 [OPCODE_SGE] = 2,
57 [OPCODE_SGT] = 2,
58 [OPCODE_SLE] = 2,
59 [OPCODE_SLT] = 2,
60 [OPCODE_SNE] = 2,
61 [OPCODE_XPD] = 2,
62 };
63
64 /* These opcodes get broken down in a way that allow two
65 * args to be immediates.
66 */
67 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
68 if (arg == 1 || arg == 2)
69 return GL_TRUE;
70 }
71
72 if (opcode > ARRAY_SIZE(opcode_array))
73 return GL_FALSE;
74
75 return arg == opcode_array[opcode] - 1;
76 }
77
78 static struct brw_reg get_tmp( struct brw_vs_compile *c )
79 {
80 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
81
82 if (++c->last_tmp > c->prog_data.total_grf)
83 c->prog_data.total_grf = c->last_tmp;
84
85 return tmp;
86 }
87
88 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
89 {
90 if (tmp.nr == c->last_tmp-1)
91 c->last_tmp--;
92 }
93
/* Release every scratch register at once by rewinding the allocator to
 * its starting point (set in brw_vs_alloc_regs).
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
98
99
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Lays out, in order: r0, user clip planes, constants (push or pull),
 * vertex inputs, outputs (MRFs when they fit, GRF overflow otherwise),
 * program temporaries, address regs, pull-constant staging regs,
 * output-read-back copies, the call stack, and finally the scratch-temp
 * area.  Also derives the URB read length and entry size.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two vec4 planes per GRF, packed into the curbe region. */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* Pull-constant path: still push as many non-reladdr constants as
       * fit, and read the rest from the constant buffer at runtime.
       */
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Skip non-constant files and any relative-addressed access;
	     * those must go through the pull path.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two vec4 push constants per GRF. */
      for (i = 0; i < constant; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for outputs varies per generation (earlier MRFs
    * are reserved -- NOTE(review): presumably for the URB write header,
    * confirm against the URB write code).
    */
   if (intel->gen >= 6)
      mrf = 6;
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position is computed in a GRF, not written directly to MRF. */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    if (mrf < 16) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       /* too many vertex results to fit in MRF, use GRF for overflow */
	       if (!c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging registers for pull-constant reads, one per source slot. */
      for (i = 0; i < 3; i++) {
	 c->current_const[i].index = -1;
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* GRF copies for outputs that are also read as sources (MRFs are
    * write-only).
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* VUE entry size in units of 8 (gen6) or 4 (earlier) registers,
    * including the per-gen header attributes, rounded up.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
336
337
338 /**
339 * If an instruction uses a temp reg both as a src and the dest, we
340 * sometimes need to allocate an intermediate temporary.
341 */
342 static void unalias1( struct brw_vs_compile *c,
343 struct brw_reg dst,
344 struct brw_reg arg0,
345 void (*func)( struct brw_vs_compile *,
346 struct brw_reg,
347 struct brw_reg ))
348 {
349 if (dst.file == arg0.file && dst.nr == arg0.nr) {
350 struct brw_compile *p = &c->func;
351 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
352 func(c, tmp, arg0);
353 brw_MOV(p, dst, tmp);
354 release_tmp(c, tmp);
355 }
356 else {
357 func(c, dst, arg0);
358 }
359 }
360
361 /**
362 * \sa unalias2
363 * Checkes if 2-operand instruction needs an intermediate temporary.
364 */
365 static void unalias2( struct brw_vs_compile *c,
366 struct brw_reg dst,
367 struct brw_reg arg0,
368 struct brw_reg arg1,
369 void (*func)( struct brw_vs_compile *,
370 struct brw_reg,
371 struct brw_reg,
372 struct brw_reg ))
373 {
374 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
375 (dst.file == arg1.file && dst.nr == arg1.nr)) {
376 struct brw_compile *p = &c->func;
377 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
378 func(c, tmp, arg0, arg1);
379 brw_MOV(p, dst, tmp);
380 release_tmp(c, tmp);
381 }
382 else {
383 func(c, dst, arg0, arg1);
384 }
385 }
386
387 /**
388 * \sa unalias2
389 * Checkes if 3-operand instruction needs an intermediate temporary.
390 */
391 static void unalias3( struct brw_vs_compile *c,
392 struct brw_reg dst,
393 struct brw_reg arg0,
394 struct brw_reg arg1,
395 struct brw_reg arg2,
396 void (*func)( struct brw_vs_compile *,
397 struct brw_reg,
398 struct brw_reg,
399 struct brw_reg,
400 struct brw_reg ))
401 {
402 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
403 (dst.file == arg1.file && dst.nr == arg1.nr) ||
404 (dst.file == arg2.file && dst.nr == arg2.nr)) {
405 struct brw_compile *p = &c->func;
406 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
407 func(c, tmp, arg0, arg1, arg2);
408 brw_MOV(p, dst, tmp);
409 release_tmp(c, tmp);
410 }
411 else {
412 func(c, dst, arg0, arg1, arg2);
413 }
414 }
415
/* Shared body for the SEQ/SNE/SLT/... opcodes:
 *   dst = (arg0 <cond> arg1) ? 1.0 : 0.0
 *
 * dst is first cleared to 0.0; the CMP to a null register sets the flag
 * register, and the following MOV of 1.0 only lands in the channels
 * where the comparison passed (NOTE(review): relies on brw_CMP enabling
 * predication for subsequent instructions -- confirm in brw_eu_emit.c).
 * The flag value is then forced back to all-ones so later instructions
 * are unaffected.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
429
/* Thin per-condition wrappers around emit_sop() for the six "set on
 * comparison" vertex-program opcodes.
 */

/* dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

/* dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
476
/* CMP opcode: dst = (arg0 < 0) ? arg1 : arg2, per channel.
 *
 * The comparison against 0 sets the flag register; SEL then picks arg1
 * in the channels where the predicate passed and arg2 elsewhere.
 * Predication is explicitly cleared afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
487
/* MAX opcode: dst = max(arg0, arg1), per channel, via predicated SEL. */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

/* MIN opcode: dst = min(arg0, arg1), per channel, via predicated SEL. */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
507
508
/* Emit a single-source math-box operation (EXP, LOG, RSQ, ...).
 *
 * On pre-gen6 hardware the math result cannot be written with a partial
 * writemask or to a non-GRF destination, so in those cases the result
 * goes through a scratch register and is copied out with a MOV.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,                       /* message register used for the send */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
546
547
/* Emit a two-source math-box operation (POW, ...).
 *
 * The second operand travels in message register 3; otherwise this
 * follows the same pre-gen6 writemask workaround as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second math operand is delivered via m3. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
581
582
/* EXP opcode (the ARB_vertex_program 4-component EXP, not a plain e^x):
 *   result.x = 2^floor(arg0.x)   (built by hand from the float bits)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (math box)
 *   result.w = 1.0
 * Caller must guarantee dst does not alias arg0 ("noalias").
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      /* Shifting the biased exponent into bits 30:23 of an integer view
       * of dst.x constructs the float 2^floor(arg0.x) directly.
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
638
639
/* LOG opcode (ARB_vertex_program 4-component LOG):
 *   result.x = exponent of |arg0.x| (frexp-style, via bit manipulation)
 *   result.y = mantissa of |arg0.x| in [1, 2)
 *   result.z = log2(|arg0.x|)       (math box on the mantissa + exponent)
 *   result.w = 1.0
 * Caller must guarantee dst does not alias arg0 ("noalias").
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Need a scratch reg when we can't write dst channels directly, or
    * when Z is requested (Z reads X/Y results back via swizzles).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, then shift the raw exponent field down. */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* Remove the IEEE-754 exponent bias. */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and force the exponent to 0 (bias 127),
       * yielding a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
729
730
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx  r1
 */
/* DST opcode: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w), emitted one
 * enabled channel at a time.  Caller must have unaliased dst already.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
751
752
/* XPD opcode: dst = t x u (cross product) as
 *   dst = t.yzx * u.zxy - t.zxy * u.yzx
 * using MUL into the accumulator followed by MAC.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
761
762
/* LIT opcode (lighting coefficients):
 *   dst = (1, arg0.x, arg0.x > 0 ? max(arg0.y,0)^arg0.w : 0, 1)
 * clamped per the ARB_vertex_program LIT definition.  Caller must have
 * unaliased dst already.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   /* POW (emit_math2) can't target a non-GRF dst, so stage through tmp. */
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Defaults: y,z = 0 and x,w = 1; overwritten below when arg0.x > 0. */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0): the predicated MOV only lands where
       * arg0.y > 0, leaving the 0 default elsewhere.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      /* dst.z = tmp.z ^ arg0.w */
      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): release_tmp() is called even when tmp == dst; it is a
    * no-op unless dst.nr happens to equal the top of the temp stack --
    * confirm that cannot occur for the dst registers passed here.
    */
   release_tmp(c, tmp);
}
804
/* LRP opcode: dst = arg0 * arg1 + (1 - arg0) * arg2, computed as
 *   dst = (1 - arg0);  acc = dst * arg2;  dst = acc + arg0 * arg1
 * The MUL into the null reg leaves its product in the accumulator for
 * the following MAC.  Caller must have unaliased dst already.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
817
/** 3 or 4-component vector normalization: dst = arg0 / |arg0| */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
841
842
/* Fetch a (non-relative-addressed) pull constant for source argIndex of
 * inst into that slot's staging register, reusing the previous fetch when
 * the same constant index is already resident.  Returns the staging reg
 * with the lower vec4 replicated into both halves.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       0,                             /* oword */
                       0,                             /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
881
/* Fetch a relative-addressed (a0.x + Index) pull constant for source
 * argIndex of inst.  Two oword reads are issued -- one per vertex half,
 * using the low and high halves of the address register -- and merged
 * into the slot's staging register.  Never cached, since the address is
 * dynamic.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg const2_reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* fetch the first vec4 */
   brw_dp_READ_4_vs(p,
		    const_reg,                     /* writeback dest */
		    0,                             /* oword */
		    1,                             /* relative indexing? */
		    addrReg,                       /* address register */
		    16 * src->Index,               /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
		    );
   /* second vec4 */
   const2_reg = get_tmp(c);

   /* use upper half of address reg for second read */
   addrReg = stride(addrReg, 0, 4, 0);
   addrReg.subnr = 16;

   brw_dp_READ_4_vs(p,
		    const2_reg,              /* writeback dest */
		    1,                       /* oword */
		    1,                       /* relative indexing? */
		    addrReg,                 /* address register */
		    16 * src->Index,         /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER
		    );

   /* merge the two Owords into the constant register */
   /* const_reg[7..4] = const2_reg[7..4] */
   brw_MOV(p,
	   suboffset(stride(const_reg, 0, 4, 1), 4),
	   suboffset(stride(const2_reg, 0, 4, 1), 4));
   release_tmp(c, const2_reg);

   return const_reg;
}
937
938
939
/* TODO: relative addressing!
 */
/* Map a Mesa program (file, index) register reference to the hardware
 * register preallocated for it in brw_vs_alloc_regs().  All constant-like
 * files share the PROGRAM_STATE_VAR slot array.
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      /* Only a single address register is allocated. */
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:			/* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      /* Should have been lowered or handled by get_src_reg(). */
      assert(0);
      return brw_null_reg();
   }
}
972
973
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Uses the hardware address register (a0) with align1 indirect moves to
 * fetch one vec4 per vertex half into a fresh temp.  The temp is
 * deliberately NOT released -- the caller consumes it afterwards.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* Base byte address of the referenced register plus the vec4 offset. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
1008
1009
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * Tries immediates first (swizzled all-zero / all-one, or a constant
 * whose value is known at compile time and replicated across channels),
 * then falls back to the preallocated register, a reladdr deref, or a
 * pull-constant fetch as appropriate.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      /* All-ZERO / all-ONE swizzles encode literal 0.0 / +-1.0. */
      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only a fully-replicated swizzle maps to a scalar immediate. */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 /* Pushed constants (mapped in constant_map) are read directly;
	  * everything else goes through the data-port pull path.
	  */
	 if (!relAddr && c->constant_map[index] != -1) {
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1119
1120
1121 static void emit_arl( struct brw_vs_compile *c,
1122 struct brw_reg dst,
1123 struct brw_reg arg0 )
1124 {
1125 struct brw_compile *p = &c->func;
1126 struct brw_reg tmp = dst;
1127 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1128
1129 if (need_tmp)
1130 tmp = get_tmp(c);
1131
1132 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
1133 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
1134
1135 if (need_tmp)
1136 release_tmp(c, tmp);
1137 }
1138
1139
1140 /**
1141 * Return the brw reg for the given instruction's src argument.
1142 * Will return mangled results for SWZ op. The emit_swz() function
1143 * ignores this result and recalculates taking extended swizzles into
1144 * account.
1145 */
1146 static struct brw_reg get_arg( struct brw_vs_compile *c,
1147 const struct prog_instruction *inst,
1148 GLuint argIndex )
1149 {
1150 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1151 struct brw_reg reg;
1152
1153 if (src->File == PROGRAM_UNDEFINED)
1154 return brw_null_reg();
1155
1156 reg = get_src_reg(c, inst, argIndex);
1157
1158 /* Convert 3-bit swizzle to 2-bit.
1159 */
1160 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1161 GET_SWZ(src->Swizzle, 1),
1162 GET_SWZ(src->Swizzle, 2),
1163 GET_SWZ(src->Swizzle, 3));
1164
1165 /* Note this is ok for non-swizzle instructions:
1166 */
1167 reg.negate = src->Negate ? 1 : 0;
1168
1169 return reg;
1170 }
1171
1172
1173 /**
1174 * Get brw register for the given program dest register.
1175 */
1176 static struct brw_reg get_dst( struct brw_vs_compile *c,
1177 struct prog_dst_register dst )
1178 {
1179 struct brw_reg reg;
1180
1181 switch (dst.File) {
1182 case PROGRAM_TEMPORARY:
1183 case PROGRAM_OUTPUT:
1184 assert(c->regs[dst.File][dst.Index].nr != 0);
1185 reg = c->regs[dst.File][dst.Index];
1186 break;
1187 case PROGRAM_ADDRESS:
1188 assert(dst.Index == 0);
1189 reg = c->regs[dst.File][dst.Index];
1190 break;
1191 case PROGRAM_UNDEFINED:
1192 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1193 reg = brw_null_reg();
1194 break;
1195 default:
1196 assert(0);
1197 reg = brw_null_reg();
1198 }
1199
1200 reg.dw1.bits.writemask = dst.WriteMask;
1201
1202 return reg;
1203 }
1204
1205
/**
 * Emit OPCODE_SWZ, the extended-swizzle MOV.
 *
 * The Mesa extended swizzle allows ZERO and ONE as per-channel sources
 * in addition to X/Y/Z/W, plus per-channel negation; the hardware
 * swizzle field cannot encode that, so the result is assembled from up
 * to four MOVs, each writing the subset of channels that share a source
 * kind (real channel / zero / one), followed by an optional negate pass.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;       /* channels sourced from SWIZZLE_ZERO */
   GLuint ones_mask = 0;        /* channels sourced from SWIZZLE_ONE */
   GLuint src_mask = 0;         /* channels sourced from a real src channel */
   GLubyte src_swz[4];          /* which src channel feeds each dst channel */
   /* A temp is needed for the negate pass when dst can't be read back
    * (message registers are write-only).
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by the kind of source it takes. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask, so it doubles directly as the
    * writemask selecting which channels to negate in place.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1274
1275
1276 /**
1277 * Post-vertex-program processing. Send the results to the URB.
1278 */
/**
 * Post-vertex-program processing. Send the results to the URB.
 *
 * Builds the per-generation vertex header (clip flags, point size, NDC
 * position) in the message registers, then issues one URB write -- or
 * two, when the outputs overflow the MRF.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   /* Fixed-function edge flag handling: copy the edgeflag input
    * through to the corresponding output when requested by the key.
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      /* Pack the point size into the header's expected bitfield:
       * scale by 1<<11 then mask to an 11-bit field at bit 8.
       */
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set one clip-flag bit per user clip plane the vertex is
       * outside of (DP4 < 0), using the conditional-mod + predicated
       * OR pattern.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size, clip flags, or workaround needed: zero m1. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   if (intel->gen >= 6) {
      /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
       * m5 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), pos);
      brw_MOV(p, offset(m0, 5), pos);
      len_vertex_header = 4;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      brw_MOV(p, offset(m0, 7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      len_vertex_header = 2;
   }

   /* This write is the end-of-thread only if everything fits in one
    * message (no overflow write follows).
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		/* starting mrf reg nr */
		 c->r0,		/* src */
		 0,		/* allocate */
		 1,		/* used */
		 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
		 0,		/* response len */
		 eot, 		/* eot */
		 eot, 		/* writes complete */
		 0, 		/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1449
1450 static GLboolean
1451 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1452 {
1453 struct brw_compile *p = &c->func;
1454 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1455
1456 if (p->nr_insn == 0)
1457 return GL_FALSE;
1458
1459 if (val.address_mode != BRW_ADDRESS_DIRECT)
1460 return GL_FALSE;
1461
1462 switch (prev_insn->header.opcode) {
1463 case BRW_OPCODE_MOV:
1464 case BRW_OPCODE_MAC:
1465 case BRW_OPCODE_MUL:
1466 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1467 prev_insn->header.execution_size == val.width &&
1468 prev_insn->bits1.da1.dest_reg_file == val.file &&
1469 prev_insn->bits1.da1.dest_reg_type == val.type &&
1470 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1471 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1472 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1473 prev_insn->bits1.da16.dest_writemask == 0xf)
1474 return GL_TRUE;
1475 else
1476 return GL_FALSE;
1477 default:
1478 return GL_FALSE;
1479 }
1480 }
1481
1482 static uint32_t
1483 get_predicate(const struct prog_instruction *inst)
1484 {
1485 if (inst->DstReg.CondMask == COND_TR)
1486 return BRW_PREDICATE_NONE;
1487
1488 /* All of GLSL only produces predicates for COND_NE and one channel per
1489 * vector. Fail badly if someone starts doing something else, as it might
1490 * mean infinite looping or something.
1491 *
1492 * We'd like to support all the condition codes, but our hardware doesn't
1493 * quite match the Mesa IR, which is modeled after the NV extensions. For
1494 * those, the instruction may update the condition codes or not, then any
1495 * later instruction may use one of those condition codes. For gen4, the
1496 * instruction may update the flags register based on one of the condition
1497 * codes output by the instruction, and then further instructions may
1498 * predicate on that. We can probably support this, but it won't
1499 * necessarily be easy.
1500 */
1501 assert(inst->DstReg.CondMask == COND_NE);
1502
1503 switch (inst->DstReg.CondSwizzle) {
1504 case SWIZZLE_XXXX:
1505 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1506 case SWIZZLE_YYYY:
1507 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1508 case SWIZZLE_ZZZZ:
1509 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1510 case SWIZZLE_WWWW:
1511 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1512 default:
1513 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1514 inst->DstReg.CondMask);
1515 return BRW_PREDICATE_NORMAL;
1516 }
1517 }
1518
1519 /* Emit the vertex program instructions here.
1520 */
1521 void brw_vs_emit(struct brw_vs_compile *c )
1522 {
1523 #define MAX_IF_DEPTH 32
1524 #define MAX_LOOP_DEPTH 32
1525 struct brw_compile *p = &c->func;
1526 struct brw_context *brw = p->brw;
1527 struct intel_context *intel = &brw->intel;
1528 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1529 GLuint insn, if_depth = 0, loop_depth = 0;
1530 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1531 const struct brw_indirect stack_index = brw_indirect(0, 0);
1532 GLuint index;
1533 GLuint file;
1534
1535 if (INTEL_DEBUG & DEBUG_VS) {
1536 printf("vs-mesa:\n");
1537 _mesa_print_program(&c->vp->program.Base);
1538 printf("\n");
1539 }
1540
1541 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1542 brw_set_access_mode(p, BRW_ALIGN_16);
1543
1544 for (insn = 0; insn < nr_insns; insn++) {
1545 GLuint i;
1546 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1547
1548 /* Message registers can't be read, so copy the output into GRF
1549 * register if they are used in source registers
1550 */
1551 for (i = 0; i < 3; i++) {
1552 struct prog_src_register *src = &inst->SrcReg[i];
1553 GLuint index = src->Index;
1554 GLuint file = src->File;
1555 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1556 c->output_regs[index].used_in_src = GL_TRUE;
1557 }
1558
1559 switch (inst->Opcode) {
1560 case OPCODE_CAL:
1561 case OPCODE_RET:
1562 c->needs_stack = GL_TRUE;
1563 break;
1564 default:
1565 break;
1566 }
1567 }
1568
1569 /* Static register allocation
1570 */
1571 brw_vs_alloc_regs(c);
1572
1573 if (c->needs_stack)
1574 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1575
1576 for (insn = 0; insn < nr_insns; insn++) {
1577
1578 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1579 struct brw_reg args[3], dst;
1580 GLuint i;
1581
1582 #if 0
1583 printf("%d: ", insn);
1584 _mesa_print_instruction(inst);
1585 #endif
1586
1587 /* Get argument regs. SWZ is special and does this itself.
1588 */
1589 if (inst->Opcode != OPCODE_SWZ)
1590 for (i = 0; i < 3; i++) {
1591 const struct prog_src_register *src = &inst->SrcReg[i];
1592 index = src->Index;
1593 file = src->File;
1594 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1595 args[i] = c->output_regs[index].reg;
1596 else
1597 args[i] = get_arg(c, inst, i);
1598 }
1599
1600 /* Get dest regs. Note that it is possible for a reg to be both
1601 * dst and arg, given the static allocation of registers. So
1602 * care needs to be taken emitting multi-operation instructions.
1603 */
1604 index = inst->DstReg.Index;
1605 file = inst->DstReg.File;
1606 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1607 dst = c->output_regs[index].reg;
1608 else
1609 dst = get_dst(c, inst->DstReg);
1610
1611 if (inst->SaturateMode != SATURATE_OFF) {
1612 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1613 inst->SaturateMode);
1614 }
1615
1616 switch (inst->Opcode) {
1617 case OPCODE_ABS:
1618 brw_MOV(p, dst, brw_abs(args[0]));
1619 break;
1620 case OPCODE_ADD:
1621 brw_ADD(p, dst, args[0], args[1]);
1622 break;
1623 case OPCODE_COS:
1624 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1625 break;
1626 case OPCODE_DP3:
1627 brw_DP3(p, dst, args[0], args[1]);
1628 break;
1629 case OPCODE_DP4:
1630 brw_DP4(p, dst, args[0], args[1]);
1631 break;
1632 case OPCODE_DPH:
1633 brw_DPH(p, dst, args[0], args[1]);
1634 break;
1635 case OPCODE_NRM3:
1636 emit_nrm(c, dst, args[0], 3);
1637 break;
1638 case OPCODE_NRM4:
1639 emit_nrm(c, dst, args[0], 4);
1640 break;
1641 case OPCODE_DST:
1642 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1643 break;
1644 case OPCODE_EXP:
1645 unalias1(c, dst, args[0], emit_exp_noalias);
1646 break;
1647 case OPCODE_EX2:
1648 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1649 break;
1650 case OPCODE_ARL:
1651 emit_arl(c, dst, args[0]);
1652 break;
1653 case OPCODE_FLR:
1654 brw_RNDD(p, dst, args[0]);
1655 break;
1656 case OPCODE_FRC:
1657 brw_FRC(p, dst, args[0]);
1658 break;
1659 case OPCODE_LOG:
1660 unalias1(c, dst, args[0], emit_log_noalias);
1661 break;
1662 case OPCODE_LG2:
1663 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1664 break;
1665 case OPCODE_LIT:
1666 unalias1(c, dst, args[0], emit_lit_noalias);
1667 break;
1668 case OPCODE_LRP:
1669 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1670 break;
1671 case OPCODE_MAD:
1672 if (!accumulator_contains(c, args[2]))
1673 brw_MOV(p, brw_acc_reg(), args[2]);
1674 brw_MAC(p, dst, args[0], args[1]);
1675 break;
1676 case OPCODE_CMP:
1677 emit_cmp(p, dst, args[0], args[1], args[2]);
1678 break;
1679 case OPCODE_MAX:
1680 emit_max(p, dst, args[0], args[1]);
1681 break;
1682 case OPCODE_MIN:
1683 emit_min(p, dst, args[0], args[1]);
1684 break;
1685 case OPCODE_MOV:
1686 brw_MOV(p, dst, args[0]);
1687 break;
1688 case OPCODE_MUL:
1689 brw_MUL(p, dst, args[0], args[1]);
1690 break;
1691 case OPCODE_POW:
1692 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1693 break;
1694 case OPCODE_RCP:
1695 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1696 break;
1697 case OPCODE_RSQ:
1698 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1699 break;
1700
1701 case OPCODE_SEQ:
1702 unalias2(c, dst, args[0], args[1], emit_seq);
1703 break;
1704 case OPCODE_SIN:
1705 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1706 break;
1707 case OPCODE_SNE:
1708 unalias2(c, dst, args[0], args[1], emit_sne);
1709 break;
1710 case OPCODE_SGE:
1711 unalias2(c, dst, args[0], args[1], emit_sge);
1712 break;
1713 case OPCODE_SGT:
1714 unalias2(c, dst, args[0], args[1], emit_sgt);
1715 break;
1716 case OPCODE_SLT:
1717 unalias2(c, dst, args[0], args[1], emit_slt);
1718 break;
1719 case OPCODE_SLE:
1720 unalias2(c, dst, args[0], args[1], emit_sle);
1721 break;
1722 case OPCODE_SUB:
1723 brw_ADD(p, dst, args[0], negate(args[1]));
1724 break;
1725 case OPCODE_SWZ:
1726 /* The args[0] value can't be used here as it won't have
1727 * correctly encoded the full swizzle:
1728 */
1729 emit_swz(c, dst, inst);
1730 break;
1731 case OPCODE_TRUNC:
1732 /* round toward zero */
1733 brw_RNDZ(p, dst, args[0]);
1734 break;
1735 case OPCODE_XPD:
1736 emit_xpd(p, dst, args[0], args[1]);
1737 break;
1738 case OPCODE_IF:
1739 assert(if_depth < MAX_IF_DEPTH);
1740 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1741 /* Note that brw_IF smashes the predicate_control field. */
1742 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1743 if_depth++;
1744 break;
1745 case OPCODE_ELSE:
1746 assert(if_depth > 0);
1747 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1748 break;
1749 case OPCODE_ENDIF:
1750 assert(if_depth > 0);
1751 brw_ENDIF(p, if_inst[--if_depth]);
1752 break;
1753 case OPCODE_BGNLOOP:
1754 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1755 break;
1756 case OPCODE_BRK:
1757 brw_set_predicate_control(p, get_predicate(inst));
1758 brw_BREAK(p);
1759 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1760 break;
1761 case OPCODE_CONT:
1762 brw_set_predicate_control(p, get_predicate(inst));
1763 brw_CONT(p);
1764 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1765 break;
1766 case OPCODE_ENDLOOP:
1767 {
1768 struct brw_instruction *inst0, *inst1;
1769 GLuint br = 1;
1770
1771 loop_depth--;
1772
1773 if (intel->gen == 5)
1774 br = 2;
1775
1776 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1777 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1778 while (inst0 > loop_inst[loop_depth]) {
1779 inst0--;
1780 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1781 inst0->bits3.if_else.jump_count == 0) {
1782 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1783 inst0->bits3.if_else.pop_count = 0;
1784 }
1785 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1786 inst0->bits3.if_else.jump_count == 0) {
1787 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1788 inst0->bits3.if_else.pop_count = 0;
1789 }
1790 }
1791 }
1792 break;
1793 case OPCODE_BRA:
1794 brw_set_predicate_control(p, get_predicate(inst));
1795 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1796 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1797 break;
1798 case OPCODE_CAL:
1799 brw_set_access_mode(p, BRW_ALIGN_1);
1800 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1801 brw_set_access_mode(p, BRW_ALIGN_16);
1802 brw_ADD(p, get_addr_reg(stack_index),
1803 get_addr_reg(stack_index), brw_imm_d(4));
1804 brw_save_call(p, inst->Comment, p->nr_insn);
1805 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1806 break;
1807 case OPCODE_RET:
1808 brw_ADD(p, get_addr_reg(stack_index),
1809 get_addr_reg(stack_index), brw_imm_d(-4));
1810 brw_set_access_mode(p, BRW_ALIGN_1);
1811 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1812 brw_set_access_mode(p, BRW_ALIGN_16);
1813 break;
1814 case OPCODE_END:
1815 emit_vertex_write(c);
1816 break;
1817 case OPCODE_PRINT:
1818 /* no-op */
1819 break;
1820 case OPCODE_BGNSUB:
1821 brw_save_label(p, inst->Comment, p->nr_insn);
1822 break;
1823 case OPCODE_ENDSUB:
1824 /* no-op */
1825 break;
1826 default:
1827 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1828 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1829 _mesa_opcode_string(inst->Opcode) :
1830 "unknown");
1831 }
1832
1833 /* Set the predication update on the last instruction of the native
1834 * instruction sequence.
1835 *
1836 * This would be problematic if it was set on a math instruction,
1837 * but that shouldn't be the case with the current GLSL compiler.
1838 */
1839 if (inst->CondUpdate) {
1840 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1841
1842 assert(hw_insn->header.destreg__conditionalmod == 0);
1843 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1844 }
1845
1846 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1847 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1848 && c->output_regs[inst->DstReg.Index].used_in_src) {
1849 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1850 }
1851
1852 /* Result color clamping.
1853 *
1854 * When destination register is an output register and
1855 * it's primary/secondary front/back color, we have to clamp
1856 * the result to [0,1]. This is done by enabling the
1857 * saturation bit for the last instruction.
1858 *
1859 * We don't use brw_set_saturate() as it modifies
1860 * p->current->header.saturate, which affects all the subsequent
1861 * instructions. Instead, we directly modify the header
1862 * of the last (already stored) instruction.
1863 */
1864 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1865 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1866 || (inst->DstReg.Index == VERT_RESULT_COL1)
1867 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1868 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1869 p->store[p->nr_insn-1].header.saturate = 1;
1870 }
1871 }
1872
1873 release_tmps(c);
1874 }
1875
1876 brw_resolve_cals(p);
1877
1878 brw_optimize(p);
1879
1880 if (INTEL_DEBUG & DEBUG_VS) {
1881 int i;
1882
1883 printf("vs-native:\n");
1884 for (i = 0; i < p->nr_insn; i++)
1885 brw_disasm(stderr, &p->store[i], intel->gen);
1886 printf("\n");
1887 }
1888 }