2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
120 * Preallocate GRF register before code emit.
121 * Do things as simply as possible. Allocate and populate all regs
124 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
126 struct intel_context
*intel
= &c
->func
.brw
->intel
;
127 GLuint i
, reg
= 0, mrf
;
128 int attributes_in_vue
;
129 int first_reladdr_output
;
131 /* Determine whether to use a real constant buffer or use a block
132 * of GRF registers for constants. The later is faster but only
133 * works if everything fits in the GRF.
134 * XXX this heuristic/check may need some fine tuning...
136 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
137 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
138 c
->vp
->use_const_buffer
= GL_TRUE
;
140 c
->vp
->use_const_buffer
= GL_FALSE
;
142 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
144 /* r0 -- reserved as usual
146 c
->r0
= brw_vec8_grf(reg
, 0);
149 /* User clip planes from curbe:
151 if (c
->key
.nr_userclip
) {
152 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
153 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
156 /* Deal with curbe alignment:
158 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
161 /* Vertex program parameters from curbe:
163 if (c
->vp
->use_const_buffer
) {
164 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
167 /* We've got more constants than we can load with the push
168 * mechanism. This is often correlated with reladdr loads where
169 * we should probably be using a pull mechanism anyway to avoid
170 * excessive reading. However, the pull mechanism is slow in
171 * general. So, we try to allocate as many non-reladdr-loaded
172 * constants through the push buffer as we can before giving up.
174 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
176 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
178 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
181 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
182 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
183 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
184 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
185 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
186 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
187 inst
->SrcReg
[arg
].RelAddr
)
190 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
191 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
196 for (i
= 0; i
< constant
; i
++) {
197 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
201 reg
+= (constant
+ 1) / 2;
202 c
->prog_data
.curb_read_length
= reg
- 1;
203 /* XXX 0 causes a bug elsewhere... */
204 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
207 /* use a section of the GRF for constants */
208 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
209 for (i
= 0; i
< nr_params
; i
++) {
210 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
212 reg
+= (nr_params
+ 1) / 2;
213 c
->prog_data
.curb_read_length
= reg
- 1;
215 c
->prog_data
.nr_params
= nr_params
* 4;
218 /* Allocate input regs:
221 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
222 if (c
->prog_data
.inputs_read
& (1 << i
)) {
224 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
228 /* If there are no inputs, we'll still be reading one attribute's worth
229 * because it's required -- see urb_read_length setting.
231 if (c
->nr_inputs
== 0)
234 /* Allocate outputs. The non-position outputs go straight into message regs.
237 c
->first_output
= reg
;
238 c
->first_overflow_output
= 0;
242 else if (intel
->gen
== 5)
247 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
248 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
249 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
251 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
252 if (i
== VERT_RESULT_HPOS
) {
253 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
256 else if (i
== VERT_RESULT_PSIZ
) {
257 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
259 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
262 /* Two restrictions on our compute-to-MRF here. The
263 * message length for all SEND messages is restricted to
264 * [1,15], so we can't use mrf 15, as that means a length
267 * Additionally, URB writes are aligned to URB rows, so we
268 * need to put an even number of registers of URB data in
269 * each URB write so that the later write is aligned. A
270 * message length of 15 means 1 message header reg plus 14
273 * For attributes beyond the compute-to-MRF, we compute to
274 * GRFs and they will be written in the second URB_WRITE.
276 if (first_reladdr_output
> i
&& mrf
< 15) {
277 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
281 if (mrf
>= 15 && !c
->first_overflow_output
)
282 c
->first_overflow_output
= i
;
283 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
291 /* Allocate program temporaries:
293 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
294 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
298 /* Address reg(s). Don't try to use the internal address reg until
301 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
302 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
306 BRW_VERTICAL_STRIDE_8
,
308 BRW_HORIZONTAL_STRIDE_1
,
314 if (c
->vp
->use_const_buffer
) {
315 for (i
= 0; i
< 3; i
++) {
316 c
->current_const
[i
].index
= -1;
317 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
322 for (i
= 0; i
< 128; i
++) {
323 if (c
->output_regs
[i
].used_in_src
) {
324 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
329 if (c
->needs_stack
) {
330 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
334 /* Some opcodes need an internal temporary:
337 c
->last_tmp
= reg
; /* for allocation purposes */
339 /* Each input reg holds data from two vertices. The
340 * urb_read_length is the number of registers read from *each*
341 * vertex urb, so is half the amount:
343 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
344 /* Setting this field to 0 leads to undefined behavior according to the
345 * the VS_STATE docs. Our VUEs will always have at least one attribute
346 * sitting in them, even if it's padding.
348 if (c
->prog_data
.urb_read_length
== 0)
349 c
->prog_data
.urb_read_length
= 1;
351 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
352 * them to fit the biggest thing they need to.
354 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
356 /* See emit_vertex_write() for where the VUE's overhead on top of the
357 * attributes comes from.
360 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 7) / 8;
361 else if (intel
->gen
== 5)
362 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
364 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
366 c
->prog_data
.total_grf
= reg
;
368 if (INTEL_DEBUG
& DEBUG_VS
) {
369 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
370 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
371 printf("%s reg = %d\n", __FUNCTION__
, reg
);
377 * If an instruction uses a temp reg both as a src and the dest, we
378 * sometimes need to allocate an intermediate temporary.
380 static void unalias1( struct brw_vs_compile
*c
,
383 void (*func
)( struct brw_vs_compile
*,
387 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
388 struct brw_compile
*p
= &c
->func
;
389 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
391 brw_MOV(p
, dst
, tmp
);
401 * Checkes if 2-operand instruction needs an intermediate temporary.
403 static void unalias2( struct brw_vs_compile
*c
,
407 void (*func
)( struct brw_vs_compile
*,
412 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
413 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
414 struct brw_compile
*p
= &c
->func
;
415 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
416 func(c
, tmp
, arg0
, arg1
);
417 brw_MOV(p
, dst
, tmp
);
421 func(c
, dst
, arg0
, arg1
);
427 * Checkes if 3-operand instruction needs an intermediate temporary.
429 static void unalias3( struct brw_vs_compile
*c
,
434 void (*func
)( struct brw_vs_compile
*,
440 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
441 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
442 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
443 struct brw_compile
*p
= &c
->func
;
444 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
445 func(c
, tmp
, arg0
, arg1
, arg2
);
446 brw_MOV(p
, dst
, tmp
);
450 func(c
, dst
, arg0
, arg1
, arg2
);
454 static void emit_sop( struct brw_vs_compile
*c
,
460 struct brw_compile
*p
= &c
->func
;
462 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
463 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
464 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
465 brw_set_predicate_control_flag_value(p
, 0xff);
468 static void emit_seq( struct brw_vs_compile
*c
,
471 struct brw_reg arg1
)
473 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
476 static void emit_sne( struct brw_vs_compile
*c
,
479 struct brw_reg arg1
)
481 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
483 static void emit_slt( struct brw_vs_compile
*c
,
486 struct brw_reg arg1
)
488 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
491 static void emit_sle( struct brw_vs_compile
*c
,
494 struct brw_reg arg1
)
496 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
499 static void emit_sgt( struct brw_vs_compile
*c
,
502 struct brw_reg arg1
)
504 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
507 static void emit_sge( struct brw_vs_compile
*c
,
510 struct brw_reg arg1
)
512 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
515 static void emit_cmp( struct brw_compile
*p
,
519 struct brw_reg arg2
)
521 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
522 brw_SEL(p
, dst
, arg1
, arg2
);
523 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
526 static void emit_sign(struct brw_vs_compile
*c
,
530 struct brw_compile
*p
= &c
->func
;
532 brw_MOV(p
, dst
, brw_imm_f(0));
534 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
535 brw_MOV(p
, dst
, brw_imm_f(-1.0));
536 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
538 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
539 brw_MOV(p
, dst
, brw_imm_f(1.0));
540 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
543 static void emit_max( struct brw_compile
*p
,
546 struct brw_reg arg1
)
548 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
549 brw_SEL(p
, dst
, arg0
, arg1
);
550 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
553 static void emit_min( struct brw_compile
*p
,
556 struct brw_reg arg1
)
558 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
559 brw_SEL(p
, dst
, arg0
, arg1
);
560 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
564 static void emit_math1( struct brw_vs_compile
*c
,
570 /* There are various odd behaviours with SEND on the simulator. In
571 * addition there are documented issues with the fact that the GEN4
572 * processor doesn't do dependency control properly on SEND
573 * results. So, on balance, this kludge to get around failures
574 * with writemasked math results looks like it might be necessary
575 * whether that turns out to be a simulator bug or not:
577 struct brw_compile
*p
= &c
->func
;
578 struct intel_context
*intel
= &p
->brw
->intel
;
579 struct brw_reg tmp
= dst
;
580 GLboolean need_tmp
= (intel
->gen
< 6 &&
581 (dst
.dw1
.bits
.writemask
!= 0xf ||
582 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
590 BRW_MATH_SATURATE_NONE
,
593 BRW_MATH_DATA_SCALAR
,
597 brw_MOV(p
, dst
, tmp
);
603 static void emit_math2( struct brw_vs_compile
*c
,
610 struct brw_compile
*p
= &c
->func
;
611 struct intel_context
*intel
= &p
->brw
->intel
;
612 struct brw_reg tmp
= dst
;
613 GLboolean need_tmp
= (intel
->gen
< 6 &&
614 (dst
.dw1
.bits
.writemask
!= 0xf ||
615 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
620 brw_MOV(p
, brw_message_reg(3), arg1
);
625 BRW_MATH_SATURATE_NONE
,
628 BRW_MATH_DATA_SCALAR
,
632 brw_MOV(p
, dst
, tmp
);
638 static void emit_exp_noalias( struct brw_vs_compile
*c
,
640 struct brw_reg arg0
)
642 struct brw_compile
*p
= &c
->func
;
645 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
646 struct brw_reg tmp
= get_tmp(c
);
647 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
649 /* tmp_d = floor(arg0.x) */
650 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
652 /* result[0] = 2.0 ^ tmp */
654 /* Adjust exponent for floating point:
657 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
659 /* Install exponent and sign.
660 * Excess drops off the edge:
662 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
663 tmp_d
, brw_imm_d(23));
668 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
669 /* result[1] = arg0.x - floor(arg0.x) */
670 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
673 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
674 /* As with the LOG instruction, we might be better off just
675 * doing a taylor expansion here, seeing as we have to do all
678 * If mathbox partial precision is too low, consider also:
679 * result[3] = result[0] * EXP(result[1])
682 BRW_MATH_FUNCTION_EXP
,
683 brw_writemask(dst
, WRITEMASK_Z
),
684 brw_swizzle1(arg0
, 0),
685 BRW_MATH_PRECISION_FULL
);
688 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
689 /* result[3] = 1.0; */
690 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
695 static void emit_log_noalias( struct brw_vs_compile
*c
,
697 struct brw_reg arg0
)
699 struct brw_compile
*p
= &c
->func
;
700 struct brw_reg tmp
= dst
;
701 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
702 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
703 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
704 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
708 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
711 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
714 * These almost look likey they could be joined up, but not really
717 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
718 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
720 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
722 brw_writemask(tmp_ud
, WRITEMASK_X
),
723 brw_swizzle1(arg0_ud
, 0),
724 brw_imm_ud((1U<<31)-1));
727 brw_writemask(tmp_ud
, WRITEMASK_X
),
732 brw_writemask(tmp
, WRITEMASK_X
),
733 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
737 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
739 brw_writemask(tmp_ud
, WRITEMASK_Y
),
740 brw_swizzle1(arg0_ud
, 0),
741 brw_imm_ud((1<<23)-1));
744 brw_writemask(tmp_ud
, WRITEMASK_Y
),
746 brw_imm_ud(127<<23));
749 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
750 /* result[2] = result[0] + LOG2(result[1]); */
752 /* Why bother? The above is just a hint how to do this with a
753 * taylor series. Maybe we *should* use a taylor series as by
754 * the time all the above has been done it's almost certainly
755 * quicker than calling the mathbox, even with low precision.
758 * - result[0] + mathbox.LOG2(result[1])
759 * - mathbox.LOG2(arg0.x)
760 * - result[0] + inline_taylor_approx(result[1])
763 BRW_MATH_FUNCTION_LOG
,
764 brw_writemask(tmp
, WRITEMASK_Z
),
765 brw_swizzle1(tmp
, 1),
766 BRW_MATH_PRECISION_FULL
);
769 brw_writemask(tmp
, WRITEMASK_Z
),
770 brw_swizzle1(tmp
, 2),
771 brw_swizzle1(tmp
, 0));
774 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
775 /* result[3] = 1.0; */
776 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
780 brw_MOV(p
, dst
, tmp
);
786 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
788 static void emit_dst_noalias( struct brw_vs_compile
*c
,
793 struct brw_compile
*p
= &c
->func
;
795 /* There must be a better way to do this:
797 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
798 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
799 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
800 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
801 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
802 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
803 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
804 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
808 static void emit_xpd( struct brw_compile
*p
,
813 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
814 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
818 static void emit_lit_noalias( struct brw_vs_compile
*c
,
820 struct brw_reg arg0
)
822 struct brw_compile
*p
= &c
->func
;
823 struct brw_instruction
*if_insn
;
824 struct brw_reg tmp
= dst
;
825 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
830 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
831 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
833 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
834 * to get all channels active inside the IF. In the clipping code
835 * we run with NoMask, so it's not an option and we can use
836 * BRW_EXECUTE_1 for all comparisions.
838 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
839 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
841 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
843 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
844 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
845 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
848 BRW_MATH_FUNCTION_POW
,
849 brw_writemask(dst
, WRITEMASK_Z
),
850 brw_swizzle1(tmp
, 2),
851 brw_swizzle1(arg0
, 3),
852 BRW_MATH_PRECISION_PARTIAL
);
855 brw_ENDIF(p
, if_insn
);
860 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
866 struct brw_compile
*p
= &c
->func
;
868 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
869 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
870 brw_MAC(p
, dst
, arg0
, arg1
);
873 /** 3 or 4-component vector normalization */
874 static void emit_nrm( struct brw_vs_compile
*c
,
879 struct brw_compile
*p
= &c
->func
;
880 struct brw_reg tmp
= get_tmp(c
);
882 /* tmp = dot(arg0, arg0) */
884 brw_DP3(p
, tmp
, arg0
, arg0
);
886 brw_DP4(p
, tmp
, arg0
, arg0
);
888 /* tmp = 1 / sqrt(tmp) */
889 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
891 /* dst = arg0 * tmp */
892 brw_MUL(p
, dst
, arg0
, tmp
);
898 static struct brw_reg
899 get_constant(struct brw_vs_compile
*c
,
900 const struct prog_instruction
*inst
,
903 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
904 struct brw_compile
*p
= &c
->func
;
905 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
907 assert(argIndex
< 3);
909 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
910 /* Keep track of the last constant loaded in this slot, for reuse. */
911 c
->current_const
[argIndex
].index
= src
->Index
;
914 printf(" fetch const[%d] for arg %d into reg %d\n",
915 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
917 /* need to fetch the constant now */
919 const_reg
, /* writeback dest */
920 16 * src
->Index
, /* byte offset */
921 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
925 /* replicate lower four floats into upper half (to get XYZWXYZW) */
926 const_reg
= stride(const_reg
, 0, 4, 0);
932 static struct brw_reg
933 get_reladdr_constant(struct brw_vs_compile
*c
,
934 const struct prog_instruction
*inst
,
937 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
938 struct brw_compile
*p
= &c
->func
;
939 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
940 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
941 struct brw_reg byte_addr_reg
= get_tmp(c
);
943 assert(argIndex
< 3);
945 /* Can't reuse a reladdr constant load. */
946 c
->current_const
[argIndex
].index
= -1;
949 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
950 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
953 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
955 /* fetch the first vec4 */
956 brw_dp_READ_4_vs_relative(p
,
957 const_reg
, /* writeback dest */
958 byte_addr_reg
, /* address register */
959 16 * src
->Index
, /* byte offset */
960 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
968 /* TODO: relative addressing!
970 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
971 gl_register_file file
,
975 case PROGRAM_TEMPORARY
:
978 assert(c
->regs
[file
][index
].nr
!= 0);
979 return c
->regs
[file
][index
];
980 case PROGRAM_STATE_VAR
:
981 case PROGRAM_CONSTANT
:
982 case PROGRAM_UNIFORM
:
983 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
984 return c
->regs
[PROGRAM_STATE_VAR
][index
];
985 case PROGRAM_ADDRESS
:
987 return c
->regs
[file
][index
];
989 case PROGRAM_UNDEFINED
: /* undef values */
990 return brw_null_reg();
992 case PROGRAM_LOCAL_PARAM
:
993 case PROGRAM_ENV_PARAM
:
994 case PROGRAM_WRITE_ONLY
:
997 return brw_null_reg();
1003 * Indirect addressing: get reg[[arg] + offset].
1005 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1010 struct brw_compile
*p
= &c
->func
;
1011 struct brw_reg tmp
= get_tmp(c
);
1012 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1013 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1014 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1015 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1016 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1018 /* Set the vertical stride on the register access so that the first
1019 * 4 components come from a0.0 and the second 4 from a0.1.
1021 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1024 brw_push_insn_state(p
);
1025 brw_set_access_mode(p
, BRW_ALIGN_1
);
1027 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1028 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1030 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1031 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1033 brw_MOV(p
, tmp
, indirect
);
1035 brw_pop_insn_state(p
);
1038 /* NOTE: tmp not released */
1043 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1044 const struct prog_instruction
*inst
,
1047 struct brw_compile
*p
= &c
->func
;
1049 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1050 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1051 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1052 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1053 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1054 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1056 brw_push_insn_state(p
);
1057 brw_set_access_mode(p
, BRW_ALIGN_1
);
1059 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1060 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1061 brw_MOV(p
, indirect
, val
);
1063 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1064 brw_ADD(p
, brw_address_reg(0), acc
,
1065 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1066 brw_MOV(p
, indirect
, suboffset(val
, 4));
1068 brw_pop_insn_state(p
);
1072 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1073 * TODO: relative addressing!
1075 static struct brw_reg
1076 get_src_reg( struct brw_vs_compile
*c
,
1077 const struct prog_instruction
*inst
,
1080 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1081 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1082 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1084 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1085 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1087 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1091 return brw_imm_f(0.0f
);
1092 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1097 return brw_imm_f(-1.0F
);
1099 return brw_imm_f(1.0F
);
1100 } else if (src
->File
== PROGRAM_CONSTANT
) {
1101 const struct gl_program_parameter_list
*params
;
1105 switch (src
->Swizzle
) {
1120 if (component
>= 0) {
1121 params
= c
->vp
->program
.Base
.Parameters
;
1122 f
= params
->ParameterValues
[src
->Index
][component
];
1128 return brw_imm_f(f
);
1134 case PROGRAM_TEMPORARY
:
1136 case PROGRAM_OUTPUT
:
1138 return deref(c
, c
->regs
[file
][0], index
, 32);
1141 assert(c
->regs
[file
][index
].nr
!= 0);
1142 return c
->regs
[file
][index
];
1145 case PROGRAM_STATE_VAR
:
1146 case PROGRAM_CONSTANT
:
1147 case PROGRAM_UNIFORM
:
1148 case PROGRAM_ENV_PARAM
:
1149 case PROGRAM_LOCAL_PARAM
:
1150 if (c
->vp
->use_const_buffer
) {
1151 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1152 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1153 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1155 return get_reladdr_constant(c
, inst
, argIndex
);
1157 return get_constant(c
, inst
, argIndex
);
1160 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1163 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1164 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1166 case PROGRAM_ADDRESS
:
1168 return c
->regs
[file
][index
];
1170 case PROGRAM_UNDEFINED
:
1171 /* this is a normal case since we loop over all three src args */
1172 return brw_null_reg();
1174 case PROGRAM_WRITE_ONLY
:
1177 return brw_null_reg();
1182 * Return the brw reg for the given instruction's src argument.
1183 * Will return mangled results for SWZ op. The emit_swz() function
1184 * ignores this result and recalculates taking extended swizzles into
1187 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1188 const struct prog_instruction
*inst
,
1191 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1194 if (src
->File
== PROGRAM_UNDEFINED
)
1195 return brw_null_reg();
1197 reg
= get_src_reg(c
, inst
, argIndex
);
1199 /* Convert 3-bit swizzle to 2-bit.
1201 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1202 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1203 GET_SWZ(src
->Swizzle
, 1),
1204 GET_SWZ(src
->Swizzle
, 2),
1205 GET_SWZ(src
->Swizzle
, 3));
1208 /* Note this is ok for non-swizzle instructions:
1210 reg
.negate
= src
->Negate
? 1 : 0;
1217 * Get brw register for the given program dest register.
1219 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1220 struct prog_dst_register dst
)
1225 case PROGRAM_TEMPORARY
:
1226 case PROGRAM_OUTPUT
:
1227 /* register-indirect addressing is only 1x1, not VxH, for
1228 * destination regs. So, for RelAddr we'll return a temporary
1229 * for the dest and do a move of the result to the RelAddr
1230 * register after the instruction emit.
1235 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1236 reg
= c
->regs
[dst
.File
][dst
.Index
];
1239 case PROGRAM_ADDRESS
:
1240 assert(dst
.Index
== 0);
1241 reg
= c
->regs
[dst
.File
][dst
.Index
];
1243 case PROGRAM_UNDEFINED
:
1244 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1245 reg
= brw_null_reg();
1249 reg
= brw_null_reg();
1252 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1253 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1259 static void emit_swz( struct brw_vs_compile
*c
,
1261 const struct prog_instruction
*inst
)
1263 const GLuint argIndex
= 0;
1264 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1265 struct brw_compile
*p
= &c
->func
;
1266 GLuint zeros_mask
= 0;
1267 GLuint ones_mask
= 0;
1268 GLuint src_mask
= 0;
1270 GLboolean need_tmp
= (src
.Negate
&&
1271 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1272 struct brw_reg tmp
= dst
;
1278 for (i
= 0; i
< 4; i
++) {
1279 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1280 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1299 /* Do src first, in case dst aliases src:
1302 struct brw_reg arg0
;
1304 arg0
= get_src_reg(c
, inst
, argIndex
);
1306 arg0
= brw_swizzle(arg0
,
1307 src_swz
[0], src_swz
[1],
1308 src_swz
[2], src_swz
[3]);
1310 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1314 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1317 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1320 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1323 brw_MOV(p
, dst
, tmp
);
1324 release_tmp(c
, tmp
);
/* emit_vertex_write: emit the post-shader epilogue that assembles the VUE
 * (vertex URB entry) header and sends all vertex results to the URB via
 * brw_urb_WRITE, with a second overflow write when outputs exceed the MRF.
 *
 * NOTE(review): lossy extraction — declarations of `ndc`, `i`, `eot`,
 * `mrf`, the brw_urb_WRITE call heads, and various braces are missing.
 * Verify details against the complete file.
 */
1330 * Post-vertex-program processing. Send the results to the URB.
1332 static void emit_vertex_write( struct brw_vs_compile
*c
)
1334 struct brw_compile
*p
= &c
->func
;
1335 struct brw_context
*brw
= p
->brw
;
1336 struct intel_context
*intel
= &brw
->intel
;
/* Clip-space position output; source for NDC and clip-plane tests. */
1337 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1340 GLuint len_vertex_header
= 2;
/* Forward the edge flag from input to output when the key asks for it
 * (the copying MOV's opening line is missing from this extraction).
 */
1343 if (c
->key
.copy_edgeflag
) {
1345 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1346 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
/* Pre-Sandybridge parts want NDC coordinates in the vertex header. */
1349 if (intel
->gen
< 6) {
1350 /* Build ndc coords */
1352 /* ndc = 1.0 / pos.w */
1353 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1354 /* ndc.xyz = pos * ndc */
1355 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1358 /* Update the header for point size, user clipping flags, and -ve rhw
1361 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1362 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
/* header1 (m1) accumulates point size and clip flags as UD bits. */
1364 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1367 brw_MOV(p
, header1
, brw_imm_ud(0));
1369 brw_set_access_mode(p
, BRW_ALIGN_16
);
/* Pack point size into header1.w — scaled by 1<<11 then masked to the
 * 11-bit field at bit 8 (hardware point-width format).
 */
1371 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1372 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1373 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1374 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
/* One clip flag bit per user clip plane: set bit i when the DP4 of
 * pos against the plane is negative (predicated OR).
 */
1377 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1378 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1379 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1380 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1381 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1384 /* i965 clipping workaround:
1385 * 1) Test for -ve rhw
1387 * set ndc = (0,0,0,0)
1390 * Later, clipping will detect ucp[6] and ensure the primitive is
1391 * clipped against all fixed planes.
1393 if (brw
->has_negative_rhw_bug
) {
1395 vec8(brw_null_reg()),
1397 brw_swizzle1(ndc
, 3),
1400 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1401 brw_MOV(p
, ndc
, brw_imm_f(0));
1402 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1405 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1406 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1407 brw_set_access_mode(p
, BRW_ALIGN_16
);
1409 release_tmp(c
, header1
);
/* No point size/clip/rhw work needed: header dword m1 is just zero. */
1412 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1415 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1416 * of zeros followed by two sets of NDC coordinates:
1418 brw_set_access_mode(p
, BRW_ALIGN_1
);
1419 brw_set_acc_write_control(p
, 0);
1421 /* The VUE layout is documented in Volume 2a. */
1422 if (intel
->gen
>= 6) {
1423 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1424 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1425 * dword 4-7 (m2) is the 4D space position
1426 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1427 * enabled. We don't use it, so skip it.
1428 * m3 is the first vertex element data we fill, which is the vertex
1431 brw_MOV(p
, brw_message_reg(2), pos
);
1432 brw_MOV(p
, brw_message_reg(3), pos
);
1433 len_vertex_header
= 2;
1434 } else if (intel
->gen
== 5) {
1435 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1436 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1437 * dword 4-7 (m2) is the ndc position (set above)
1438 * dword 8-11 (m3) of the vertex header is the 4D space position
1439 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1440 * m6 is a pad so that the vertex element data is aligned
1441 * m7 is the first vertex data we fill, which is the vertex position.
1443 brw_MOV(p
, brw_message_reg(2), ndc
);
1444 brw_MOV(p
, brw_message_reg(3), pos
);
1445 brw_MOV(p
, brw_message_reg(7), pos
);
1446 len_vertex_header
= 6;
1448 /* There are 8 dwords in VUE header pre-Ironlake:
1449 * dword 0-3 (m1) is indices, point width, clip flags.
1450 * dword 4-7 (m2) is ndc position (set above)
1452 * dword 8-11 (m3) is the first vertex data, which we always have be the
1455 brw_MOV(p
, brw_message_reg(2), ndc
);
1456 brw_MOV(p
, brw_message_reg(3), pos
);
1457 len_vertex_header
= 2;
1460 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1461 next_mrf
= 2 + len_vertex_header
;
1462 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
/* Outputs past first_overflow_output are handled by the second URB
 * write below; skip them here (the `break`/`continue` statements are
 * missing from this extraction).
 */
1463 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1465 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
/* GRF-allocated texcoord-range outputs must be copied into MRFs;
 * outputs already living in MRFs just advance next_mrf past them.
 */
1468 if (i
>= VERT_RESULT_TEX0
&&
1469 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1470 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1472 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1473 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
/* End-of-thread only when everything fit in one URB write. */
1477 eot
= (c
->first_overflow_output
== 0);
/* First brw_urb_WRITE (call head missing from this extraction). */
1480 brw_null_reg(), /* dest */
1481 0, /* starting mrf reg nr */
1485 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1486 0, /* response len */
1488 eot
, /* writes complete */
1489 0, /* urb destination offset */
1490 BRW_URB_SWIZZLE_INTERLEAVE
);
1492 if (c
->first_overflow_output
> 0) {
1493 /* Not all of the vertex outputs/results fit into the MRF.
1494 * Move the overflowed attributes from the GRF to the MRF and
1495 * issue another brw_urb_WRITE().
1498 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1499 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1500 /* move from GRF to MRF */
1501 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
/* Second brw_urb_WRITE for the overflow outputs (call head and some
 * arguments missing from this extraction); always terminates the
 * thread (`writes complete` = 1).
 */
1507 brw_null_reg(), /* dest */
1508 0, /* starting mrf reg nr */
1513 0, /* response len */
1515 1, /* writes complete */
1516 14 / 2, /* urb destination offset */
1517 BRW_URB_SWIZZLE_INTERLEAVE
);
/* accumulator_contains: peephole test used before OPCODE_MAD — decide
 * whether the most recently emitted instruction already left `val` in the
 * hardware accumulator, so the MOV-to-accumulator before a MAC can be
 * skipped.
 *
 * NOTE(review): lossy extraction — the return statements (presumably
 * GL_FALSE on the early-outs, GL_TRUE when the big comparison matches)
 * and the switch's default case are missing.  Return type is also not
 * visible here; verify against the complete file.
 */
1522 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1524 struct brw_compile
*p
= &c
->func
;
/* Look at the last instruction already stored in the program. */
1525 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
/* No previous instruction to inspect (prev_insn above would be
 * out of bounds) — bail out.
 */
1527 if (p
->nr_insn
== 0)
/* Indirect-addressed values can't be matched against the encoded
 * destination fields — bail out.
 */
1530 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
/* Only these opcodes are considered to have (implicitly) written the
 * accumulator here; match every destination-encoding field of the
 * previous instruction against `val`.
 */
1533 switch (prev_insn
->header
.opcode
) {
1534 case BRW_OPCODE_MOV
:
1535 case BRW_OPCODE_MAC
:
1536 case BRW_OPCODE_MUL
:
1537 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1538 prev_insn
->header
.execution_size
== val
.width
&&
1539 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1540 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1541 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1542 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
/* Align16 encodes subreg in units of 16 bytes, hence the /16. */
1543 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
/* Must have written all four channels (writemask 0xf). */
1544 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
/* get_predicate: map a Mesa IR instruction's condition-code destination
 * info (DstReg.CondMask / CondSwizzle) to a gen4 predicate-control value
 * for the instruction about to be emitted.
 *
 * NOTE(review): lossy extraction — the function braces, return type, and
 * the `case SWIZZLE_*:` labels in front of each REPLICATE return are
 * missing; verify against the complete file.
 */
1554 get_predicate(const struct prog_instruction
*inst
)
/* COND_TR means "always" — no predication needed. */
1556 if (inst
->DstReg
.CondMask
== COND_TR
)
1557 return BRW_PREDICATE_NONE
;
1559 /* All of GLSL only produces predicates for COND_NE and one channel per
1560 * vector. Fail badly if someone starts doing something else, as it might
1561 * mean infinite looping or something.
1563 * We'd like to support all the condition codes, but our hardware doesn't
1564 * quite match the Mesa IR, which is modeled after the NV extensions. For
1565 * those, the instruction may update the condition codes or not, then any
1566 * later instruction may use one of those condition codes. For gen4, the
1567 * instruction may update the flags register based on one of the condition
1568 * codes output by the instruction, and then further instructions may
1569 * predicate on that. We can probably support this, but it won't
1570 * necessarily be easy.
1572 assert(inst
->DstReg
.CondMask
== COND_NE
);
/* Pick the replicate-one-channel predicate matching the condition
 * swizzle (case labels missing from this extraction, presumably
 * SWIZZLE_XXXX/YYYY/ZZZZ/WWWW).
 */
1574 switch (inst
->DstReg
.CondSwizzle
) {
1576 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1578 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1580 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1582 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
/* Unrecognized swizzle: report and fall back to normal predication. */
1584 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1585 inst
->DstReg
.CondMask
);
1586 return BRW_PREDICATE_NORMAL
;
1590 /* Emit the vertex program instructions here.
1592 void brw_vs_emit(struct brw_vs_compile
*c
)
1594 #define MAX_IF_DEPTH 32
1595 #define MAX_LOOP_DEPTH 32
1596 struct brw_compile
*p
= &c
->func
;
1597 struct brw_context
*brw
= p
->brw
;
1598 struct intel_context
*intel
= &brw
->intel
;
1599 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1600 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1601 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1602 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1603 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1607 if (INTEL_DEBUG
& DEBUG_VS
) {
1608 printf("vs-mesa:\n");
1609 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1614 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1615 brw_set_access_mode(p
, BRW_ALIGN_16
);
1616 if_depth_in_loop
[loop_depth
] = 0;
1618 brw_set_acc_write_control(p
, 1);
1620 for (insn
= 0; insn
< nr_insns
; insn
++) {
1622 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1624 /* Message registers can't be read, so copy the output into GRF
1625 * register if they are used in source registers
1627 for (i
= 0; i
< 3; i
++) {
1628 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1629 GLuint index
= src
->Index
;
1630 GLuint file
= src
->File
;
1631 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1632 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1635 switch (inst
->Opcode
) {
1638 c
->needs_stack
= GL_TRUE
;
1645 /* Static register allocation
1647 brw_vs_alloc_regs(c
);
1650 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1652 for (insn
= 0; insn
< nr_insns
; insn
++) {
1654 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1655 struct brw_reg args
[3], dst
;
1659 printf("%d: ", insn
);
1660 _mesa_print_instruction(inst
);
1663 /* Get argument regs. SWZ is special and does this itself.
1665 if (inst
->Opcode
!= OPCODE_SWZ
)
1666 for (i
= 0; i
< 3; i
++) {
1667 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1670 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1671 args
[i
] = c
->output_regs
[index
].reg
;
1673 args
[i
] = get_arg(c
, inst
, i
);
1676 /* Get dest regs. Note that it is possible for a reg to be both
1677 * dst and arg, given the static allocation of registers. So
1678 * care needs to be taken emitting multi-operation instructions.
1680 index
= inst
->DstReg
.Index
;
1681 file
= inst
->DstReg
.File
;
1682 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1683 dst
= c
->output_regs
[index
].reg
;
1685 dst
= get_dst(c
, inst
->DstReg
);
1687 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1688 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1689 inst
->SaturateMode
);
1692 switch (inst
->Opcode
) {
1694 brw_MOV(p
, dst
, brw_abs(args
[0]));
1697 brw_ADD(p
, dst
, args
[0], args
[1]);
1700 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1703 brw_DP2(p
, dst
, args
[0], args
[1]);
1706 brw_DP3(p
, dst
, args
[0], args
[1]);
1709 brw_DP4(p
, dst
, args
[0], args
[1]);
1712 brw_DPH(p
, dst
, args
[0], args
[1]);
1715 emit_nrm(c
, dst
, args
[0], 3);
1718 emit_nrm(c
, dst
, args
[0], 4);
1721 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1724 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1727 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1730 brw_RNDD(p
, dst
, args
[0]);
1733 brw_RNDD(p
, dst
, args
[0]);
1736 brw_FRC(p
, dst
, args
[0]);
1739 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1742 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1745 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1748 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1751 if (!accumulator_contains(c
, args
[2]))
1752 brw_MOV(p
, brw_acc_reg(), args
[2]);
1753 brw_MAC(p
, dst
, args
[0], args
[1]);
1756 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1759 emit_max(p
, dst
, args
[0], args
[1]);
1762 emit_min(p
, dst
, args
[0], args
[1]);
1765 brw_MOV(p
, dst
, args
[0]);
1768 brw_MUL(p
, dst
, args
[0], args
[1]);
1771 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1774 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1777 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1781 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1784 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1787 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1790 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1793 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1796 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1799 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1802 unalias1(c
, dst
, args
[0], emit_sign
);
1805 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1808 /* The args[0] value can't be used here as it won't have
1809 * correctly encoded the full swizzle:
1811 emit_swz(c
, dst
, inst
);
1814 /* round toward zero */
1815 brw_RNDZ(p
, dst
, args
[0]);
1818 emit_xpd(p
, dst
, args
[0], args
[1]);
1821 assert(if_depth
< MAX_IF_DEPTH
);
1822 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1823 /* Note that brw_IF smashes the predicate_control field. */
1824 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1825 if_depth_in_loop
[loop_depth
]++;
1829 assert(if_depth
> 0);
1830 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1833 assert(if_depth
> 0);
1834 brw_ENDIF(p
, if_inst
[--if_depth
]);
1835 if_depth_in_loop
[loop_depth
]--;
1837 case OPCODE_BGNLOOP
:
1838 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1839 if_depth_in_loop
[loop_depth
] = 0;
1842 brw_set_predicate_control(p
, get_predicate(inst
));
1843 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
1844 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1847 brw_set_predicate_control(p
, get_predicate(inst
));
1848 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
1849 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1851 case OPCODE_ENDLOOP
:
1853 struct brw_instruction
*inst0
, *inst1
;
1858 if (intel
->gen
== 5)
1861 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1862 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1863 while (inst0
> loop_inst
[loop_depth
]) {
1865 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1866 inst0
->bits3
.if_else
.jump_count
== 0) {
1867 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1869 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1870 inst0
->bits3
.if_else
.jump_count
== 0) {
1871 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1877 brw_set_predicate_control(p
, get_predicate(inst
));
1878 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1879 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1882 brw_set_access_mode(p
, BRW_ALIGN_1
);
1883 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1884 brw_set_access_mode(p
, BRW_ALIGN_16
);
1885 brw_ADD(p
, get_addr_reg(stack_index
),
1886 get_addr_reg(stack_index
), brw_imm_d(4));
1887 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1888 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1891 brw_ADD(p
, get_addr_reg(stack_index
),
1892 get_addr_reg(stack_index
), brw_imm_d(-4));
1893 brw_set_access_mode(p
, BRW_ALIGN_1
);
1894 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1895 brw_set_access_mode(p
, BRW_ALIGN_16
);
1898 emit_vertex_write(c
);
1904 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1910 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1911 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1912 _mesa_opcode_string(inst
->Opcode
) :
1916 /* Set the predication update on the last instruction of the native
1917 * instruction sequence.
1919 * This would be problematic if it was set on a math instruction,
1920 * but that shouldn't be the case with the current GLSL compiler.
1922 if (inst
->CondUpdate
) {
1923 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1925 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1926 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1929 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1930 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1931 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1932 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1935 /* Result color clamping.
1937 * When destination register is an output register and
1938 * it's primary/secondary front/back color, we have to clamp
1939 * the result to [0,1]. This is done by enabling the
1940 * saturation bit for the last instruction.
1942 * We don't use brw_set_saturate() as it modifies
1943 * p->current->header.saturate, which affects all the subsequent
1944 * instructions. Instead, we directly modify the header
1945 * of the last (already stored) instruction.
1947 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1948 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1949 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1950 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1951 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1952 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1956 if (inst
->DstReg
.RelAddr
) {
1957 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
1958 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
1959 move_to_reladdr_dst(c
, inst
, dst
);
1965 brw_resolve_cals(p
);
1969 if (INTEL_DEBUG
& DEBUG_VS
) {
1972 printf("vs-native:\n");
1973 for (i
= 0; i
< p
->nr_insn
; i
++)
1974 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);