i965: Use the new embedded compare in SEL on gen6 for VS MIN and MAX opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order: r0 header, curbe data (user clip planes then
 * constants), vertex inputs, output regs that don't go straight to
 * MRFs, program temporaries, address reg(s), pull-constant staging
 * regs, output shadow regs, stack, then the scratch-temp area starting
 * at first_tmp.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
	 /* Gen6: clip planes are packed two vec4s per GRF at the start
	  * of the curbe.
	  */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
	 /* Pre-gen6: planes start 6 vec4 slots in -- NOTE(review): the
	  * (6 + i) offset presumably skips fixed curbe contents; confirm
	  * against the curbe upload code.
	  */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only non-reladdr reads from constant-like files get a
	     * push slot; everything else stays on the pull path.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two vec4 constants per GRF, in the low/high halves. */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs; the earlier MRFs
    * are reserved per generation (URB write header etc.).
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
	 mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position lands in a GRF rather than an MRF (it is read
	     * again later, e.g. for the header/clip handling).
	     */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (first_reladdr_output > i && mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       /* Overflow (or reladdr-reachable) outputs go to GRFs;
		* remember where the overflow started.
		*/
	       if (mrf >= 15 && !c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	       mrf++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
					    reg,
					    0,
					    BRW_REGISTER_TYPE_D,
					    BRW_VERTICAL_STRIDE_8,
					    BRW_WIDTH_8,
					    BRW_HORIZONTAL_STRIDE_1,
					    BRW_SWIZZLE_XXXX,
					    WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One pull-constant staging reg per source operand slot. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   /* Shadow regs for outputs that are also read as sources
    * (used_in_src), since MRFs can't be read back.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      /* Two GRFs of 16-bit entries for the subroutine return stack. */
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
403
404
405 /**
406 * If an instruction uses a temp reg both as a src and the dest, we
407 * sometimes need to allocate an intermediate temporary.
408 */
409 static void unalias1( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 void (*func)( struct brw_vs_compile *,
413 struct brw_reg,
414 struct brw_reg ))
415 {
416 if (dst.file == arg0.file && dst.nr == arg0.nr) {
417 struct brw_compile *p = &c->func;
418 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
419 func(c, tmp, arg0);
420 brw_MOV(p, dst, tmp);
421 release_tmp(c, tmp);
422 }
423 else {
424 func(c, dst, arg0);
425 }
426 }
427
428 /**
429 * \sa unalias2
430 * Checkes if 2-operand instruction needs an intermediate temporary.
431 */
432 static void unalias2( struct brw_vs_compile *c,
433 struct brw_reg dst,
434 struct brw_reg arg0,
435 struct brw_reg arg1,
436 void (*func)( struct brw_vs_compile *,
437 struct brw_reg,
438 struct brw_reg,
439 struct brw_reg ))
440 {
441 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
442 (dst.file == arg1.file && dst.nr == arg1.nr)) {
443 struct brw_compile *p = &c->func;
444 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
445 func(c, tmp, arg0, arg1);
446 brw_MOV(p, dst, tmp);
447 release_tmp(c, tmp);
448 }
449 else {
450 func(c, dst, arg0, arg1);
451 }
452 }
453
454 /**
455 * \sa unalias2
456 * Checkes if 3-operand instruction needs an intermediate temporary.
457 */
458 static void unalias3( struct brw_vs_compile *c,
459 struct brw_reg dst,
460 struct brw_reg arg0,
461 struct brw_reg arg1,
462 struct brw_reg arg2,
463 void (*func)( struct brw_vs_compile *,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg,
467 struct brw_reg ))
468 {
469 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
470 (dst.file == arg1.file && dst.nr == arg1.nr) ||
471 (dst.file == arg2.file && dst.nr == arg2.nr)) {
472 struct brw_compile *p = &c->func;
473 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
474 func(c, tmp, arg0, arg1, arg2);
475 brw_MOV(p, dst, tmp);
476 release_tmp(c, tmp);
477 }
478 else {
479 func(c, dst, arg0, arg1, arg2);
480 }
481 }
482
/* Per-channel "set on condition": dst = cond(arg0, arg1) ? 1.0 : 0.0.
 * Shared backend for the SEQ/SNE/SLT/SLE/SGT/SGE wrappers below.
 * Instruction order is load-bearing: the second MOV is predicated on
 * the CMP result.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   /* Default every channel to 0.0. */
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   /* CMP to the null reg just updates the flag register.
    * NOTE(review): relies on brw_CMP arming predication for the
    * following instruction -- confirm in brw_eu_emit.c.
    */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   /* Predicated: writes 1.0 only in channels where the compare passed. */
   brw_MOV(p, dst, brw_imm_f(1.0f));
   /* Restore unconditional execution for subsequent instructions. */
   brw_set_predicate_control_flag_value(p, 0xff);
}
496
/* SEQ: per-channel dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* SNE: per-channel dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: per-channel dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* SLE: per-channel dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* SGT: per-channel dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* SGE: per-channel dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
543
/* CMP: per-channel dst = (arg0 < 0) ? arg1 : arg2. */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   /* Set the flag reg per channel where arg0 < 0 (null dest: flag only). */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   /* Predicated SEL: arg1 where the flag is set, otherwise arg2. */
   brw_SEL(p, dst, arg1, arg2);
   /* Back to unpredicated execution for later instructions. */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
554
/* Per-channel sign: dst = -1.0, 0.0 or 1.0 depending on arg0's sign. */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   /* Default all channels to 0.0 (the arg0 == 0 case). */
   brw_MOV(p, dst, brw_imm_f(0));

   /* Channels where arg0 < 0 get -1.0 via a predicated MOV. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* Channels where arg0 > 0 get 1.0. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
571
/* MAX: per-channel dst = max(arg0, arg1). */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+ SEL supports an embedded compare via the conditional
       * modifier, so a single SEL.ge does the whole job.
       */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      /* Reset instruction state so later emits aren't affected. */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Pre-gen6: flag-setting CMP followed by a predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
590
/* MIN: per-channel dst = min(arg0, arg1). */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+ SEL with an embedded less-than compare (SEL.l) is min
       * in one instruction.
       */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      /* Reset instruction state so later emits aren't affected. */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Pre-gen6: flag-setting CMP followed by a predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
609
610 static void emit_math1_gen4(struct brw_vs_compile *c,
611 GLuint function,
612 struct brw_reg dst,
613 struct brw_reg arg0,
614 GLuint precision)
615 {
616 /* There are various odd behaviours with SEND on the simulator. In
617 * addition there are documented issues with the fact that the GEN4
618 * processor doesn't do dependency control properly on SEND
619 * results. So, on balance, this kludge to get around failures
620 * with writemasked math results looks like it might be necessary
621 * whether that turns out to be a simulator bug or not:
622 */
623 struct brw_compile *p = &c->func;
624 struct brw_reg tmp = dst;
625 GLboolean need_tmp = GL_FALSE;
626
627 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
628 dst.dw1.bits.writemask != 0xf)
629 need_tmp = GL_TRUE;
630
631 if (need_tmp)
632 tmp = get_tmp(c);
633
634 brw_math(p,
635 tmp,
636 function,
637 BRW_MATH_SATURATE_NONE,
638 2,
639 arg0,
640 BRW_MATH_DATA_SCALAR,
641 precision);
642
643 if (need_tmp) {
644 brw_MOV(p, dst, tmp);
645 release_tmp(c, tmp);
646 }
647 }
648
649 static void
650 emit_math1_gen6(struct brw_vs_compile *c,
651 GLuint function,
652 struct brw_reg dst,
653 struct brw_reg arg0,
654 GLuint precision)
655 {
656 struct brw_compile *p = &c->func;
657 struct brw_reg tmp_src, tmp_dst;
658
659 /* Something is strange on gen6 math in 16-wide mode, though the
660 * docs say it's supposed to work. Punt to using align1 mode,
661 * which doesn't do writemasking and swizzles.
662 */
663 tmp_src = get_tmp(c);
664 tmp_dst = get_tmp(c);
665
666 brw_MOV(p, tmp_src, arg0);
667
668 brw_set_access_mode(p, BRW_ALIGN_1);
669 brw_math(p,
670 tmp_dst,
671 function,
672 BRW_MATH_SATURATE_NONE,
673 2,
674 tmp_src,
675 BRW_MATH_DATA_SCALAR,
676 precision);
677 brw_set_access_mode(p, BRW_ALIGN_16);
678
679 brw_MOV(p, dst, tmp_dst);
680
681 release_tmp(c, tmp_src);
682 release_tmp(c, tmp_dst);
683 }
684
685 static void
686 emit_math1(struct brw_vs_compile *c,
687 GLuint function,
688 struct brw_reg dst,
689 struct brw_reg arg0,
690 GLuint precision)
691 {
692 struct brw_compile *p = &c->func;
693 struct intel_context *intel = &p->brw->intel;
694
695 if (intel->gen >= 6)
696 emit_math1_gen6(c, function, dst, arg0, precision);
697 else
698 emit_math1_gen4(c, function, dst, arg0, precision);
699 }
700
701 static void emit_math2( struct brw_vs_compile *c,
702 GLuint function,
703 struct brw_reg dst,
704 struct brw_reg arg0,
705 struct brw_reg arg1,
706 GLuint precision)
707 {
708 struct brw_compile *p = &c->func;
709 struct intel_context *intel = &p->brw->intel;
710 struct brw_reg tmp = dst;
711 GLboolean need_tmp = GL_FALSE;
712
713 if (dst.file != BRW_GENERAL_REGISTER_FILE)
714 need_tmp = GL_TRUE;
715
716 if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
717 need_tmp = GL_TRUE;
718
719 if (need_tmp)
720 tmp = get_tmp(c);
721
722 brw_MOV(p, brw_message_reg(3), arg1);
723
724 brw_math(p,
725 tmp,
726 function,
727 BRW_MATH_SATURATE_NONE,
728 2,
729 arg0,
730 BRW_MATH_DATA_SCALAR,
731 precision);
732
733 if (need_tmp) {
734 brw_MOV(p, dst, tmp);
735 release_tmp(c, tmp);
736 }
737 }
738
739
/* EXP: dst = (2^floor(x), x - floor(x), 2^x, 1.0) with x = arg0.x,
 * honoring dst's writemask.  Caller must guarantee dst does not alias
 * arg0 (presumably routed through unalias1 -- dispatch is outside this
 * chunk).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
795
796
/* LOG: dst = (exponent, mantissa, log2(|x|), 1.0) with x = arg0.x,
 * honoring dst's writemask.  The sign bit is masked off, so the result
 * is based on |x|.  Caller must guarantee dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Staging is needed when dst is partially masked or not a GRF,
    * because tmp is read back as a source below.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* X = unbiased exponent: strip sign, shift the exponent field
       * down, subtract the 127 bias.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Y = mantissa as a float in [1, 2): keep the mantissa bits and
       * force a biased-zero (127) exponent.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
886
887
888 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
889 */
890 static void emit_dst_noalias( struct brw_vs_compile *c,
891 struct brw_reg dst,
892 struct brw_reg arg0,
893 struct brw_reg arg1)
894 {
895 struct brw_compile *p = &c->func;
896
897 /* There must be a better way to do this:
898 */
899 if (dst.dw1.bits.writemask & WRITEMASK_X)
900 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
901 if (dst.dw1.bits.writemask & WRITEMASK_Y)
902 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
903 if (dst.dw1.bits.writemask & WRITEMASK_Z)
904 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
905 if (dst.dw1.bits.writemask & WRITEMASK_W)
906 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
907 }
908
909
/* XPD (cross product): dst = t x u, computed as
 * t.yzx * u.zxy - t.zxy * u.yzx per channel.
 * NOTE(review): the null-dest MUL appears to exist to load the implicit
 * accumulator for the MAC that follows -- instruction order is
 * load-bearing; confirm accumulator semantics in the EU docs.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
918
919
/* LIT (lighting coefficients), approximately:
 *   dst = (1, arg0.x, arg0.y ^ arg0.w, 1) when arg0.x > 0,
 *   dst = (1, 0, 0, 1) otherwise,
 * with arg0.y staged through a predicated MOV so non-positive values
 * leave the previously-initialized 0 in place.  Caller must guarantee
 * dst does not alias arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   /* POW (emit_math2) reads tmp back, so a non-GRF dst needs staging. */
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Start with the arg0.x <= 0 answer: (1, 0, 0, 1). */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Stage arg0.y into tmp.z, but only in channels where it is > 0
       * (predicated MOV), then reset predication.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      /* dst.z = tmp.z ^ arg0.w */
      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): when !need_tmp, tmp aliases dst here; this release is
    * then expected to be a no-op since release_tmp only frees the most
    * recently allocated scratch reg -- confirm dst.nr can't collide with
    * last_tmp - 1.
    */
   release_tmp(c, tmp);
}
961
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Caller must guarantee dst aliases none of the sources (dst is used as
 * an intermediate below).
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   /* dst = 1 - arg0 */
   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   /* NOTE(review): the null-dest MUL appears to load the implicit
    * accumulator with (1 - arg0) * arg2 for the MAC below; statement
    * order is load-bearing -- confirm accumulator behavior in EU docs.
    */
   brw_MUL(p, brw_null_reg(), dst, arg2);
   /* dst = acc + arg0 * arg1 */
   brw_MAC(p, dst, arg0, arg1);
}
974
975 /** 3 or 4-component vector normalization */
976 static void emit_nrm( struct brw_vs_compile *c,
977 struct brw_reg dst,
978 struct brw_reg arg0,
979 int num_comps)
980 {
981 struct brw_compile *p = &c->func;
982 struct brw_reg tmp = get_tmp(c);
983
984 /* tmp = dot(arg0, arg0) */
985 if (num_comps == 3)
986 brw_DP3(p, tmp, arg0, arg0);
987 else
988 brw_DP4(p, tmp, arg0, arg0);
989
990 /* tmp = 1 / sqrt(tmp) */
991 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
992
993 /* dst = arg0 * tmp */
994 brw_MUL(p, dst, arg0, tmp);
995
996 release_tmp(c, tmp);
997 }
998
999
/* Pull-constant fetch for source operand `argIndex` of `inst`: loads
 * the constant at src->Index into that operand's staging register via
 * a data-port read, skipping the read when the slot already holds the
 * wanted constant.  Returns a reg view with the four floats replicated
 * to both vertex halves (XYZWXYZW).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                        /* writeback dest */
                       16 * src->Index,                  /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER      /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
1035
/* Pull-constant fetch for a relative-addressed source operand:
 * address = a0 * 16 + 16 * src->Index bytes into the constant buffer.
 * Always re-reads, since the address register contents aren't tracked.
 * NOTE(review): byte_addr_reg comes from get_tmp() and is not released
 * here -- presumably reclaimed by a later release_tmps(); confirm.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Scale the vec4 address-reg index to a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
1071
1072
1073
1074 /* TODO: relative addressing!
1075 */
1076 static struct brw_reg get_reg( struct brw_vs_compile *c,
1077 gl_register_file file,
1078 GLuint index )
1079 {
1080 switch (file) {
1081 case PROGRAM_TEMPORARY:
1082 case PROGRAM_INPUT:
1083 case PROGRAM_OUTPUT:
1084 assert(c->regs[file][index].nr != 0);
1085 return c->regs[file][index];
1086 case PROGRAM_STATE_VAR:
1087 case PROGRAM_CONSTANT:
1088 case PROGRAM_UNIFORM:
1089 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1090 return c->regs[PROGRAM_STATE_VAR][index];
1091 case PROGRAM_ADDRESS:
1092 assert(index == 0);
1093 return c->regs[file][index];
1094
1095 case PROGRAM_UNDEFINED: /* undef values */
1096 return brw_null_reg();
1097
1098 case PROGRAM_LOCAL_PARAM:
1099 case PROGRAM_ENV_PARAM:
1100 case PROGRAM_WRITE_ONLY:
1101 default:
1102 assert(0);
1103 return brw_null_reg();
1104 }
1105 }
1106
1107
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Emits ALIGN1 code that computes per-vertex byte addresses into a0.0
 * and a0.1 (the two interleaved vertices read different elements when
 * their address-register values differ) and then reads both vec4s
 * through a VxH indirect source into a fresh temporary.
 *
 * \param arg       base register of the indexed array
 * \param offset    constant element offset added to the dynamic index
 * \param reg_size  byte stride of one array element (32 for GRF-sized,
 *                  16 for packed constants)
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset,
                             GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Static part of the address: base register plus constant offset. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = index(vertex 0) * reg_size + byte_offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = index(vertex 1) * reg_size + byte_offset; the second
       * vertex's address lives 4 dwords into the address register.
       */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      /* Pull both vertices' values through the indirect access. */
      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1147
/* Store \p val to a relative-addressed destination (dst[a0.x]).
 *
 * Destination register indirect addressing can only use one index
 * (1x1, not VxH), so each of the two interleaved vertices' vec4
 * values is written with its own address computation and MOV.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
                    const struct prog_instruction *inst,
                    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = index * reg_size + static offset, then store. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: its index is 4 dwords in; its data is the second half
    * of the register (reg_size / 2 bytes further on).
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
           brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1182
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * First tries to fold the source into a hardware float immediate
 * (swizzled all-zero, all-one, or a single replicated component of a
 * PROGRAM_CONSTANT); otherwise dispatches on the register file,
 * handling relative addressing via deref()/get_reladdr_constant()
 * and constant-buffer pulls via get_constant().
 *
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      /* .0000 swizzle is a literal zero regardless of the file. */
      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO)) {
         return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE)) {
         /* .1111 swizzle is +/-1 depending on the negate flag. */
         if (src->Negate)
            return brw_imm_f(-1.0F);
         else
            return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
         const struct gl_program_parameter_list *params;
         float f;
         int component = -1;

         /* Only a single replicated component can become an immediate. */
         switch (src->Swizzle) {
         case SWIZZLE_XXXX:
            component = 0;
            break;
         case SWIZZLE_YYYY:
            component = 1;
            break;
         case SWIZZLE_ZZZZ:
            component = 2;
            break;
         case SWIZZLE_WWWW:
            component = 3;
            break;
         }

         if (component >= 0) {
            params = c->vp->program.Base.Parameters;
            f = params->ParameterValues[src->Index][component];

            /* Apply abs/negate at compile time on the immediate. */
            if (src->Abs)
               f = fabs(f);
            if (src->Negate)
               f = -f;
            return brw_imm_f(f);
         }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* GRF-resident array, 32 bytes per element. */
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         /* Constants live in a surface; use the preloaded copy when one
          * was mapped, otherwise emit a (relative or absolute) pull.
          */
         if (!relAddr && c->constant_map[index] != -1) {
            assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
            return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
         } else if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         /* Constants pushed to the GRF are packed, 16 bytes per element. */
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1292
1293 /**
1294 * Return the brw reg for the given instruction's src argument.
1295 * Will return mangled results for SWZ op. The emit_swz() function
1296 * ignores this result and recalculates taking extended swizzles into
1297 * account.
1298 */
1299 static struct brw_reg get_arg( struct brw_vs_compile *c,
1300 const struct prog_instruction *inst,
1301 GLuint argIndex )
1302 {
1303 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1304 struct brw_reg reg;
1305
1306 if (src->File == PROGRAM_UNDEFINED)
1307 return brw_null_reg();
1308
1309 reg = get_src_reg(c, inst, argIndex);
1310
1311 /* Convert 3-bit swizzle to 2-bit.
1312 */
1313 if (reg.file != BRW_IMMEDIATE_VALUE) {
1314 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1315 GET_SWZ(src->Swizzle, 1),
1316 GET_SWZ(src->Swizzle, 2),
1317 GET_SWZ(src->Swizzle, 3));
1318 }
1319
1320 /* Note this is ok for non-swizzle instructions:
1321 */
1322 reg.negate = src->Negate ? 1 : 0;
1323
1324 return reg;
1325 }
1326
1327
1328 /**
1329 * Get brw register for the given program dest register.
1330 */
1331 static struct brw_reg get_dst( struct brw_vs_compile *c,
1332 struct prog_dst_register dst )
1333 {
1334 struct brw_reg reg;
1335
1336 switch (dst.File) {
1337 case PROGRAM_TEMPORARY:
1338 case PROGRAM_OUTPUT:
1339 /* register-indirect addressing is only 1x1, not VxH, for
1340 * destination regs. So, for RelAddr we'll return a temporary
1341 * for the dest and do a move of the result to the RelAddr
1342 * register after the instruction emit.
1343 */
1344 if (dst.RelAddr) {
1345 reg = get_tmp(c);
1346 } else {
1347 assert(c->regs[dst.File][dst.Index].nr != 0);
1348 reg = c->regs[dst.File][dst.Index];
1349 }
1350 break;
1351 case PROGRAM_ADDRESS:
1352 assert(dst.Index == 0);
1353 reg = c->regs[dst.File][dst.Index];
1354 break;
1355 case PROGRAM_UNDEFINED:
1356 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1357 reg = brw_null_reg();
1358 break;
1359 default:
1360 assert(0);
1361 reg = brw_null_reg();
1362 }
1363
1364 assert(reg.type != BRW_IMMEDIATE_VALUE);
1365 reg.dw1.bits.writemask = dst.WriteMask;
1366
1367 return reg;
1368 }
1369
1370
/* Emit code for OPCODE_SWZ, whose extended swizzle allows per-channel
 * ZERO/ONE selectors and per-channel negation.
 *
 * Channels are partitioned by their swizzle selector into three masks
 * (real source channel, literal zero, literal one) and each group is
 * written with its own MOV.  A temporary is used when negation would
 * otherwise be applied to a non-GRF destination.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle selector. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask here, so only the channels
    * flagged for negation are rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1439
1440
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the vertex header (NDC position pre-gen6, point size, user
 * clip flags, negative-RHW workaround bits), lays out the per-gen VUE
 * header in the message registers, copies the remaining outputs into
 * MRFs, and issues the URB write(s) — a second write is emitted when
 * the outputs overflow the available MRFs.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;

   /* Copy the input edge flag through when fixed-function needs it. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
                 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4)*/
      if (c->key.nr_userclip) {
         for (i = 0; i < c->key.nr_userclip; i++) {
            struct brw_reg m;
            if (i < 4)
               m = brw_message_reg(3);
            else
               m = brw_message_reg(4);

            /* One DP4 per plane; the writemask selects the dword. */
            brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]);
         }
      }
   } else if ((c->prog_data.outputs_written &
               BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
              c->key.nr_userclip || brw->has_negative_rhw_bug) {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale point size into the 11-bit fixed-point header field
          * (bits 8..18 of header dword 3).
          */
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W),
                 header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set a clip flag bit for each user plane the vertex is outside of:
       * the predicated OR only fires when the DP4 result is negative.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 or 5 is the first vertex element data we fill, which is
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.nr_userclip > 0)
         len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
         break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
         continue;

      if (i >= VERT_RESULT_TEX0 &&
          c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
         /* Output ended up in a GRF (e.g. it was also read as a source);
          * copy it into the message payload now.
          */
         brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
         next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
         /* Output was allocated directly in an MRF; just track position. */
         next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   /* Only the last URB write ends the thread; if outputs overflowed,
    * a second write follows below.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1659
/* Return GL_TRUE if the most recently emitted instruction already left
 * exactly \p val in the accumulator, so OPCODE_MAD can skip the
 * accumulator-loading MOV.
 *
 * Only MOV/MAC/MUL are considered (presumably the opcodes that update
 * the accumulator under the acc_write_control setting used here —
 * NOTE(review): confirm against the emitter), and the previous
 * instruction's ALIGN16 destination encoding must match val's
 * register/type exactly with a full xyzw writemask.
 */
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   /* Indirect destinations can't be matched against the encoding. */
   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
          prev_insn->header.execution_size == val.width &&
          prev_insn->bits1.da1.dest_reg_file == val.file &&
          prev_insn->bits1.da1.dest_reg_type == val.type &&
          prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
          prev_insn->bits1.da1.dest_reg_nr == val.nr &&
          prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
          prev_insn->bits1.da16.dest_writemask == 0xf)
         return GL_TRUE;
      else
         return GL_FALSE;
   default:
      return GL_FALSE;
   }
}
1691
1692 static uint32_t
1693 get_predicate(const struct prog_instruction *inst)
1694 {
1695 if (inst->DstReg.CondMask == COND_TR)
1696 return BRW_PREDICATE_NONE;
1697
1698 /* All of GLSL only produces predicates for COND_NE and one channel per
1699 * vector. Fail badly if someone starts doing something else, as it might
1700 * mean infinite looping or something.
1701 *
1702 * We'd like to support all the condition codes, but our hardware doesn't
1703 * quite match the Mesa IR, which is modeled after the NV extensions. For
1704 * those, the instruction may update the condition codes or not, then any
1705 * later instruction may use one of those condition codes. For gen4, the
1706 * instruction may update the flags register based on one of the condition
1707 * codes output by the instruction, and then further instructions may
1708 * predicate on that. We can probably support this, but it won't
1709 * necessarily be easy.
1710 */
1711 assert(inst->DstReg.CondMask == COND_NE);
1712
1713 switch (inst->DstReg.CondSwizzle) {
1714 case SWIZZLE_XXXX:
1715 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1716 case SWIZZLE_YYYY:
1717 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1718 case SWIZZLE_ZZZZ:
1719 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1720 case SWIZZLE_WWWW:
1721 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1722 default:
1723 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1724 inst->DstReg.CondMask);
1725 return BRW_PREDICATE_NORMAL;
1726 }
1727 }
1728
1729 /* Emit the vertex program instructions here.
1730 */
1731 void brw_vs_emit(struct brw_vs_compile *c )
1732 {
1733 #define MAX_IF_DEPTH 32
1734 #define MAX_LOOP_DEPTH 32
1735 struct brw_compile *p = &c->func;
1736 struct brw_context *brw = p->brw;
1737 struct intel_context *intel = &brw->intel;
1738 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1739 GLuint insn, if_depth = 0, loop_depth = 0;
1740 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1741 int if_depth_in_loop[MAX_LOOP_DEPTH];
1742 const struct brw_indirect stack_index = brw_indirect(0, 0);
1743 GLuint index;
1744 GLuint file;
1745
1746 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1747 printf("vs-mesa:\n");
1748 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1749 GL_TRUE);
1750 printf("\n");
1751 }
1752
1753 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1754 brw_set_access_mode(p, BRW_ALIGN_16);
1755 if_depth_in_loop[loop_depth] = 0;
1756
1757 brw_set_acc_write_control(p, 1);
1758
1759 for (insn = 0; insn < nr_insns; insn++) {
1760 GLuint i;
1761 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1762
1763 /* Message registers can't be read, so copy the output into GRF
1764 * register if they are used in source registers
1765 */
1766 for (i = 0; i < 3; i++) {
1767 struct prog_src_register *src = &inst->SrcReg[i];
1768 GLuint index = src->Index;
1769 GLuint file = src->File;
1770 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1771 c->output_regs[index].used_in_src = GL_TRUE;
1772 }
1773
1774 switch (inst->Opcode) {
1775 case OPCODE_CAL:
1776 case OPCODE_RET:
1777 c->needs_stack = GL_TRUE;
1778 break;
1779 default:
1780 break;
1781 }
1782 }
1783
1784 /* Static register allocation
1785 */
1786 brw_vs_alloc_regs(c);
1787
1788 if (c->needs_stack)
1789 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1790
1791 for (insn = 0; insn < nr_insns; insn++) {
1792
1793 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1794 struct brw_reg args[3], dst;
1795 GLuint i;
1796
1797 #if 0
1798 printf("%d: ", insn);
1799 _mesa_print_instruction(inst);
1800 #endif
1801
1802 /* Get argument regs. SWZ is special and does this itself.
1803 */
1804 if (inst->Opcode != OPCODE_SWZ)
1805 for (i = 0; i < 3; i++) {
1806 const struct prog_src_register *src = &inst->SrcReg[i];
1807 index = src->Index;
1808 file = src->File;
1809 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1810 args[i] = c->output_regs[index].reg;
1811 else
1812 args[i] = get_arg(c, inst, i);
1813 }
1814
1815 /* Get dest regs. Note that it is possible for a reg to be both
1816 * dst and arg, given the static allocation of registers. So
1817 * care needs to be taken emitting multi-operation instructions.
1818 */
1819 index = inst->DstReg.Index;
1820 file = inst->DstReg.File;
1821 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1822 dst = c->output_regs[index].reg;
1823 else
1824 dst = get_dst(c, inst->DstReg);
1825
1826 if (inst->SaturateMode != SATURATE_OFF) {
1827 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1828 inst->SaturateMode);
1829 }
1830
1831 switch (inst->Opcode) {
1832 case OPCODE_ABS:
1833 brw_MOV(p, dst, brw_abs(args[0]));
1834 break;
1835 case OPCODE_ADD:
1836 brw_ADD(p, dst, args[0], args[1]);
1837 break;
1838 case OPCODE_COS:
1839 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1840 break;
1841 case OPCODE_DP2:
1842 brw_DP2(p, dst, args[0], args[1]);
1843 break;
1844 case OPCODE_DP3:
1845 brw_DP3(p, dst, args[0], args[1]);
1846 break;
1847 case OPCODE_DP4:
1848 brw_DP4(p, dst, args[0], args[1]);
1849 break;
1850 case OPCODE_DPH:
1851 brw_DPH(p, dst, args[0], args[1]);
1852 break;
1853 case OPCODE_NRM3:
1854 emit_nrm(c, dst, args[0], 3);
1855 break;
1856 case OPCODE_NRM4:
1857 emit_nrm(c, dst, args[0], 4);
1858 break;
1859 case OPCODE_DST:
1860 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1861 break;
1862 case OPCODE_EXP:
1863 unalias1(c, dst, args[0], emit_exp_noalias);
1864 break;
1865 case OPCODE_EX2:
1866 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1867 break;
1868 case OPCODE_ARL:
1869 brw_RNDD(p, dst, args[0]);
1870 break;
1871 case OPCODE_FLR:
1872 brw_RNDD(p, dst, args[0]);
1873 break;
1874 case OPCODE_FRC:
1875 brw_FRC(p, dst, args[0]);
1876 break;
1877 case OPCODE_LOG:
1878 unalias1(c, dst, args[0], emit_log_noalias);
1879 break;
1880 case OPCODE_LG2:
1881 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1882 break;
1883 case OPCODE_LIT:
1884 unalias1(c, dst, args[0], emit_lit_noalias);
1885 break;
1886 case OPCODE_LRP:
1887 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1888 break;
1889 case OPCODE_MAD:
1890 if (!accumulator_contains(c, args[2]))
1891 brw_MOV(p, brw_acc_reg(), args[2]);
1892 brw_MAC(p, dst, args[0], args[1]);
1893 break;
1894 case OPCODE_CMP:
1895 emit_cmp(p, dst, args[0], args[1], args[2]);
1896 break;
1897 case OPCODE_MAX:
1898 emit_max(p, dst, args[0], args[1]);
1899 break;
1900 case OPCODE_MIN:
1901 emit_min(p, dst, args[0], args[1]);
1902 break;
1903 case OPCODE_MOV:
1904 brw_MOV(p, dst, args[0]);
1905 break;
1906 case OPCODE_MUL:
1907 brw_MUL(p, dst, args[0], args[1]);
1908 break;
1909 case OPCODE_POW:
1910 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1911 break;
1912 case OPCODE_RCP:
1913 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1914 break;
1915 case OPCODE_RSQ:
1916 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1917 break;
1918
1919 case OPCODE_SEQ:
1920 unalias2(c, dst, args[0], args[1], emit_seq);
1921 break;
1922 case OPCODE_SIN:
1923 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1924 break;
1925 case OPCODE_SNE:
1926 unalias2(c, dst, args[0], args[1], emit_sne);
1927 break;
1928 case OPCODE_SGE:
1929 unalias2(c, dst, args[0], args[1], emit_sge);
1930 break;
1931 case OPCODE_SGT:
1932 unalias2(c, dst, args[0], args[1], emit_sgt);
1933 break;
1934 case OPCODE_SLT:
1935 unalias2(c, dst, args[0], args[1], emit_slt);
1936 break;
1937 case OPCODE_SLE:
1938 unalias2(c, dst, args[0], args[1], emit_sle);
1939 break;
1940 case OPCODE_SSG:
1941 unalias1(c, dst, args[0], emit_sign);
1942 break;
1943 case OPCODE_SUB:
1944 brw_ADD(p, dst, args[0], negate(args[1]));
1945 break;
1946 case OPCODE_SWZ:
1947 /* The args[0] value can't be used here as it won't have
1948 * correctly encoded the full swizzle:
1949 */
1950 emit_swz(c, dst, inst);
1951 break;
1952 case OPCODE_TRUNC:
1953 /* round toward zero */
1954 brw_RNDZ(p, dst, args[0]);
1955 break;
1956 case OPCODE_XPD:
1957 emit_xpd(p, dst, args[0], args[1]);
1958 break;
1959 case OPCODE_IF:
1960 assert(if_depth < MAX_IF_DEPTH);
1961 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1962 /* Note that brw_IF smashes the predicate_control field. */
1963 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1964 if_depth_in_loop[loop_depth]++;
1965 if_depth++;
1966 break;
1967 case OPCODE_ELSE:
1968 clear_current_const(c);
1969 assert(if_depth > 0);
1970 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1971 break;
1972 case OPCODE_ENDIF:
1973 clear_current_const(c);
1974 assert(if_depth > 0);
1975 brw_ENDIF(p, if_inst[--if_depth]);
1976 if_depth_in_loop[loop_depth]--;
1977 break;
1978 case OPCODE_BGNLOOP:
1979 clear_current_const(c);
1980 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1981 if_depth_in_loop[loop_depth] = 0;
1982 break;
1983 case OPCODE_BRK:
1984 brw_set_predicate_control(p, get_predicate(inst));
1985 brw_BREAK(p, if_depth_in_loop[loop_depth]);
1986 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1987 break;
1988 case OPCODE_CONT:
1989 brw_set_predicate_control(p, get_predicate(inst));
1990 brw_CONT(p, if_depth_in_loop[loop_depth]);
1991 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1992 break;
1993 case OPCODE_ENDLOOP:
1994 {
1995 clear_current_const(c);
1996 struct brw_instruction *inst0, *inst1;
1997 GLuint br = 1;
1998
1999 loop_depth--;
2000
2001 if (intel->gen == 5)
2002 br = 2;
2003
2004 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2005 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2006 while (inst0 > loop_inst[loop_depth]) {
2007 inst0--;
2008 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2009 inst0->bits3.if_else.jump_count == 0) {
2010 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2011 }
2012 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2013 inst0->bits3.if_else.jump_count == 0) {
2014 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2015 }
2016 }
2017 }
2018 break;
2019 case OPCODE_BRA:
2020 brw_set_predicate_control(p, get_predicate(inst));
2021 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2022 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2023 break;
2024 case OPCODE_CAL:
2025 brw_set_access_mode(p, BRW_ALIGN_1);
2026 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2027 brw_set_access_mode(p, BRW_ALIGN_16);
2028 brw_ADD(p, get_addr_reg(stack_index),
2029 get_addr_reg(stack_index), brw_imm_d(4));
2030 brw_save_call(p, inst->Comment, p->nr_insn);
2031 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2032 break;
2033 case OPCODE_RET:
2034 brw_ADD(p, get_addr_reg(stack_index),
2035 get_addr_reg(stack_index), brw_imm_d(-4));
2036 brw_set_access_mode(p, BRW_ALIGN_1);
2037 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2038 brw_set_access_mode(p, BRW_ALIGN_16);
2039 break;
2040 case OPCODE_END:
2041 emit_vertex_write(c);
2042 break;
2043 case OPCODE_PRINT:
2044 /* no-op */
2045 break;
2046 case OPCODE_BGNSUB:
2047 brw_save_label(p, inst->Comment, p->nr_insn);
2048 break;
2049 case OPCODE_ENDSUB:
2050 /* no-op */
2051 break;
2052 default:
2053 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2054 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2055 _mesa_opcode_string(inst->Opcode) :
2056 "unknown");
2057 }
2058
2059 /* Set the predication update on the last instruction of the native
2060 * instruction sequence.
2061 *
2062 * This would be problematic if it was set on a math instruction,
2063 * but that shouldn't be the case with the current GLSL compiler.
2064 */
2065 if (inst->CondUpdate) {
2066 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2067
2068 assert(hw_insn->header.destreg__conditionalmod == 0);
2069 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2070 }
2071
2072 if ((inst->DstReg.File == PROGRAM_OUTPUT)
2073 && (inst->DstReg.Index != VERT_RESULT_HPOS)
2074 && c->output_regs[inst->DstReg.Index].used_in_src) {
2075 brw_MOV(p, get_dst(c, inst->DstReg), dst);
2076 }
2077
2078 /* Result color clamping.
2079 *
2080 * When destination register is an output register and
2081 * it's primary/secondary front/back color, we have to clamp
2082 * the result to [0,1]. This is done by enabling the
2083 * saturation bit for the last instruction.
2084 *
2085 * We don't use brw_set_saturate() as it modifies
2086 * p->current->header.saturate, which affects all the subsequent
2087 * instructions. Instead, we directly modify the header
2088 * of the last (already stored) instruction.
2089 */
2090 if (inst->DstReg.File == PROGRAM_OUTPUT) {
2091 if ((inst->DstReg.Index == VERT_RESULT_COL0)
2092 || (inst->DstReg.Index == VERT_RESULT_COL1)
2093 || (inst->DstReg.Index == VERT_RESULT_BFC0)
2094 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2095 p->store[p->nr_insn-1].header.saturate = 1;
2096 }
2097 }
2098
2099 if (inst->DstReg.RelAddr) {
2100 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2101 inst->DstReg.File == PROGRAM_OUTPUT);
2102 move_to_reladdr_dst(c, inst, dst);
2103 }
2104
2105 release_tmps(c);
2106 }
2107
2108 brw_resolve_cals(p);
2109
2110 brw_optimize(p);
2111
2112 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2113 int i;
2114
2115 printf("vs-native:\n");
2116 for (i = 0; i < p->nr_insn; i++)
2117 brw_disasm(stdout, &p->store[i], intel->gen);
2118 printf("\n");
2119 }
2120 }