7cbf22f2da9391caab4340ce24f94276ae752c09
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout (in allocation order): r0 header, curbe (clip planes +
 * constants), vertex inputs, outputs that overflow the MRFs, program
 * temporaries, address regs, pull-constant staging regs, copies of
 * outputs also read as sources, and the call stack.  Everything after
 * that is scratch space handed out by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two planes per register, 4 floats each, starting 3 regs in. */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Only non-relative constant-style files get a push slot. */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two push constants packed per GRF, vec4 each. */
      for (i = 0; i < constant; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
	 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs; the lower MRFs are
    * reserved for the URB write header (gen5 reserves more).
    */
   if (intel->gen >= 6)
      mrf = 4;
   else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position is computed in a GRF (it needs post-processing
	     * before the URB write).
	     */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       if (!c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One staging register per source slot for pull-constant loads. */
      for (i = 0; i < 3; i++) {
	 c->current_const[i].index = -1;
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* Outputs that are also read as sources get a shadow GRF copy. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
   else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
354
355
356 /**
357 * If an instruction uses a temp reg both as a src and the dest, we
358 * sometimes need to allocate an intermediate temporary.
359 */
360 static void unalias1( struct brw_vs_compile *c,
361 struct brw_reg dst,
362 struct brw_reg arg0,
363 void (*func)( struct brw_vs_compile *,
364 struct brw_reg,
365 struct brw_reg ))
366 {
367 if (dst.file == arg0.file && dst.nr == arg0.nr) {
368 struct brw_compile *p = &c->func;
369 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
370 func(c, tmp, arg0);
371 brw_MOV(p, dst, tmp);
372 release_tmp(c, tmp);
373 }
374 else {
375 func(c, dst, arg0);
376 }
377 }
378
379 /**
380 * \sa unalias2
381 * Checkes if 2-operand instruction needs an intermediate temporary.
382 */
383 static void unalias2( struct brw_vs_compile *c,
384 struct brw_reg dst,
385 struct brw_reg arg0,
386 struct brw_reg arg1,
387 void (*func)( struct brw_vs_compile *,
388 struct brw_reg,
389 struct brw_reg,
390 struct brw_reg ))
391 {
392 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
393 (dst.file == arg1.file && dst.nr == arg1.nr)) {
394 struct brw_compile *p = &c->func;
395 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
396 func(c, tmp, arg0, arg1);
397 brw_MOV(p, dst, tmp);
398 release_tmp(c, tmp);
399 }
400 else {
401 func(c, dst, arg0, arg1);
402 }
403 }
404
405 /**
406 * \sa unalias2
407 * Checkes if 3-operand instruction needs an intermediate temporary.
408 */
409 static void unalias3( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 struct brw_reg arg1,
413 struct brw_reg arg2,
414 void (*func)( struct brw_vs_compile *,
415 struct brw_reg,
416 struct brw_reg,
417 struct brw_reg,
418 struct brw_reg ))
419 {
420 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
421 (dst.file == arg1.file && dst.nr == arg1.nr) ||
422 (dst.file == arg2.file && dst.nr == arg2.nr)) {
423 struct brw_compile *p = &c->func;
424 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
425 func(c, tmp, arg0, arg1, arg2);
426 brw_MOV(p, dst, tmp);
427 release_tmp(c, tmp);
428 }
429 else {
430 func(c, dst, arg0, arg1, arg2);
431 }
432 }
433
/* Set-on-condition: dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
 *
 * dst is cleared to 0.0 first; the CMP to the null register updates the
 * flag and (per the brw_compile state convention) leaves subsequent
 * instructions predicated, so the second MOV writes 1.0 only to passing
 * channels.  The final call restores flag/predication state so later
 * instructions are unpredicated.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
447
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
494
/* CMP: dst = (arg0 < 0) ? arg1 : arg2, per channel.
 *
 * The CMP to null sets the flag; the predicated SEL then picks arg1 in
 * channels where arg0 < 0 and arg2 elsewhere.  Predication is explicitly
 * disabled afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
505
/* SSG: dst = sign(arg0) in {-1, 0, +1}, per channel.
 *
 * Start from 0.0, then overwrite with -1.0 in channels where arg0 < 0
 * and +1.0 where arg0 > 0, using predicated MOVs after each CMP.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
522
/* MAX: dst = max(arg0, arg1), via CMP >= then predicated SEL. */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
532
/* MIN: dst = min(arg0, arg1), via CMP < then predicated SEL. */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
542
543
/* Emit a single-operand math-box operation (RCP, RSQ, EXP, LOG, ...)
 * with the given precision, working around pre-gen6 SEND limitations.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results. So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   /* Pre-gen6 the math result must land in a full GRF; detour through a
    * scratch register if dst is writemasked or not a GRF.
    */
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,			/* message starts at m2 */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
581
582
/* Emit a two-operand math-box operation (POW): the second operand is
 * delivered in message register m3 before the math SEND.  Same pre-gen6
 * scratch-register workaround as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second operand goes into the message payload at m3. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,			/* message starts at m2 */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
616
617
/* EXP opcode (partial-precision exponential building blocks):
 *   dst.x = 2^floor(arg0.x)   (built by bit-twiddling the FP32 exponent)
 *   dst.y = frac(arg0.x)
 *   dst.z = 2^arg0.x          (via the math box)
 *   dst.w = 1.0
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
673
674
/* LOG opcode (partial-precision logarithm building blocks):
 *   dst.x = floor(log2(|arg0.x|))   (unbiased FP32 exponent)
 *   dst.y = mantissa of |arg0.x| as a float in [1, 2)
 *   dst.z = log2(|arg0.x|)          (x + log2(y), via the math box)
 *   dst.w = 1.0
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The Z result reads the X and Y intermediates, so detour through a
    * scratch register unless dst can hold all channels directly.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent down ... */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ... and remove the bias to get the true exponent as a float. */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits, then OR in a biased exponent of 0 so
       * the result reads as a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
764
765
/* DST opcode: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w).
 *
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
786
787
/* XPD (cross product): dst = t x u = t.yzx*u.zxy - t.zxy*u.yzx.
 *
 * The MUL to null loads the accumulator; the MAC then computes
 * -t.zxy*u.yzx + acc, writing the final result to dst.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
796
797
/* LIT opcode: lighting coefficients
 *   dst.x = 1, dst.w = 1, dst.y = dst.z = 0 initially;
 *   if arg0.x > 0: dst.y = arg0.x, dst.z = (max(arg0.y,0)) ^ arg0.w.
 * Caller guarantees dst does not alias arg0 (see unalias1).
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0), via predicated MOV after the CMP. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): called even when !need_tmp (tmp == dst); harmless
    * unless dst.nr happens to equal last_tmp-1 -- verify intent.
    */
   release_tmp(c, tmp);
}
839
/* LRP opcode: dst = arg0 * arg1 + (1 - arg0) * arg2.
 *
 * ADD computes (1 - arg0) into dst, the MUL to null loads
 * (1 - arg0) * arg2 into the accumulator, and MAC finishes with
 * arg0 * arg1 + acc.  Caller guarantees dst does not alias any source.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
852
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|. */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
876
877
/* Fetch a non-relative constant from the pull constant buffer into the
 * staging register reserved for this source slot, reusing the previous
 * load when the same constant index is already resident there.
 * Returns the staging register with the vec4 replicated to XYZWXYZW.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
911
/* Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer.  The address register holds a vec4 index; it is
 * scaled by 16 to a byte offset for the data-port read.  The slot cache
 * is invalidated since a reladdr load can't be reused.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   /* NOTE(review): this scratch register is never released -- looks like
    * a temp leak within the instruction; verify against release_tmps()
    * usage in the emit loop.
    */
   struct brw_reg byte_addr_reg = get_tmp(c);

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Convert the vec4 address to a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
945
/* Look up the hardware register preallocated for (file, index).
 * Constant-style files all map through the PROGRAM_STATE_VAR slot set
 * up by brw_vs_alloc_regs().
 *
 * TODO: relative addressing!
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:                   /* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
980
981
/**
 * Indirect addressing:  get reg[[arg] + offset].
 *
 * The vertex shader runs two vertices at a time; a0.0 is loaded with
 * the byte address for the first vertex's components and a0.1 for the
 * second, then a single indirect MOV gathers both into a scratch reg
 * (which the caller is responsible for, see note at the end).
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): acc is a scratch reg that is never released here;
    * presumably reclaimed by a later release_tmps() -- verify.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = vertex 0's byte address */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = vertex 1's byte address (address reg subregister 4) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1021
/* Store `val` to a relative-addressed destination (dst[a0.x + Index]).
 *
 * Two ALIGN1 indirect MOVs are emitted: the first writes vertex 0's
 * four components via a0.0, the second rewrites a0.0 for vertex 1's
 * half (byte_offset + reg_size/2) and writes suboffset(val, 4).
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
   GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   /* NOTE(review): scratch reg never released here; presumably reclaimed
    * by a later release_tmps() -- verify.
    */
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   byte_offset += inst->DstReg.Index * reg_size;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = base + a0-relative offset, then indirect store. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: same address reg, second half of the register pair. */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1052
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * First tries to fold the source into an immediate float (all-zero /
 * all-one swizzles, or a scalar-swizzled PROGRAM_CONSTANT whose value
 * is known at compile time and the opcode allows an immediate in this
 * slot).  Otherwise falls back to the preallocated registers, push or
 * pull constants, or indirect (reladdr) reads.
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only uniform scalar swizzles can become a single immediate. */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 /* Push-allocated constants live in GRFs; everything else goes
	  * through the pull constant buffer.
	  */
	 if (!relAddr && c->constant_map[index] != -1) {
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1162
1163 /**
1164 * Return the brw reg for the given instruction's src argument.
1165 * Will return mangled results for SWZ op. The emit_swz() function
1166 * ignores this result and recalculates taking extended swizzles into
1167 * account.
1168 */
1169 static struct brw_reg get_arg( struct brw_vs_compile *c,
1170 const struct prog_instruction *inst,
1171 GLuint argIndex )
1172 {
1173 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1174 struct brw_reg reg;
1175
1176 if (src->File == PROGRAM_UNDEFINED)
1177 return brw_null_reg();
1178
1179 reg = get_src_reg(c, inst, argIndex);
1180
1181 /* Convert 3-bit swizzle to 2-bit.
1182 */
1183 if (reg.file != BRW_IMMEDIATE_VALUE) {
1184 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1185 GET_SWZ(src->Swizzle, 1),
1186 GET_SWZ(src->Swizzle, 2),
1187 GET_SWZ(src->Swizzle, 3));
1188 }
1189
1190 /* Note this is ok for non-swizzle instructions:
1191 */
1192 reg.negate = src->Negate ? 1 : 0;
1193
1194 return reg;
1195 }
1196
1197
1198 /**
1199 * Get brw register for the given program dest register.
1200 */
1201 static struct brw_reg get_dst( struct brw_vs_compile *c,
1202 struct prog_dst_register dst )
1203 {
1204 struct brw_reg reg;
1205
1206 switch (dst.File) {
1207 case PROGRAM_TEMPORARY:
1208 case PROGRAM_OUTPUT:
1209 /* register-indirect addressing is only 1x1, not VxH, for
1210 * destination regs. So, for RelAddr we'll return a temporary
1211 * for the dest and do a move of the result to the RelAddr
1212 * register after the instruction emit.
1213 */
1214 if (dst.RelAddr) {
1215 reg = get_tmp(c);
1216 } else {
1217 assert(c->regs[dst.File][dst.Index].nr != 0);
1218 reg = c->regs[dst.File][dst.Index];
1219 }
1220 break;
1221 case PROGRAM_ADDRESS:
1222 assert(dst.Index == 0);
1223 reg = c->regs[dst.File][dst.Index];
1224 break;
1225 case PROGRAM_UNDEFINED:
1226 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1227 reg = brw_null_reg();
1228 break;
1229 default:
1230 assert(0);
1231 reg = brw_null_reg();
1232 }
1233
1234 assert(reg.type != BRW_IMMEDIATE_VALUE);
1235 reg.dw1.bits.writemask = dst.WriteMask;
1236
1237 return reg;
1238 }
1239
1240
/**
 * Emit an OPCODE_SWZ instruction, whose extended swizzle supports the
 * ZERO and ONE components in addition to X/Y/Z/W, plus per-channel
 * negation.  Implemented as up to four MOVs with disjoint writemasks.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate pass below reads tmp back, which a message register
    * (non-GRF) destination can't support -- stage through a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Partition the written channels into: taken from the source,
    * constant 0.0, and constant 1.0.
    */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is used directly as the writemask here, i.e. it is
    * treated as a per-channel negate bitmask (NEGATE_X..W) -- only the
    * negated channels are rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1309
1310
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Assembles the per-vertex VUE header (clip flags, point size, NDC
 * position -- the layout varies per hardware generation) in the message
 * registers, then issues one URB write, plus a second one if not all
 * outputs fit within the MRF.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale the float point size into the header's point-width
          * field and mask it down to the field's 11 bits.
          * NOTE(review): the shift/mask values encode the field's
          * position in the VUE header -- confirm against the PRM.
          */
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One clip-flag bit per user clip plane: the DP4 sets the flag
       * register (conditional-mod L), and the OR is predicated on it.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.  We don't use it, so skip it.
       * m3 is the first vertex element data we fill, which is the vertex
       * position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Terminate the thread with this write only if there is no second
    * (overflow) write below.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf,            /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1483
1484 static GLboolean
1485 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1486 {
1487 struct brw_compile *p = &c->func;
1488 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1489
1490 if (p->nr_insn == 0)
1491 return GL_FALSE;
1492
1493 if (val.address_mode != BRW_ADDRESS_DIRECT)
1494 return GL_FALSE;
1495
1496 switch (prev_insn->header.opcode) {
1497 case BRW_OPCODE_MOV:
1498 case BRW_OPCODE_MAC:
1499 case BRW_OPCODE_MUL:
1500 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1501 prev_insn->header.execution_size == val.width &&
1502 prev_insn->bits1.da1.dest_reg_file == val.file &&
1503 prev_insn->bits1.da1.dest_reg_type == val.type &&
1504 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1505 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1506 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1507 prev_insn->bits1.da16.dest_writemask == 0xf)
1508 return GL_TRUE;
1509 else
1510 return GL_FALSE;
1511 default:
1512 return GL_FALSE;
1513 }
1514 }
1515
1516 static uint32_t
1517 get_predicate(const struct prog_instruction *inst)
1518 {
1519 if (inst->DstReg.CondMask == COND_TR)
1520 return BRW_PREDICATE_NONE;
1521
1522 /* All of GLSL only produces predicates for COND_NE and one channel per
1523 * vector. Fail badly if someone starts doing something else, as it might
1524 * mean infinite looping or something.
1525 *
1526 * We'd like to support all the condition codes, but our hardware doesn't
1527 * quite match the Mesa IR, which is modeled after the NV extensions. For
1528 * those, the instruction may update the condition codes or not, then any
1529 * later instruction may use one of those condition codes. For gen4, the
1530 * instruction may update the flags register based on one of the condition
1531 * codes output by the instruction, and then further instructions may
1532 * predicate on that. We can probably support this, but it won't
1533 * necessarily be easy.
1534 */
1535 assert(inst->DstReg.CondMask == COND_NE);
1536
1537 switch (inst->DstReg.CondSwizzle) {
1538 case SWIZZLE_XXXX:
1539 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1540 case SWIZZLE_YYYY:
1541 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1542 case SWIZZLE_ZZZZ:
1543 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1544 case SWIZZLE_WWWW:
1545 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1546 default:
1547 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1548 inst->DstReg.CondMask);
1549 return BRW_PREDICATE_NORMAL;
1550 }
1551 }
1552
1553 /* Emit the vertex program instructions here.
1554 */
1555 void brw_vs_emit(struct brw_vs_compile *c )
1556 {
1557 #define MAX_IF_DEPTH 32
1558 #define MAX_LOOP_DEPTH 32
1559 struct brw_compile *p = &c->func;
1560 struct brw_context *brw = p->brw;
1561 struct intel_context *intel = &brw->intel;
1562 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1563 GLuint insn, if_depth = 0, loop_depth = 0;
1564 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1565 const struct brw_indirect stack_index = brw_indirect(0, 0);
1566 GLuint index;
1567 GLuint file;
1568
1569 if (INTEL_DEBUG & DEBUG_VS) {
1570 printf("vs-mesa:\n");
1571 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1572 GL_TRUE);
1573 printf("\n");
1574 }
1575
1576 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1577 brw_set_access_mode(p, BRW_ALIGN_16);
1578
1579 for (insn = 0; insn < nr_insns; insn++) {
1580 GLuint i;
1581 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1582
1583 /* Message registers can't be read, so copy the output into GRF
1584 * register if they are used in source registers
1585 */
1586 for (i = 0; i < 3; i++) {
1587 struct prog_src_register *src = &inst->SrcReg[i];
1588 GLuint index = src->Index;
1589 GLuint file = src->File;
1590 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1591 c->output_regs[index].used_in_src = GL_TRUE;
1592 }
1593
1594 switch (inst->Opcode) {
1595 case OPCODE_CAL:
1596 case OPCODE_RET:
1597 c->needs_stack = GL_TRUE;
1598 break;
1599 default:
1600 break;
1601 }
1602 }
1603
1604 /* Static register allocation
1605 */
1606 brw_vs_alloc_regs(c);
1607
1608 if (c->needs_stack)
1609 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1610
1611 for (insn = 0; insn < nr_insns; insn++) {
1612
1613 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1614 struct brw_reg args[3], dst;
1615 GLuint i;
1616
1617 #if 0
1618 printf("%d: ", insn);
1619 _mesa_print_instruction(inst);
1620 #endif
1621
1622 /* Get argument regs. SWZ is special and does this itself.
1623 */
1624 if (inst->Opcode != OPCODE_SWZ)
1625 for (i = 0; i < 3; i++) {
1626 const struct prog_src_register *src = &inst->SrcReg[i];
1627 index = src->Index;
1628 file = src->File;
1629 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1630 args[i] = c->output_regs[index].reg;
1631 else
1632 args[i] = get_arg(c, inst, i);
1633 }
1634
1635 /* Get dest regs. Note that it is possible for a reg to be both
1636 * dst and arg, given the static allocation of registers. So
1637 * care needs to be taken emitting multi-operation instructions.
1638 */
1639 index = inst->DstReg.Index;
1640 file = inst->DstReg.File;
1641 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1642 dst = c->output_regs[index].reg;
1643 else
1644 dst = get_dst(c, inst->DstReg);
1645
1646 if (inst->SaturateMode != SATURATE_OFF) {
1647 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1648 inst->SaturateMode);
1649 }
1650
1651 switch (inst->Opcode) {
1652 case OPCODE_ABS:
1653 brw_MOV(p, dst, brw_abs(args[0]));
1654 break;
1655 case OPCODE_ADD:
1656 brw_ADD(p, dst, args[0], args[1]);
1657 break;
1658 case OPCODE_COS:
1659 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1660 break;
1661 case OPCODE_DP2:
1662 brw_DP2(p, dst, args[0], args[1]);
1663 break;
1664 case OPCODE_DP3:
1665 brw_DP3(p, dst, args[0], args[1]);
1666 break;
1667 case OPCODE_DP4:
1668 brw_DP4(p, dst, args[0], args[1]);
1669 break;
1670 case OPCODE_DPH:
1671 brw_DPH(p, dst, args[0], args[1]);
1672 break;
1673 case OPCODE_NRM3:
1674 emit_nrm(c, dst, args[0], 3);
1675 break;
1676 case OPCODE_NRM4:
1677 emit_nrm(c, dst, args[0], 4);
1678 break;
1679 case OPCODE_DST:
1680 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1681 break;
1682 case OPCODE_EXP:
1683 unalias1(c, dst, args[0], emit_exp_noalias);
1684 break;
1685 case OPCODE_EX2:
1686 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1687 break;
1688 case OPCODE_ARL:
1689 brw_RNDD(p, dst, args[0]);
1690 break;
1691 case OPCODE_FLR:
1692 brw_RNDD(p, dst, args[0]);
1693 break;
1694 case OPCODE_FRC:
1695 brw_FRC(p, dst, args[0]);
1696 break;
1697 case OPCODE_LOG:
1698 unalias1(c, dst, args[0], emit_log_noalias);
1699 break;
1700 case OPCODE_LG2:
1701 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1702 break;
1703 case OPCODE_LIT:
1704 unalias1(c, dst, args[0], emit_lit_noalias);
1705 break;
1706 case OPCODE_LRP:
1707 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1708 break;
1709 case OPCODE_MAD:
1710 if (!accumulator_contains(c, args[2]))
1711 brw_MOV(p, brw_acc_reg(), args[2]);
1712 brw_MAC(p, dst, args[0], args[1]);
1713 break;
1714 case OPCODE_CMP:
1715 emit_cmp(p, dst, args[0], args[1], args[2]);
1716 break;
1717 case OPCODE_MAX:
1718 emit_max(p, dst, args[0], args[1]);
1719 break;
1720 case OPCODE_MIN:
1721 emit_min(p, dst, args[0], args[1]);
1722 break;
1723 case OPCODE_MOV:
1724 brw_MOV(p, dst, args[0]);
1725 break;
1726 case OPCODE_MUL:
1727 brw_MUL(p, dst, args[0], args[1]);
1728 break;
1729 case OPCODE_POW:
1730 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1731 break;
1732 case OPCODE_RCP:
1733 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1734 break;
1735 case OPCODE_RSQ:
1736 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1737 break;
1738
1739 case OPCODE_SEQ:
1740 unalias2(c, dst, args[0], args[1], emit_seq);
1741 break;
1742 case OPCODE_SIN:
1743 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1744 break;
1745 case OPCODE_SNE:
1746 unalias2(c, dst, args[0], args[1], emit_sne);
1747 break;
1748 case OPCODE_SGE:
1749 unalias2(c, dst, args[0], args[1], emit_sge);
1750 break;
1751 case OPCODE_SGT:
1752 unalias2(c, dst, args[0], args[1], emit_sgt);
1753 break;
1754 case OPCODE_SLT:
1755 unalias2(c, dst, args[0], args[1], emit_slt);
1756 break;
1757 case OPCODE_SLE:
1758 unalias2(c, dst, args[0], args[1], emit_sle);
1759 break;
1760 case OPCODE_SSG:
1761 unalias1(c, dst, args[0], emit_sign);
1762 break;
1763 case OPCODE_SUB:
1764 brw_ADD(p, dst, args[0], negate(args[1]));
1765 break;
1766 case OPCODE_SWZ:
1767 /* The args[0] value can't be used here as it won't have
1768 * correctly encoded the full swizzle:
1769 */
1770 emit_swz(c, dst, inst);
1771 break;
1772 case OPCODE_TRUNC:
1773 /* round toward zero */
1774 brw_RNDZ(p, dst, args[0]);
1775 break;
1776 case OPCODE_XPD:
1777 emit_xpd(p, dst, args[0], args[1]);
1778 break;
1779 case OPCODE_IF:
1780 assert(if_depth < MAX_IF_DEPTH);
1781 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1782 /* Note that brw_IF smashes the predicate_control field. */
1783 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1784 if_depth++;
1785 break;
1786 case OPCODE_ELSE:
1787 assert(if_depth > 0);
1788 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1789 break;
1790 case OPCODE_ENDIF:
1791 assert(if_depth > 0);
1792 brw_ENDIF(p, if_inst[--if_depth]);
1793 break;
1794 case OPCODE_BGNLOOP:
1795 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1796 break;
1797 case OPCODE_BRK:
1798 brw_set_predicate_control(p, get_predicate(inst));
1799 brw_BREAK(p);
1800 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1801 break;
1802 case OPCODE_CONT:
1803 brw_set_predicate_control(p, get_predicate(inst));
1804 brw_CONT(p);
1805 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1806 break;
1807 case OPCODE_ENDLOOP:
1808 {
1809 struct brw_instruction *inst0, *inst1;
1810 GLuint br = 1;
1811
1812 loop_depth--;
1813
1814 if (intel->gen == 5)
1815 br = 2;
1816
1817 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1818 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1819 while (inst0 > loop_inst[loop_depth]) {
1820 inst0--;
1821 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1822 inst0->bits3.if_else.jump_count == 0) {
1823 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1824 inst0->bits3.if_else.pop_count = 0;
1825 }
1826 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1827 inst0->bits3.if_else.jump_count == 0) {
1828 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1829 inst0->bits3.if_else.pop_count = 0;
1830 }
1831 }
1832 }
1833 break;
1834 case OPCODE_BRA:
1835 brw_set_predicate_control(p, get_predicate(inst));
1836 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1837 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1838 break;
1839 case OPCODE_CAL:
1840 brw_set_access_mode(p, BRW_ALIGN_1);
1841 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1842 brw_set_access_mode(p, BRW_ALIGN_16);
1843 brw_ADD(p, get_addr_reg(stack_index),
1844 get_addr_reg(stack_index), brw_imm_d(4));
1845 brw_save_call(p, inst->Comment, p->nr_insn);
1846 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1847 break;
1848 case OPCODE_RET:
1849 brw_ADD(p, get_addr_reg(stack_index),
1850 get_addr_reg(stack_index), brw_imm_d(-4));
1851 brw_set_access_mode(p, BRW_ALIGN_1);
1852 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1853 brw_set_access_mode(p, BRW_ALIGN_16);
1854 break;
1855 case OPCODE_END:
1856 emit_vertex_write(c);
1857 break;
1858 case OPCODE_PRINT:
1859 /* no-op */
1860 break;
1861 case OPCODE_BGNSUB:
1862 brw_save_label(p, inst->Comment, p->nr_insn);
1863 break;
1864 case OPCODE_ENDSUB:
1865 /* no-op */
1866 break;
1867 default:
1868 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1869 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1870 _mesa_opcode_string(inst->Opcode) :
1871 "unknown");
1872 }
1873
1874 /* Set the predication update on the last instruction of the native
1875 * instruction sequence.
1876 *
1877 * This would be problematic if it was set on a math instruction,
1878 * but that shouldn't be the case with the current GLSL compiler.
1879 */
1880 if (inst->CondUpdate) {
1881 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1882
1883 assert(hw_insn->header.destreg__conditionalmod == 0);
1884 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1885 }
1886
1887 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1888 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1889 && c->output_regs[inst->DstReg.Index].used_in_src) {
1890 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1891 }
1892
1893 /* Result color clamping.
1894 *
1895 * When destination register is an output register and
1896 * it's primary/secondary front/back color, we have to clamp
1897 * the result to [0,1]. This is done by enabling the
1898 * saturation bit for the last instruction.
1899 *
1900 * We don't use brw_set_saturate() as it modifies
1901 * p->current->header.saturate, which affects all the subsequent
1902 * instructions. Instead, we directly modify the header
1903 * of the last (already stored) instruction.
1904 */
1905 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1906 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1907 || (inst->DstReg.Index == VERT_RESULT_COL1)
1908 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1909 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1910 p->store[p->nr_insn-1].header.saturate = 1;
1911 }
1912 }
1913
1914 if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
1915 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1916 * compute-to-mrf and the fact that we are allocating
1917 * registers for only the used PROGRAM_OUTPUTs.
1918 */
1919 move_to_reladdr_dst(c, inst, dst);
1920 }
1921
1922 release_tmps(c);
1923 }
1924
1925 brw_resolve_cals(p);
1926
1927 brw_optimize(p);
1928
1929 if (INTEL_DEBUG & DEBUG_VS) {
1930 int i;
1931
1932 printf("vs-native:\n");
1933 for (i = 0; i < p->nr_insn; i++)
1934 brw_disasm(stdout, &p->store[i], intel->gen);
1935 printf("\n");
1936 }
1937 }