2f4653fbda743d8d997b654645d91b1e7a5e9b53
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101
102 /**
103 * Preallocate GRF register before code emit.
104 * Do things as simply as possible. Allocate and populate all regs
105 * ahead of time.
106 */
107 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
108 {
109 struct intel_context *intel = &c->func.brw->intel;
110 GLuint i, reg = 0, mrf;
111 int attributes_in_vue;
112
113 /* Determine whether to use a real constant buffer or use a block
114 * of GRF registers for constants. The later is faster but only
115 * works if everything fits in the GRF.
116 * XXX this heuristic/check may need some fine tuning...
117 */
118 if (c->vp->program.Base.Parameters->NumParameters +
119 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
120 c->vp->use_const_buffer = GL_TRUE;
121 else
122 c->vp->use_const_buffer = GL_FALSE;
123
124 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
125
126 /* r0 -- reserved as usual
127 */
128 c->r0 = brw_vec8_grf(reg, 0);
129 reg++;
130
131 /* User clip planes from curbe:
132 */
133 if (c->key.nr_userclip) {
134 for (i = 0; i < c->key.nr_userclip; i++) {
135 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
136 }
137
138 /* Deal with curbe alignment:
139 */
140 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
141 }
142
143 /* Vertex program parameters from curbe:
144 */
145 if (c->vp->use_const_buffer) {
146 int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
147 int constant = 0;
148
149 /* We've got more constants than we can load with the push
150 * mechanism. This is often correlated with reladdr loads where
151 * we should probably be using a pull mechanism anyway to avoid
152 * excessive reading. However, the pull mechanism is slow in
153 * general. So, we try to allocate as many non-reladdr-loaded
154 * constants through the push buffer as we can before giving up.
155 */
156 memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
157 for (i = 0;
158 i < c->vp->program.Base.NumInstructions && constant < max_constant;
159 i++) {
160 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
161 int arg;
162
163 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
164 if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
165 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
166 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
167 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
168 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
169 inst->SrcReg[arg].RelAddr)
170 continue;
171
172 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
173 c->constant_map[inst->SrcReg[arg].Index] = constant++;
174 }
175 }
176 }
177
   /* The constants that made it into the push window get two vec4s
    * packed per GRF, same layout as the non-const-buffer path below.
    */
178 for (i = 0; i < constant; i++) {
179 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
180 (i%2) * 4),
181 0, 4, 1);
182 }
183 reg += (constant + 1) / 2;
184 c->prog_data.curb_read_length = reg - 1;
185 /* XXX 0 causes a bug elsewhere... */
186 c->prog_data.nr_params = MAX2(constant * 4, 4);
187 }
188 else {
189 /* use a section of the GRF for constants */
190 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
191 for (i = 0; i < nr_params; i++) {
192 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
193 }
194 reg += (nr_params + 1) / 2;
195 c->prog_data.curb_read_length = reg - 1;
196
197 c->prog_data.nr_params = nr_params * 4;
198 }
199
200 /* Allocate input regs:
201 */
202 c->nr_inputs = 0;
203 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
204 if (c->prog_data.inputs_read & (1 << i)) {
205 c->nr_inputs++;
206 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
207 reg++;
208 }
209 }
210 /* If there are no inputs, we'll still be reading one attribute's worth
211 * because it's required -- see urb_read_length setting.
212 */
213 if (c->nr_inputs == 0)
214 reg++;
215
216 /* Allocate outputs. The non-position outputs go straight into message regs.
217 */
218 c->nr_outputs = 0;
219 c->first_output = reg;
220 c->first_overflow_output = 0;
221
   /* NOTE(review): first MRF usable for output data; gen5 starts at m8,
    * everything else at m4 -- presumably the URB-write header layout
    * differs per generation.  Confirm against emit_vertex_write().
    */
222 if (intel->gen >= 6)
223 mrf = 4;
224 else if (intel->gen == 5)
225 mrf = 8;
226 else
227 mrf = 4;
228
229 for (i = 0; i < VERT_RESULT_MAX; i++) {
230 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
231 c->nr_outputs++;
232 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
233 if (i == VERT_RESULT_HPOS) {
234 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
235 reg++;
236 }
237 else if (i == VERT_RESULT_PSIZ) {
238 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
239 reg++;
240 mrf++; /* just a placeholder? XXX fix later stages & remove this */
241 }
242 else {
243 /* Two restrictions on our compute-to-MRF here. The
244 * message length for all SEND messages is restricted to
245 * [1,15], so we can't use mrf 15, as that means a length
246 * of 16.
247 *
248 * Additionally, URB writes are aligned to URB rows, so we
249 * need to put an even number of registers of URB data in
250 * each URB write so that the later write is aligned. A
251 * message length of 15 means 1 message header reg plus 14
252 * regs of URB data.
253 *
254 * For attributes beyond the compute-to-MRF, we compute to
255 * GRFs and they will be written in the second URB_WRITE.
256 */
257 if (mrf < 15) {
258 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
259 mrf++;
260 }
261 else {
262 if (!c->first_overflow_output)
263 c->first_overflow_output = i;
264 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
265 reg++;
266 }
267 }
268 }
269 }
270
271 /* Allocate program temporaries:
272 */
273 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
274 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
275 reg++;
276 }
277
278 /* Address reg(s). Don't try to use the internal address reg until
279 * deref time.
280 */
281 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
282 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
283 reg,
284 0,
285 BRW_REGISTER_TYPE_D,
286 BRW_VERTICAL_STRIDE_8,
287 BRW_WIDTH_8,
288 BRW_HORIZONTAL_STRIDE_1,
289 BRW_SWIZZLE_XXXX,
290 WRITEMASK_X);
291 reg++;
292 }
293
   /* Scratch registers used by get_constant()/get_reladdr_constant()
    * as write-back destinations for pull-constant reads, one per
    * possible source argument.
    */
294 if (c->vp->use_const_buffer) {
295 for (i = 0; i < 3; i++) {
296 c->current_const[i].index = -1;
297 c->current_const[i].reg = brw_vec8_grf(reg, 0);
298 reg++;
299 }
300 }
301
   /* Outputs that are also read as sources get a GRF shadow copy,
    * since MRFs cannot be read back.  NOTE(review): 128 looks like the
    * size of the output_regs array -- confirm against brw_vs.h.
    */
302 for (i = 0; i < 128; i++) {
303 if (c->output_regs[i].used_in_src) {
304 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
305 reg++;
306 }
307 }
308
309 if (c->needs_stack) {
310 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
311 reg += 2;
312 }
313
314 /* Some opcodes need an internal temporary:
315 */
316 c->first_tmp = reg;
317 c->last_tmp = reg; /* for allocation purposes */
318
319 /* Each input reg holds data from two vertices. The
320 * urb_read_length is the number of registers read from *each*
321 * vertex urb, so is half the amount:
322 */
323 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
324 /* Setting this field to 0 leads to undefined behavior according to the
325 * the VS_STATE docs. Our VUEs will always have at least one attribute
326 * sitting in them, even if it's padding.
327 */
328 if (c->prog_data.urb_read_length == 0)
329 c->prog_data.urb_read_length = 1;
330
331 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
332 * them to fit the biggest thing they need to.
333 */
334 attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
335
336 /* See emit_vertex_write() for where the VUE's overhead on top of the
337 * attributes comes from.
338 */
339 if (intel->gen >= 6)
340 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
341 else if (intel->gen == 5)
342 c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
343 else
344 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
345
346 c->prog_data.total_grf = reg;
347
348 if (INTEL_DEBUG & DEBUG_VS) {
349 printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
350 printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
351 printf("%s reg = %d\n", __FUNCTION__, reg);
352 }
353 }
354
355
356 /**
357 * If an instruction uses a temp reg both as a src and the dest, we
358 * sometimes need to allocate an intermediate temporary.
359 */
360 static void unalias1( struct brw_vs_compile *c,
361 struct brw_reg dst,
362 struct brw_reg arg0,
363 void (*func)( struct brw_vs_compile *,
364 struct brw_reg,
365 struct brw_reg ))
366 {
367 if (dst.file == arg0.file && dst.nr == arg0.nr) {
368 struct brw_compile *p = &c->func;
369 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
370 func(c, tmp, arg0);
371 brw_MOV(p, dst, tmp);
372 release_tmp(c, tmp);
373 }
374 else {
375 func(c, dst, arg0);
376 }
377 }
378
379 /**
380 * \sa unalias2
381 * Checkes if 2-operand instruction needs an intermediate temporary.
382 */
383 static void unalias2( struct brw_vs_compile *c,
384 struct brw_reg dst,
385 struct brw_reg arg0,
386 struct brw_reg arg1,
387 void (*func)( struct brw_vs_compile *,
388 struct brw_reg,
389 struct brw_reg,
390 struct brw_reg ))
391 {
392 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
393 (dst.file == arg1.file && dst.nr == arg1.nr)) {
394 struct brw_compile *p = &c->func;
395 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
396 func(c, tmp, arg0, arg1);
397 brw_MOV(p, dst, tmp);
398 release_tmp(c, tmp);
399 }
400 else {
401 func(c, dst, arg0, arg1);
402 }
403 }
404
405 /**
406 * \sa unalias2
407 * Checkes if 3-operand instruction needs an intermediate temporary.
408 */
409 static void unalias3( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 struct brw_reg arg1,
413 struct brw_reg arg2,
414 void (*func)( struct brw_vs_compile *,
415 struct brw_reg,
416 struct brw_reg,
417 struct brw_reg,
418 struct brw_reg ))
419 {
420 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
421 (dst.file == arg1.file && dst.nr == arg1.nr) ||
422 (dst.file == arg2.file && dst.nr == arg2.nr)) {
423 struct brw_compile *p = &c->func;
424 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
425 func(c, tmp, arg0, arg1, arg2);
426 brw_MOV(p, dst, tmp);
427 release_tmp(c, tmp);
428 }
429 else {
430 func(c, dst, arg0, arg1, arg2);
431 }
432 }
433
434 static void emit_sop( struct brw_vs_compile *c,
435 struct brw_reg dst,
436 struct brw_reg arg0,
437 struct brw_reg arg1,
438 GLuint cond)
439 {
440 struct brw_compile *p = &c->func;
441
   /* dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
    * The CMP to the null register sets the flag, and the following MOV
    * of 1.0 lands only in the channels that passed; the final call
    * restores an all-ones flag value so later instructions are not
    * predicated.  NOTE(review): this relies on brw_CMP()/brw_MOV()
    * predication side effects implemented in brw_eu_emit.c -- confirm
    * there before reordering these statements.
    */
442 brw_MOV(p, dst, brw_imm_f(0.0f));
443 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
444 brw_MOV(p, dst, brw_imm_f(1.0f));
445 brw_set_predicate_control_flag_value(p, 0xff);
446 }
447
/* SEQ: per-channel dst = (arg0 == arg1) ? 1.0 : 0.0 */
448 static void emit_seq( struct brw_vs_compile *c,
449 struct brw_reg dst,
450 struct brw_reg arg0,
451 struct brw_reg arg1 )
452 {
453 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
454 }
455
/* SNE: per-channel dst = (arg0 != arg1) ? 1.0 : 0.0 */
456 static void emit_sne( struct brw_vs_compile *c,
457 struct brw_reg dst,
458 struct brw_reg arg0,
459 struct brw_reg arg1 )
460 {
461 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
462 }
/* SLT: per-channel dst = (arg0 < arg1) ? 1.0 : 0.0 */
463 static void emit_slt( struct brw_vs_compile *c,
464 struct brw_reg dst,
465 struct brw_reg arg0,
466 struct brw_reg arg1 )
467 {
468 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
469 }
470
/* SLE: per-channel dst = (arg0 <= arg1) ? 1.0 : 0.0 */
471 static void emit_sle( struct brw_vs_compile *c,
472 struct brw_reg dst,
473 struct brw_reg arg0,
474 struct brw_reg arg1 )
475 {
476 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
477 }
478
/* SGT: per-channel dst = (arg0 > arg1) ? 1.0 : 0.0 */
479 static void emit_sgt( struct brw_vs_compile *c,
480 struct brw_reg dst,
481 struct brw_reg arg0,
482 struct brw_reg arg1 )
483 {
484 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
485 }
486
/* SGE: per-channel dst = (arg0 >= arg1) ? 1.0 : 0.0 */
487 static void emit_sge( struct brw_vs_compile *c,
488 struct brw_reg dst,
489 struct brw_reg arg0,
490 struct brw_reg arg1 )
491 {
492 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
493 }
494
/* CMP: per-channel dst = (arg0 < 0) ? arg1 : arg2.
 * The CMP to null sets the flag; the SEL is predicated on it; the
 * final call clears predication for subsequent instructions.
 */
495 static void emit_cmp( struct brw_compile *p,
496 struct brw_reg dst,
497 struct brw_reg arg0,
498 struct brw_reg arg1,
499 struct brw_reg arg2 )
500 {
501 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
502 brw_SEL(p, dst, arg1, arg2);
503 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
504 }
505
/* SSG: per-channel dst = -1.0, 0.0 or 1.0 according to the sign of
 * arg0.  Start with 0, then overwrite the negative channels with -1
 * and the positive channels with +1 via predicated MOVs.
 */
506 static void emit_sign(struct brw_vs_compile *c,
507 struct brw_reg dst,
508 struct brw_reg arg0)
509 {
510 struct brw_compile *p = &c->func;
511
512 brw_MOV(p, dst, brw_imm_f(0));
513
514 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
515 brw_MOV(p, dst, brw_imm_f(-1.0));
516 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
517
518 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
519 brw_MOV(p, dst, brw_imm_f(1.0));
520 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
521 }
522
/* MAX: per-channel dst = max(arg0, arg1), via flag-predicated SEL. */
523 static void emit_max( struct brw_compile *p,
524 struct brw_reg dst,
525 struct brw_reg arg0,
526 struct brw_reg arg1 )
527 {
528 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
529 brw_SEL(p, dst, arg0, arg1);
530 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
531 }
532
/* MIN: per-channel dst = min(arg0, arg1), via flag-predicated SEL. */
533 static void emit_min( struct brw_compile *p,
534 struct brw_reg dst,
535 struct brw_reg arg0,
536 struct brw_reg arg1 )
537 {
538 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
539 brw_SEL(p, dst, arg0, arg1);
540 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
541 }
542
543
/* Emit a one-source mathbox operation (RCP/RSQ/EXP/LOG/...).  On
 * pre-gen6, SEND results cannot be safely writemasked or sent to
 * non-GRF destinations, so the result is staged through a temporary
 * and copied out with a MOV.
 */
544 static void emit_math1( struct brw_vs_compile *c,
545 GLuint function,
546 struct brw_reg dst,
547 struct brw_reg arg0,
548 GLuint precision)
549 {
550 /* There are various odd behaviours with SEND on the simulator. In
551 * addition there are documented issues with the fact that the GEN4
552 * processor doesn't do dependency control properly on SEND
553 * results. So, on balance, this kludge to get around failures
554 * with writemasked math results looks like it might be necessary
555 * whether that turns out to be a simulator bug or not:
556 */
557 struct brw_compile *p = &c->func;
558 struct intel_context *intel = &p->brw->intel;
559 struct brw_reg tmp = dst;
560 GLboolean need_tmp = (intel->gen < 6 &&
561 (dst.dw1.bits.writemask != 0xf ||
562 dst.file != BRW_GENERAL_REGISTER_FILE));
563
564 if (need_tmp)
565 tmp = get_tmp(c);
566
   /* NOTE(review): the literal 2 is presumably the message register
    * number for the math message payload -- confirm against brw_math()
    * in brw_eu_emit.c.
    */
567 brw_math(p,
568 tmp,
569 function,
570 BRW_MATH_SATURATE_NONE,
571 2,
572 arg0,
573 BRW_MATH_DATA_SCALAR,
574 precision);
575
576 if (need_tmp) {
577 brw_MOV(p, dst, tmp);
578 release_tmp(c, tmp);
579 }
580 }
581
582
/* Emit a two-source mathbox operation (e.g. POW).  The second operand
 * is placed in m3, the register following the m2 payload passed to
 * brw_math() below.  Same pre-gen6 writemask/destination staging as
 * emit_math1().
 */
583 static void emit_math2( struct brw_vs_compile *c,
584 GLuint function,
585 struct brw_reg dst,
586 struct brw_reg arg0,
587 struct brw_reg arg1,
588 GLuint precision)
589 {
590 struct brw_compile *p = &c->func;
591 struct intel_context *intel = &p->brw->intel;
592 struct brw_reg tmp = dst;
593 GLboolean need_tmp = (intel->gen < 6 &&
594 (dst.dw1.bits.writemask != 0xf ||
595 dst.file != BRW_GENERAL_REGISTER_FILE));
596
597 if (need_tmp)
598 tmp = get_tmp(c);
599
   /* Load the second operand into the message payload before the SEND. */
600 brw_MOV(p, brw_message_reg(3), arg1);
601
602 brw_math(p,
603 tmp,
604 function,
605 BRW_MATH_SATURATE_NONE,
606 2,
607 arg0,
608 BRW_MATH_DATA_SCALAR,
609 precision);
610
611 if (need_tmp) {
612 brw_MOV(p, dst, tmp);
613 release_tmp(c, tmp);
614 }
615 }
616
617
/* EXP opcode (the four-result ARB_vertex_program EXP, not a plain
 * exponential): result = { 2^floor(x), frac(x), 2^x, 1.0 }, with each
 * channel emitted only if enabled in dst's writemask.  Caller must
 * guarantee dst does not alias arg0 ("noalias").
 */
618 static void emit_exp_noalias( struct brw_vs_compile *c,
619 struct brw_reg dst,
620 struct brw_reg arg0 )
621 {
622 struct brw_compile *p = &c->func;
623
624
625 if (dst.dw1.bits.writemask & WRITEMASK_X) {
626 struct brw_reg tmp = get_tmp(c);
627 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
628
629 /* tmp_d = floor(arg0.x) */
630 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
631
632 /* result[0] = 2.0 ^ tmp */
633
634 /* Adjust exponent for floating point:
635 * exp += 127
636 */
637 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
638
639 /* Install exponent and sign.
640 * Excess drops off the edge:
641 */
       /* Shifting the biased exponent into bits 30:23 constructs the
        * IEEE-754 float 2^floor(x) directly in dst.x.
        */
642 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
643 tmp_d, brw_imm_d(23));
644
645 release_tmp(c, tmp);
646 }
647
648 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
649 /* result[1] = arg0.x - floor(arg0.x) */
650 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
651 }
652
653 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
654 /* As with the LOG instruction, we might be better off just
655 * doing a taylor expansion here, seeing as we have to do all
656 * the prep work.
657 *
658 * If mathbox partial precision is too low, consider also:
659 * result[3] = result[0] * EXP(result[1])
660 */
661 emit_math1(c,
662 BRW_MATH_FUNCTION_EXP,
663 brw_writemask(dst, WRITEMASK_Z),
664 brw_swizzle1(arg0, 0),
665 BRW_MATH_PRECISION_FULL);
666 }
667
668 if (dst.dw1.bits.writemask & WRITEMASK_W) {
669 /* result[3] = 1.0; */
670 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
671 }
672 }
673
674
/* LOG opcode (four-result ARB_vertex_program LOG):
 * result = { exponent(|x|), mantissa(|x|), log2(|x|), 1.0 }, emitted
 * per enabled writemask channel.  Works on the raw IEEE-754 bits of
 * arg0 reinterpreted as unsigned.  Caller must guarantee dst does not
 * alias arg0 ("noalias"); a temporary is used when dst is writemasked
 * or not a GRF because Y/Z intermediate channels are read back.
 */
675 static void emit_log_noalias( struct brw_vs_compile *c,
676 struct brw_reg dst,
677 struct brw_reg arg0 )
678 {
679 struct brw_compile *p = &c->func;
680 struct brw_reg tmp = dst;
681 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
682 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
683 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
684 dst.file != BRW_GENERAL_REGISTER_FILE);
685
686 if (need_tmp) {
687 tmp = get_tmp(c);
688 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
689 }
690
691 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
692 * according to spec:
693 *
694 * These almost look likey they could be joined up, but not really
695 * practical:
696 *
697 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
698 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
699 */
   /* X (and the Z path that consumes it): unbiased exponent.
    * Mask off the sign bit, shift the exponent field down, subtract
    * the IEEE-754 single-precision bias.
    */
700 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
701 brw_AND(p,
702 brw_writemask(tmp_ud, WRITEMASK_X),
703 brw_swizzle1(arg0_ud, 0),
704 brw_imm_ud((1U<<31)-1));
705
706 brw_SHR(p,
707 brw_writemask(tmp_ud, WRITEMASK_X),
708 tmp_ud,
709 brw_imm_ud(23));
710
711 brw_ADD(p,
712 brw_writemask(tmp, WRITEMASK_X),
713 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
714 brw_imm_d(-127));
715 }
716
   /* Y (and Z): mantissa in [1,2) -- keep the fraction bits and OR in
    * a zero (biased-127) exponent.
    */
717 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
718 brw_AND(p,
719 brw_writemask(tmp_ud, WRITEMASK_Y),
720 brw_swizzle1(arg0_ud, 0),
721 brw_imm_ud((1<<23)-1));
722
723 brw_OR(p,
724 brw_writemask(tmp_ud, WRITEMASK_Y),
725 tmp_ud,
726 brw_imm_ud(127<<23));
727 }
728
729 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
730 /* result[2] = result[0] + LOG2(result[1]); */
731
732 /* Why bother? The above is just a hint how to do this with a
733 * taylor series. Maybe we *should* use a taylor series as by
734 * the time all the above has been done it's almost certainly
735 * quicker than calling the mathbox, even with low precision.
736 *
737 * Options are:
738 * - result[0] + mathbox.LOG2(result[1])
739 * - mathbox.LOG2(arg0.x)
740 * - result[0] + inline_taylor_approx(result[1])
741 */
742 emit_math1(c,
743 BRW_MATH_FUNCTION_LOG,
744 brw_writemask(tmp, WRITEMASK_Z),
745 brw_swizzle1(tmp, 1),
746 BRW_MATH_PRECISION_FULL);
747
748 brw_ADD(p,
749 brw_writemask(tmp, WRITEMASK_Z),
750 brw_swizzle1(tmp, 2),
751 brw_swizzle1(tmp, 0));
752 }
753
754 if (dst.dw1.bits.writemask & WRITEMASK_W) {
755 /* result[3] = 1.0; */
756 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
757 }
758
759 if (need_tmp) {
760 brw_MOV(p, dst, tmp);
761 release_tmp(c, tmp);
762 }
763 }
764
765
766 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
767 */
/* DST opcode: result = { 1.0, arg0.y*arg1.y, arg0.z, arg1.w }, each
 * channel emitted only if enabled in dst's writemask.  Caller must
 * guarantee dst aliases neither source ("noalias").
 */
768 static void emit_dst_noalias( struct brw_vs_compile *c,
769 struct brw_reg dst,
770 struct brw_reg arg0,
771 struct brw_reg arg1)
772 {
773 struct brw_compile *p = &c->func;
774
775 /* There must be a better way to do this:
776 */
777 if (dst.dw1.bits.writemask & WRITEMASK_X)
778 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
779 if (dst.dw1.bits.writemask & WRITEMASK_Y)
780 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
781 if (dst.dw1.bits.writemask & WRITEMASK_Z)
782 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
783 if (dst.dw1.bits.writemask & WRITEMASK_W)
784 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
785 }
786
787
/* XPD: cross product dst.xyz = t x u, computed as
 * t.yzx*u.zxy - t.zxy*u.yzx using the accumulator: the MUL to null
 * leaves its product in the accumulator and MAC adds the (negated)
 * second product to it while writing dst.
 */
788 static void emit_xpd( struct brw_compile *p,
789 struct brw_reg dst,
790 struct brw_reg t,
791 struct brw_reg u)
792 {
793 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
794 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
795 }
796
797
/* LIT opcode: lighting coefficients
 * result = { 1, max(arg0.x,0), (arg0.x>0 ? max(arg0.y,0)^arg0.w : 0), 1 }.
 * Caller must guarantee dst does not alias arg0 ("noalias").
 */
798 static void emit_lit_noalias( struct brw_vs_compile *c,
799 struct brw_reg dst,
800 struct brw_reg arg0 )
801 {
802 struct brw_compile *p = &c->func;
803 struct brw_instruction *if_insn;
804 struct brw_reg tmp = dst;
805 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
806
807 if (need_tmp)
808 tmp = get_tmp(c);
809
   /* Defaults: y and z are 0 unless overwritten inside the IF below;
    * x and w are always 1.
    */
810 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
811 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
812
813 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
814 * to get all channels active inside the IF. In the clipping code
815 * we run with NoMask, so it's not an option and we can use
816 * BRW_EXECUTE_1 for all comparisions.
817 */
818 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
819 if_insn = brw_IF(p, BRW_EXECUTE_8);
820 {
821 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
822
       /* Clamp arg0.y to >= 0 in tmp.z (predicated copy of the positive
        * channels only), then raise it to the arg0.w power.
        */
823 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
824 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
825 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
826
827 emit_math2(c,
828 BRW_MATH_FUNCTION_POW,
829 brw_writemask(dst, WRITEMASK_Z),
830 brw_swizzle1(tmp, 2),
831 brw_swizzle1(arg0, 3),
832 BRW_MATH_PRECISION_PARTIAL);
833 }
834
835 brw_ENDIF(p, if_insn);
836
   /* NOTE(review): release_tmp() is called even when !need_tmp, i.e.
    * tmp == dst; it is a no-op unless dst.nr happens to equal the top
    * of the temp stack -- confirm that can't occur.
    */
837 release_tmp(c, tmp);
838 }
839
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Computed as dst = 1 - arg0, accumulator = dst * arg2 (MUL to null
 * leaves the product in the accumulator), then MAC folds in
 * arg0 * arg1.  Caller must guarantee dst aliases no source.
 */
840 static void emit_lrp_noalias(struct brw_vs_compile *c,
841 struct brw_reg dst,
842 struct brw_reg arg0,
843 struct brw_reg arg1,
844 struct brw_reg arg2)
845 {
846 struct brw_compile *p = &c->func;
847
848 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
849 brw_MUL(p, brw_null_reg(), dst, arg2);
850 brw_MAC(p, dst, arg0, arg1);
851 }
852
853 /** 3 or 4-component vector normalization */
854 static void emit_nrm( struct brw_vs_compile *c,
855 struct brw_reg dst,
856 struct brw_reg arg0,
857 int num_comps)
858 {
859 struct brw_compile *p = &c->func;
860 struct brw_reg tmp = get_tmp(c);
861
862 /* tmp = dot(arg0, arg0) */
863 if (num_comps == 3)
864 brw_DP3(p, tmp, arg0, arg0);
865 else
866 brw_DP4(p, tmp, arg0, arg0);
867
868 /* tmp = 1 / sqrt(tmp) */
869 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
870
871 /* dst = arg0 * tmp */
872 brw_MUL(p, dst, arg0, tmp);
873
874 release_tmp(c, tmp);
875 }
876
877
/* Fetch a non-reladdr constant from the pull constant buffer into the
 * per-argument scratch register allocated in brw_vs_alloc_regs(),
 * skipping the read when that slot already holds the wanted constant.
 * Returns the scratch register with the vec4 replicated to both
 * halves.
 */
878 static struct brw_reg
879 get_constant(struct brw_vs_compile *c,
880 const struct prog_instruction *inst,
881 GLuint argIndex)
882 {
883 const struct prog_src_register *src = &inst->SrcReg[argIndex];
884 struct brw_compile *p = &c->func;
885 struct brw_reg const_reg = c->current_const[argIndex].reg;
886
887 assert(argIndex < 3);
888
889 if (c->current_const[argIndex].index != src->Index) {
890 /* Keep track of the last constant loaded in this slot, for reuse. */
891 c->current_const[argIndex].index = src->Index;
892
893 #if 0
894 printf(" fetch const[%d] for arg %d into reg %d\n",
895 src->Index, argIndex, c->current_const[argIndex].reg.nr);
896 #endif
897 /* need to fetch the constant now */
898 brw_dp_READ_4_vs(p,
899 const_reg, /* writeback dest */
900 16 * src->Index, /* byte offset */
901 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
902 );
903 }
904
905 /* replicate lower four floats into upper half (to get XYZWXYZW) */
906 const_reg = stride(const_reg, 0, 4, 0);
907 const_reg.subnr = 0;
908
909 return const_reg;
910 }
911
/* Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer into the per-argument scratch register.  The
 * address register value is scaled to a byte offset first.  Unlike
 * get_constant(), the cached index is invalidated since the effective
 * address depends on runtime a0 contents.
 */
912 static struct brw_reg
913 get_reladdr_constant(struct brw_vs_compile *c,
914 const struct prog_instruction *inst,
915 GLuint argIndex)
916 {
917 const struct prog_src_register *src = &inst->SrcReg[argIndex];
918 struct brw_compile *p = &c->func;
919 struct brw_reg const_reg = c->current_const[argIndex].reg;
920 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
921 struct brw_reg byte_addr_reg = get_tmp(c);
922
923 assert(argIndex < 3);
924
925 /* Can't reuse a reladdr constant load. */
926 c->current_const[argIndex].index = -1;
927
928 #if 0
929 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
930 src->Index, argIndex, c->current_const[argIndex].reg.nr);
931 #endif
932
   /* a0 holds a vec4 element index; scale to bytes (16 bytes/vec4). */
933 brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
934
935 /* fetch the first vec4 */
936 brw_dp_READ_4_vs_relative(p,
937 const_reg, /* writeback dest */
938 byte_addr_reg, /* address register */
939 16 * src->Index, /* byte offset */
940 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
941 );
942
   /* NOTE(review): byte_addr_reg is never passed to release_tmp(), so
    * the temporary stays allocated until release_tmps() -- confirm
    * whether releasing it here after the send is emitted would be safe.
    */
943 return const_reg;
944 }
945
946
947
948 /* TODO: relative addressing!
949 */
950 static struct brw_reg get_reg( struct brw_vs_compile *c,
951 gl_register_file file,
952 GLuint index )
953 {
954 switch (file) {
955 case PROGRAM_TEMPORARY:
956 case PROGRAM_INPUT:
957 case PROGRAM_OUTPUT:
958 assert(c->regs[file][index].nr != 0);
959 return c->regs[file][index];
960 case PROGRAM_STATE_VAR:
961 case PROGRAM_CONSTANT:
962 case PROGRAM_UNIFORM:
963 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
964 return c->regs[PROGRAM_STATE_VAR][index];
965 case PROGRAM_ADDRESS:
966 assert(index == 0);
967 return c->regs[file][index];
968
969 case PROGRAM_UNDEFINED: /* undef values */
970 return brw_null_reg();
971
972 case PROGRAM_LOCAL_PARAM:
973 case PROGRAM_ENV_PARAM:
974 case PROGRAM_WRITE_ONLY:
975 default:
976 assert(0);
977 return brw_null_reg();
978 }
979 }
980
981
982 /**
983 * Indirect addressing: get reg[[arg] + offset].
984 */
985 static struct brw_reg deref( struct brw_vs_compile *c,
986 struct brw_reg arg,
987 GLint offset,
988 GLuint reg_size )
989 {
990 struct brw_compile *p = &c->func;
991 struct brw_reg tmp = get_tmp(c);
992 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
993 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
994 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
995 struct brw_reg indirect = brw_vec4_indirect(0,0);
996 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
997
998 /* Set the vertical stride on the register access so that the first
999 * 4 components come from a0.0 and the second 4 from a0.1.
1000 */
1001 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1002
1003 {
1004 brw_push_insn_state(p);
1005 brw_set_access_mode(p, BRW_ALIGN_1);
1006
    /* a0.0 = base byte offset + first vertex's index * reg_size */
1007 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1008 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1009
    /* a0.1 = same for the second vertex (address component 4) */
1010 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1011 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1012
    /* Gather both vertices' vec4s through the indirect access. */
1013 brw_MOV(p, tmp, indirect);
1014
1015 brw_pop_insn_state(p);
1016 }
1017
1018 /* NOTE: tmp not released */
   /* NOTE(review): 'acc' (a second get_tmp) is also never released; it
    * stays allocated until release_tmps() -- confirm this is intended.
    */
1019 return tmp;
1020 }
1021
/* Store 'val' to a relative-addressed destination register
 * (DstReg.File[DstReg.Index + a0]).  The write is done in two halves
 * through the a0.0 indirect: low 16 bytes for the first vertex's
 * address, high 16 bytes (suboffset 4 of both address and value) for
 * the second.
 */
1022 static void
1023 move_to_reladdr_dst(struct brw_vs_compile *c,
1024 const struct prog_instruction *inst,
1025 struct brw_reg val)
1026 {
1027 struct brw_compile *p = &c->func;
1028 int reg_size = 32;
1029 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1030 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1031 struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
1032 GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
1033 struct brw_reg indirect = brw_vec4_indirect(0,0);
1034 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1035
1036 byte_offset += inst->DstReg.Index * reg_size;
1037
1038 brw_push_insn_state(p);
1039 brw_set_access_mode(p, BRW_ALIGN_1);
1040
   /* First vertex: a0.0 = byte_offset + index * reg_size */
1041 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1042 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1043 brw_MOV(p, indirect, val);
1044
   /* Second vertex lives in the upper half of the register pair. */
1045 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1046 brw_ADD(p, brw_address_reg(0), acc,
1047 brw_imm_uw(byte_offset + reg_size / 2));
1048 brw_MOV(p, indirect, suboffset(val, 4));
1049
   /* NOTE(review): 'acc' is never released; it stays allocated until
    * release_tmps() -- confirm this is intended.
    */
1050 brw_pop_insn_state(p);
1051 }
1052
1053 /**
1054 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1055 * TODO: relative addressing!
1056 */
1057 static struct brw_reg
1058 get_src_reg( struct brw_vs_compile *c,
1059 const struct prog_instruction *inst,
1060 GLuint argIndex )
1061 {
1062 const GLuint file = inst->SrcReg[argIndex].File;
1063 const GLint index = inst->SrcReg[argIndex].Index;
1064 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1065
   /* Fast path: certain sources (0.0, +/-1.0 swizzles, or a uniformly
    * swizzled PROGRAM_CONSTANT whose value is known at compile time)
    * can be turned into inline immediate floats, skipping constant
    * register allocation entirely.
    */
1066 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1067 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1068
1069 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1070 SWIZZLE_ZERO,
1071 SWIZZLE_ZERO,
1072 SWIZZLE_ZERO)) {
1073 return brw_imm_f(0.0f);
1074 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1075 SWIZZLE_ONE,
1076 SWIZZLE_ONE,
1077 SWIZZLE_ONE)) {
1078 if (src->Negate)
1079 return brw_imm_f(-1.0F);
1080 else
1081 return brw_imm_f(1.0F);
1082 } else if (src->File == PROGRAM_CONSTANT) {
1083 const struct gl_program_parameter_list *params;
1084 float f;
1085 int component = -1;
1086
       /* Only a uniform .xxxx/.yyyy/.zzzz/.wwww swizzle maps to a
        * single scalar we can inline; otherwise component stays -1
        * and we fall through to normal register lookup.
        */
1087 switch (src->Swizzle) {
1088 case SWIZZLE_XXXX:
1089 component = 0;
1090 break;
1091 case SWIZZLE_YYYY:
1092 component = 1;
1093 break;
1094 case SWIZZLE_ZZZZ:
1095 component = 2;
1096 break;
1097 case SWIZZLE_WWWW:
1098 component = 3;
1099 break;
1100 }
1101
1102 if (component >= 0) {
1103 params = c->vp->program.Base.Parameters;
1104 f = params->ParameterValues[src->Index][component];
1105
        /* Fold Abs/Negate modifiers into the immediate itself. */
1106 if (src->Abs)
1107 f = fabs(f);
1108 if (src->Negate)
1109 f = -f;
1110 return brw_imm_f(f);
1111 }
1112 }
1113 }
1114
1115 switch (file) {
1116 case PROGRAM_TEMPORARY:
1117 case PROGRAM_INPUT:
1118 case PROGRAM_OUTPUT:
1119 if (relAddr) {
1120 return deref(c, c->regs[file][0], index, 32);
1121 }
1122 else {
1123 assert(c->regs[file][index].nr != 0);
1124 return c->regs[file][index];
1125 }
1126
1127 case PROGRAM_STATE_VAR:
1128 case PROGRAM_CONSTANT:
1129 case PROGRAM_UNIFORM:
1130 case PROGRAM_ENV_PARAM:
1131 case PROGRAM_LOCAL_PARAM:
    /* Pull-constant mode: use the push-mapped register when this
     * constant made it into the push window, otherwise issue a
     * dataport read (relative or absolute).
     */
1132 if (c->vp->use_const_buffer) {
1133 if (!relAddr && c->constant_map[index] != -1) {
1134 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1135 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1136 } else if (relAddr)
1137 return get_reladdr_constant(c, inst, argIndex);
1138 else
1139 return get_constant(c, inst, argIndex);
1140 }
1141 else if (relAddr) {
     /* reg_size 16: push constants pack two vec4s per GRF. */
1142 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
1143 }
1144 else {
1145 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1146 return c->regs[PROGRAM_STATE_VAR][index];
1147 }
1148 case PROGRAM_ADDRESS:
1149 assert(index == 0);
1150 return c->regs[file][index];
1151
1152 case PROGRAM_UNDEFINED:
1153 /* this is a normal case since we loop over all three src args */
1154 return brw_null_reg();
1155
1156 case PROGRAM_WRITE_ONLY:
1157 default:
1158 assert(0);
1159 return brw_null_reg();
1160 }
1161 }
1162
1163 /**
1164 * Return the brw reg for the given instruction's src argument.
1165 * Will return mangled results for SWZ op. The emit_swz() function
1166 * ignores this result and recalculates taking extended swizzles into
1167 * account.
1168 */
1169 static struct brw_reg get_arg( struct brw_vs_compile *c,
1170 const struct prog_instruction *inst,
1171 GLuint argIndex )
1172 {
1173 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1174 struct brw_reg reg;
1175
1176 if (src->File == PROGRAM_UNDEFINED)
1177 return brw_null_reg();
1178
1179 reg = get_src_reg(c, inst, argIndex);
1180
1181 /* Convert 3-bit swizzle to 2-bit.
1182 */
1183 if (reg.file != BRW_IMMEDIATE_VALUE) {
1184 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1185 GET_SWZ(src->Swizzle, 1),
1186 GET_SWZ(src->Swizzle, 2),
1187 GET_SWZ(src->Swizzle, 3));
1188 }
1189
1190 /* Note this is ok for non-swizzle instructions:
1191 */
1192 reg.negate = src->Negate ? 1 : 0;
1193
1194 return reg;
1195 }
1196
1197
1198 /**
1199 * Get brw register for the given program dest register.
1200 */
1201 static struct brw_reg get_dst( struct brw_vs_compile *c,
1202 struct prog_dst_register dst )
1203 {
1204 struct brw_reg reg;
1205
1206 switch (dst.File) {
1207 case PROGRAM_TEMPORARY:
1208 case PROGRAM_OUTPUT:
1209 /* register-indirect addressing is only 1x1, not VxH, for
1210 * destination regs. So, for RelAddr we'll return a temporary
1211 * for the dest and do a move of the result to the RelAddr
1212 * register after the instruction emit.
1213 */
1214 if (dst.RelAddr) {
1215 reg = get_tmp(c);
1216 } else {
1217 assert(c->regs[dst.File][dst.Index].nr != 0);
1218 reg = c->regs[dst.File][dst.Index];
1219 }
1220 break;
1221 case PROGRAM_ADDRESS:
1222 assert(dst.Index == 0);
1223 reg = c->regs[dst.File][dst.Index];
1224 break;
1225 case PROGRAM_UNDEFINED:
1226 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1227 reg = brw_null_reg();
1228 break;
1229 default:
1230 assert(0);
1231 reg = brw_null_reg();
1232 }
1233
1234 assert(reg.type != BRW_IMMEDIATE_VALUE);
1235 reg.dw1.bits.writemask = dst.WriteMask;
1236
1237 return reg;
1238 }
1239
1240
/**
 * Emit code for OPCODE_SWZ, which supports the extended swizzle values
 * SWIZZLE_ZERO and SWIZZLE_ONE in addition to plain component selects,
 * plus per-channel negation.
 *
 * The channels enabled in dst's writemask are partitioned into three
 * groups (real source components, constant zeros, constant ones) and
 * each group is written with a separate MOV.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* A temporary is needed when negation applies and dst is not a GRF
    * (e.g. a message register), since negate(tmp) reads tmp back.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle kind. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask here, reused as the writemask
    * so only the negated channels are flipped in place.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1309
1310
1311 /**
1312 * Post-vertex-program processing. Send the results to the URB.
1313 */
1314 static void emit_vertex_write( struct brw_vs_compile *c)
1315 {
1316 struct brw_compile *p = &c->func;
1317 struct brw_context *brw = p->brw;
1318 struct intel_context *intel = &brw->intel;
1319 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1320 struct brw_reg ndc;
1321 int eot;
1322 GLuint len_vertex_header = 2;
1323
1324 if (c->key.copy_edgeflag) {
1325 brw_MOV(p,
1326 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1327 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1328 }
1329
1330 if (intel->gen < 6) {
1331 /* Build ndc coords */
1332 ndc = get_tmp(c);
1333 /* ndc = 1.0 / pos.w */
1334 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1335 /* ndc.xyz = pos * ndc */
1336 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1337 }
1338
1339 /* Update the header for point size, user clipping flags, and -ve rhw
1340 * workaround.
1341 */
1342 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1343 c->key.nr_userclip || brw->has_negative_rhw_bug)
1344 {
1345 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1346 GLuint i;
1347
1348 brw_MOV(p, header1, brw_imm_ud(0));
1349
1350 brw_set_access_mode(p, BRW_ALIGN_16);
1351
1352 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1353 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1354 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1355 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1356 }
1357
1358 for (i = 0; i < c->key.nr_userclip; i++) {
1359 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1360 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1361 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1362 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1363 }
1364
1365 /* i965 clipping workaround:
1366 * 1) Test for -ve rhw
1367 * 2) If set,
1368 * set ndc = (0,0,0,0)
1369 * set ucp[6] = 1
1370 *
1371 * Later, clipping will detect ucp[6] and ensure the primitive is
1372 * clipped against all fixed planes.
1373 */
1374 if (brw->has_negative_rhw_bug) {
1375 brw_CMP(p,
1376 vec8(brw_null_reg()),
1377 BRW_CONDITIONAL_L,
1378 brw_swizzle1(ndc, 3),
1379 brw_imm_f(0));
1380
1381 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1382 brw_MOV(p, ndc, brw_imm_f(0));
1383 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1384 }
1385
1386 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1387 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1388 brw_set_access_mode(p, BRW_ALIGN_16);
1389
1390 release_tmp(c, header1);
1391 }
1392 else {
1393 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1394 }
1395
1396 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1397 * of zeros followed by two sets of NDC coordinates:
1398 */
1399 brw_set_access_mode(p, BRW_ALIGN_1);
1400 brw_set_acc_write_control(p, 0);
1401
1402 /* The VUE layout is documented in Volume 2a. */
1403 if (intel->gen >= 6) {
1404 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1405 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1406 * dword 4-7 (m2) is the 4D space position
1407 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1408 * enabled. We don't use it, so skip it.
1409 * m3 is the first vertex element data we fill, which is the vertex
1410 * position.
1411 */
1412 brw_MOV(p, brw_message_reg(2), pos);
1413 brw_MOV(p, brw_message_reg(3), pos);
1414 len_vertex_header = 2;
1415 } else if (intel->gen == 5) {
1416 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1417 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1418 * dword 4-7 (m2) is the ndc position (set above)
1419 * dword 8-11 (m3) of the vertex header is the 4D space position
1420 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1421 * m6 is a pad so that the vertex element data is aligned
1422 * m7 is the first vertex data we fill, which is the vertex position.
1423 */
1424 brw_MOV(p, brw_message_reg(2), ndc);
1425 brw_MOV(p, brw_message_reg(3), pos);
1426 brw_MOV(p, brw_message_reg(7), pos);
1427 len_vertex_header = 6;
1428 } else {
1429 /* There are 8 dwords in VUE header pre-Ironlake:
1430 * dword 0-3 (m1) is indices, point width, clip flags.
1431 * dword 4-7 (m2) is ndc position (set above)
1432 *
1433 * dword 8-11 (m3) is the first vertex data, which we always have be the
1434 * vertex position.
1435 */
1436 brw_MOV(p, brw_message_reg(2), ndc);
1437 brw_MOV(p, brw_message_reg(3), pos);
1438 len_vertex_header = 2;
1439 }
1440
1441 eot = (c->first_overflow_output == 0);
1442
1443 brw_urb_WRITE(p,
1444 brw_null_reg(), /* dest */
1445 0, /* starting mrf reg nr */
1446 c->r0, /* src */
1447 0, /* allocate */
1448 1, /* used */
1449 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1450 0, /* response len */
1451 eot, /* eot */
1452 eot, /* writes complete */
1453 0, /* urb destination offset */
1454 BRW_URB_SWIZZLE_INTERLEAVE);
1455
1456 if (c->first_overflow_output > 0) {
1457 /* Not all of the vertex outputs/results fit into the MRF.
1458 * Move the overflowed attributes from the GRF to the MRF and
1459 * issue another brw_urb_WRITE().
1460 */
1461 GLuint i, mrf = 1;
1462 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1463 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1464 /* move from GRF to MRF */
1465 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1466 mrf++;
1467 }
1468 }
1469
1470 brw_urb_WRITE(p,
1471 brw_null_reg(), /* dest */
1472 0, /* starting mrf reg nr */
1473 c->r0, /* src */
1474 0, /* allocate */
1475 1, /* used */
1476 mrf, /* msg len */
1477 0, /* response len */
1478 1, /* eot */
1479 1, /* writes complete */
1480 14 / 2, /* urb destination offset */
1481 BRW_URB_SWIZZLE_INTERLEAVE);
1482 }
1483 }
1484
1485 static GLboolean
1486 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1487 {
1488 struct brw_compile *p = &c->func;
1489 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1490
1491 if (p->nr_insn == 0)
1492 return GL_FALSE;
1493
1494 if (val.address_mode != BRW_ADDRESS_DIRECT)
1495 return GL_FALSE;
1496
1497 switch (prev_insn->header.opcode) {
1498 case BRW_OPCODE_MOV:
1499 case BRW_OPCODE_MAC:
1500 case BRW_OPCODE_MUL:
1501 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1502 prev_insn->header.execution_size == val.width &&
1503 prev_insn->bits1.da1.dest_reg_file == val.file &&
1504 prev_insn->bits1.da1.dest_reg_type == val.type &&
1505 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1506 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1507 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1508 prev_insn->bits1.da16.dest_writemask == 0xf)
1509 return GL_TRUE;
1510 else
1511 return GL_FALSE;
1512 default:
1513 return GL_FALSE;
1514 }
1515 }
1516
1517 static uint32_t
1518 get_predicate(const struct prog_instruction *inst)
1519 {
1520 if (inst->DstReg.CondMask == COND_TR)
1521 return BRW_PREDICATE_NONE;
1522
1523 /* All of GLSL only produces predicates for COND_NE and one channel per
1524 * vector. Fail badly if someone starts doing something else, as it might
1525 * mean infinite looping or something.
1526 *
1527 * We'd like to support all the condition codes, but our hardware doesn't
1528 * quite match the Mesa IR, which is modeled after the NV extensions. For
1529 * those, the instruction may update the condition codes or not, then any
1530 * later instruction may use one of those condition codes. For gen4, the
1531 * instruction may update the flags register based on one of the condition
1532 * codes output by the instruction, and then further instructions may
1533 * predicate on that. We can probably support this, but it won't
1534 * necessarily be easy.
1535 */
1536 assert(inst->DstReg.CondMask == COND_NE);
1537
1538 switch (inst->DstReg.CondSwizzle) {
1539 case SWIZZLE_XXXX:
1540 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1541 case SWIZZLE_YYYY:
1542 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1543 case SWIZZLE_ZZZZ:
1544 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1545 case SWIZZLE_WWWW:
1546 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1547 default:
1548 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1549 inst->DstReg.CondMask);
1550 return BRW_PREDICATE_NORMAL;
1551 }
1552 }
1553
/* Emit the vertex program instructions here.
 *
 * Three phases: (1) pre-scan the Mesa IR to mark outputs read as sources
 * and to detect CAL/RET (which need a call stack), (2) statically allocate
 * hardware registers, (3) translate each Mesa instruction to gen4 EU code,
 * patching up IF/ELSE/ENDIF and loop BREAK/CONT jump targets as we go.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Number of open IFs inside the current loop level, needed for the
    * pop_count on BREAK/CONT instructions.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* Phase 1: pre-scan for output-as-source usage and subroutine calls. */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	   c->needs_stack = GL_TRUE;
	   break;
       default:
	   break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Phase 3: translate each Mesa IR instruction. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;
      struct brw_instruction *temp;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	  for (i = 0; i < 3; i++) {
	      const struct prog_src_register *src = &inst->SrcReg[i];
	      index = src->Index;
	      file = src->File;
	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
		  args[i] = c->output_regs[index].reg;
	      else
                  args[i] = get_arg(c, inst, i);
	  }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	  dst = c->output_regs[index].reg;
      else
	  dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* Address-register load: round-to-negative-infinity, same as FLR. */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD = acc-load + MAC; skip the load if acc already holds arg2. */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
	 unalias2(c, dst, args[0], args[1], emit_seq);
	 break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
	 unalias2(c, dst, args[0], args[1], emit_sne);
	 break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
	 unalias2(c, dst, args[0], args[1], emit_sgt);
	 break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
	 unalias2(c, dst, args[0], args[1], emit_sle);
	 break;
      case OPCODE_SSG:
	 unalias1(c, dst, args[0], emit_sign);
	 break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
	 /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
	 break;
      case OPCODE_BRK:
	 /* pop_count unwinds the IFs opened inside this loop level. */
	 brw_set_predicate_control(p, get_predicate(inst));
	 temp = brw_BREAK(p);
	 temp->bits3.if_else.pop_count = if_depth_in_loop[loop_depth];
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 temp = brw_CONT(p);
	 temp->bits3.if_else.pop_count = if_depth_in_loop[loop_depth];
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_ENDLOOP:
	 {
	    struct brw_instruction *inst0, *inst1;
	    /* Jump counts are in units of 8 bytes pre-Ironlake, 16 bytes
	     * (br == 2) on Ironlake.
	     */
	    GLuint br = 1;

	    loop_depth--;

	    if (intel->gen == 5)
	       br = 2;

	    inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
	    while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
	 break;
      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CAL:
	 /* Push the return IP onto the software stack, bump the stack
	  * pointer by 4 bytes, then branch; the actual target is resolved
	  * later by brw_resolve_cals().
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 break;
      case OPCODE_RET:
	 /* Pop the return IP off the software stack and jump to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
	 break;
      case OPCODE_PRINT:
	 /* no-op */
	 break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
	 break;
      case OPCODE_ENDSUB:
	 /* no-op */
	 break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* If an output was written through its GRF shadow (because it is also
       * read as a source), propagate the value to the real output register.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
	 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
	  * compute-to-mrf and the fact that we are allocating
	  * registers for only the used PROGRAM_OUTPUTs.
	  */
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   if (INTEL_DEBUG & DEBUG_VS) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}