src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "program/program.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40 /* Return the SrcReg index of the channels that can be immediate float operands
  41  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  42  */
  43 static GLboolean
  44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  45 {
  46    int opcode_array[] = {
  47       [OPCODE_MOV] = 1,
  48       [OPCODE_ADD] = 2,
  49       [OPCODE_CMP] = 3,
  50       [OPCODE_DP3] = 2,
  51       [OPCODE_DP4] = 2,
  52       [OPCODE_DPH] = 2,
  53       [OPCODE_MAX] = 2,
  54       [OPCODE_MIN] = 2,
  55       [OPCODE_MUL] = 2,
  56       [OPCODE_SEQ] = 2,
  57       [OPCODE_SGE] = 2,
  58       [OPCODE_SGT] = 2,
  59       [OPCODE_SLE] = 2,
  60       [OPCODE_SLT] = 2,
  61       [OPCODE_SNE] = 2,
  62       [OPCODE_XPD] = 2,
  63    };
  64
  65    /* These opcodes get broken down in a way that allow two
  66     * args to be immediates.
  67     */
  68    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  69       if (arg == 1 || arg == 2)
  70          return GL_TRUE;
  71    }
  72
  73    if (opcode > ARRAY_SIZE(opcode_array))
  74       return GL_FALSE;
  75
  76    return arg == opcode_array[opcode] - 1;
  77 }
  78
  79 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  80 {
  81    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  82
  83    if (++c->last_tmp > c->prog_data.total_grf)
  84       c->prog_data.total_grf = c->last_tmp;
  85
  86    return tmp;
  87 }
  88
  89 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  90 {
  91    if (tmp.nr == c->last_tmp-1)
  92       c->last_tmp--;
  93 }
  94
  95 static void release_tmps( struct brw_vs_compile *c )
  96 {
  97    c->last_tmp = c->first_tmp;
  98 }
  99
 100
 101 /**
 102  * Preallocate GRF register before code emit.
 103  * Do things as simply as possible.  Allocate and populate all regs
 104  * ahead of time.
 105  */
 106 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 107 {
 108    struct intel_context *intel = &c->func.brw->intel;
 109    GLuint i, reg = 0, mrf;
 110    int attributes_in_vue;
 111
 112    /* Determine whether to use a real constant buffer or use a block
 113     * of GRF registers for constants.  The later is faster but only
 114     * works if everything fits in the GRF.
 115     * XXX this heuristic/check may need some fine tuning...
 116     */
 117    if (c->vp->program.Base.Parameters->NumParameters +
 118        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
 119       c->vp->use_const_buffer = GL_TRUE;
 120    else
 121       c->vp->use_const_buffer = GL_FALSE;
 122
 123    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 124
 125    /* r0 -- reserved as usual
 126     */
 127    c->r0 = brw_vec8_grf(reg, 0);
 128    reg++;
 129
 130    /* User clip planes from curbe:
 131     */
 132    if (c->key.nr_userclip) {
 133       for (i = 0; i < c->key.nr_userclip; i++) {
 134          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
 135       }
 136
 137       /* Deal with curbe alignment:
 138        */
 139       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
 140    }
 141
 142    /* Vertex program parameters from curbe:
 143     */
 144    if (c->vp->use_const_buffer) {
 145       int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
 146       int constant = 0;
 147
 148       /* We've got more constants than we can load with the push
 149        * mechanism.  This is often correlated with reladdr loads where
 150        * we should probably be using a pull mechanism anyway to avoid
 151        * excessive reading.  However, the pull mechanism is slow in
 152        * general.  So, we try to allocate as many non-reladdr-loaded
 153        * constants through the push buffer as we can before giving up.
 154        */
 155       memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
 156       for (i = 0;
 157            i < c->vp->program.Base.NumInstructions && constant < max_constant;
 158            i++) {
 159          struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
 160          int arg;
 161
 162          for (arg = 0; arg < 3 && constant < max_constant; arg++) {
 163             if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
 164                  inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
 165                  inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
 166                  inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
 167                  inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
 168                 inst->SrcReg[arg].RelAddr)
 169                continue;
 170
 171             if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
 172                c->constant_map[inst->SrcReg[arg].Index] = constant++;
 173             }
 174          }
 175       }
 176
 177       for (i = 0; i < constant; i++) {
 178          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
 179                                                               (i%2) * 4),
 180                                                  0, 4, 1);
 181       }
 182       reg += (constant + 1) / 2;
 183       c->prog_data.curb_read_length = reg - 1;
 184       /* XXX 0 causes a bug elsewhere... */
 185       c->prog_data.nr_params = MAX2(constant * 4, 4);
 186    }
 187    else {
 188       /* use a section of the GRF for constants */
 189       GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
 190       for (i = 0; i < nr_params; i++) {
 191          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 192       }
 193       reg += (nr_params + 1) / 2;
 194       c->prog_data.curb_read_length = reg - 1;
 195
 196       c->prog_data.nr_params = nr_params * 4;
 197    }
 198
 199    /* Allocate input regs:
 200     */
 201    c->nr_inputs = 0;
 202    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 203       if (c->prog_data.inputs_read & (1 << i)) {
 204          c->nr_inputs++;
 205          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 206          reg++;
 207       }
 208    }
 209    /* If there are no inputs, we'll still be reading one attribute's worth
 210     * because it's required -- see urb_read_length setting.
 211     */
 212    if (c->nr_inputs == 0)
 213       reg++;
 214
 215    /* Allocate outputs.  The non-position outputs go straight into message regs.
 216     */
 217    c->nr_outputs = 0;
 218    c->first_output = reg;
 219    c->first_overflow_output = 0;
 220
 221    if (intel->gen >= 6)
 222       mrf = 4;
 223    else if (intel->gen == 5)
 224       mrf = 8;
 225    else
 226       mrf = 4;
 227
 228    for (i = 0; i < VERT_RESULT_MAX; i++) {
 229       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 230          c->nr_outputs++;
 231          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 232          if (i == VERT_RESULT_HPOS) {
 233             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 234             reg++;
 235          }
 236          else if (i == VERT_RESULT_PSIZ) {
 237             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 238             reg++;
 239             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 240          }
 241          else {
 242             /* Two restrictions on our compute-to-MRF here.  The
 243              * message length for all SEND messages is restricted to
 244              * [1,15], so we can't use mrf 15, as that means a length
 245              * of 16.
 246              *
 247              * Additionally, URB writes are aligned to URB rows, so we
 248              * need to put an even number of registers of URB data in
 249              * each URB write so that the later write is aligned.  A
 250              * message length of 15 means 1 message header reg plus 14
 251              * regs of URB data.
 252              *
 253              * For attributes beyond the compute-to-MRF, we compute to
 254              * GRFs and they will be written in the second URB_WRITE.
 255              */
 256             if (mrf < 15) {
 257                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 258                mrf++;
 259             }
 260             else {
 261                if (!c->first_overflow_output)
 262                   c->first_overflow_output = i;
 263                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 264                reg++;
 265             }
 266          }
 267       }
 268    }
 269
 270    /* Allocate program temporaries:
 271     */
 272    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 273       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 274       reg++;
 275    }
 276
 277    /* Address reg(s).  Don't try to use the internal address reg until
 278     * deref time.
 279     */
 280    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 281       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 282                                              reg,
 283                                              0,
 284                                              BRW_REGISTER_TYPE_D,
 285                                              BRW_VERTICAL_STRIDE_8,
 286                                              BRW_WIDTH_8,
 287                                              BRW_HORIZONTAL_STRIDE_1,
 288                                              BRW_SWIZZLE_XXXX,
 289                                              WRITEMASK_X);
 290       reg++;
 291    }
 292
 293    if (c->vp->use_const_buffer) {
 294       for (i = 0; i < 3; i++) {
 295          c->current_const[i].index = -1;
 296          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 297          reg++;
 298       }
 299    }
 300
 301    for (i = 0; i < 128; i++) {
 302       if (c->output_regs[i].used_in_src) {
 303          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 304          reg++;
 305       }
 306    }
 307
 308    if (c->needs_stack) {
 309       c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 310       reg += 2;
 311    }
 312
 313    /* Some opcodes need an internal temporary:
 314     */
 315    c->first_tmp = reg;
 316    c->last_tmp = reg;           /* for allocation purposes */
 317
 318    /* Each input reg holds data from two vertices.  The
 319     * urb_read_length is the number of registers read from *each*
 320     * vertex urb, so is half the amount:
 321     */
 322    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 323    /* Setting this field to 0 leads to undefined behavior according to the
 324     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 325     * sitting in them, even if it's padding.
 326     */
 327    if (c->prog_data.urb_read_length == 0)
 328       c->prog_data.urb_read_length = 1;
 329
 330    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 331     * them to fit the biggest thing they need to.
 332     */
 333    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 334
 335    /* See emit_vertex_write() for where the VUE's overhead on top of the
 336     * attributes comes from.
 337     */
 338    if (intel->gen >= 6)
 339       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
 340    else if (intel->gen == 5)
 341       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
 342    else
 343       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 344
 345    c->prog_data.total_grf = reg;
 346
 347    if (INTEL_DEBUG & DEBUG_VS) {
 348       printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 349       printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 350       printf("%s reg = %d\n", __FUNCTION__, reg);
 351    }
 352 }
 353
 354
 355 /**
 356  * If an instruction uses a temp reg both as a src and the dest, we
 357  * sometimes need to allocate an intermediate temporary.
 358  */
 359 static void unalias1( struct brw_vs_compile *c,
 360                       struct brw_reg dst,
 361                       struct brw_reg arg0,
 362                       void (*func)( struct brw_vs_compile *,
 363                                     struct brw_reg,
 364                                     struct brw_reg ))
 365 {
 366    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 367       struct brw_compile *p = &c->func;
 368       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 369       func(c, tmp, arg0);
 370       brw_MOV(p, dst, tmp);
 371       release_tmp(c, tmp);
 372    }
 373    else {
 374       func(c, dst, arg0);
 375    }
 376 }
 377
 378 /**
 379  * \sa unalias2
 380  * Checkes if 2-operand instruction needs an intermediate temporary.
 381  */
 382 static void unalias2( struct brw_vs_compile *c,
 383                       struct brw_reg dst,
 384                       struct brw_reg arg0,
 385                       struct brw_reg arg1,
 386                       void (*func)( struct brw_vs_compile *,
 387                                     struct brw_reg,
 388                                     struct brw_reg,
 389                                     struct brw_reg ))
 390 {
 391    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 392        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 393       struct brw_compile *p = &c->func;
 394       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 395       func(c, tmp, arg0, arg1);
 396       brw_MOV(p, dst, tmp);
 397       release_tmp(c, tmp);
 398    }
 399    else {
 400       func(c, dst, arg0, arg1);
 401    }
 402 }
 403
 404 /**
 405  * \sa unalias2
 406  * Checkes if 3-operand instruction needs an intermediate temporary.
 407  */
 408 static void unalias3( struct brw_vs_compile *c,
 409                       struct brw_reg dst,
 410                       struct brw_reg arg0,
 411                       struct brw_reg arg1,
 412                       struct brw_reg arg2,
 413                       void (*func)( struct brw_vs_compile *,
 414                                     struct brw_reg,
 415                                     struct brw_reg,
 416                                     struct brw_reg,
 417                                     struct brw_reg ))
 418 {
 419    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 420        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 421        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 422       struct brw_compile *p = &c->func;
 423       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 424       func(c, tmp, arg0, arg1, arg2);
 425       brw_MOV(p, dst, tmp);
 426       release_tmp(c, tmp);
 427    }
 428    else {
 429       func(c, dst, arg0, arg1, arg2);
 430    }
 431 }
 432
 433 static void emit_sop( struct brw_vs_compile *c,
 434                       struct brw_reg dst,
 435                       struct brw_reg arg0,
 436                       struct brw_reg arg1,
 437                       GLuint cond)
 438 {
 439    struct brw_compile *p = &c->func;
 440
 441    brw_MOV(p, dst, brw_imm_f(0.0f));
 442    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 443    brw_MOV(p, dst, brw_imm_f(1.0f));
 444    brw_set_predicate_control_flag_value(p, 0xff);
 445 }
 446
 447 static void emit_seq( struct brw_vs_compile *c,
 448                       struct brw_reg dst,
 449                       struct brw_reg arg0,
 450                       struct brw_reg arg1 )
 451 {
 452    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 453 }
 454
 455 static void emit_sne( struct brw_vs_compile *c,
 456                       struct brw_reg dst,
 457                       struct brw_reg arg0,
 458                       struct brw_reg arg1 )
 459 {
 460    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 461 }
 462 static void emit_slt( struct brw_vs_compile *c,
 463                       struct brw_reg dst,
 464                       struct brw_reg arg0,
 465                       struct brw_reg arg1 )
 466 {
 467    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 468 }
 469
 470 static void emit_sle( struct brw_vs_compile *c,
 471                       struct brw_reg dst,
 472                       struct brw_reg arg0,
 473                       struct brw_reg arg1 )
 474 {
 475    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 476 }
 477
 478 static void emit_sgt( struct brw_vs_compile *c,
 479                       struct brw_reg dst,
 480                       struct brw_reg arg0,
 481                       struct brw_reg arg1 )
 482 {
 483    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 484 }
 485
 486 static void emit_sge( struct brw_vs_compile *c,
 487                       struct brw_reg dst,
 488                       struct brw_reg arg0,
 489                       struct brw_reg arg1 )
 490 {
 491   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 492 }
 493
 494 static void emit_cmp( struct brw_compile *p,
 495                       struct brw_reg dst,
 496                       struct brw_reg arg0,
 497                       struct brw_reg arg1,
 498                       struct brw_reg arg2 )
 499 {
 500    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 501    brw_SEL(p, dst, arg1, arg2);
 502    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 503 }
 504
 505 static void emit_sign(struct brw_vs_compile *c,
 506                       struct brw_reg dst,
 507                       struct brw_reg arg0)
 508 {
 509    struct brw_compile *p = &c->func;
 510
 511    brw_MOV(p, dst, brw_imm_f(0));
 512
 513    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 514    brw_MOV(p, dst, brw_imm_f(-1.0));
 515    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 516
 517    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
 518    brw_MOV(p, dst, brw_imm_f(1.0));
 519    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 520 }
 521
 522 static void emit_max( struct brw_compile *p,
 523                       struct brw_reg dst,
 524                       struct brw_reg arg0,
 525                       struct brw_reg arg1 )
 526 {
 527    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 528    brw_SEL(p, dst, arg0, arg1);
 529    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 530 }
 531
 532 static void emit_min( struct brw_compile *p,
 533                       struct brw_reg dst,
 534                       struct brw_reg arg0,
 535                       struct brw_reg arg1 )
 536 {
 537    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 538    brw_SEL(p, dst, arg0, arg1);
 539    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 540 }
 541
 542
 543 static void emit_math1( struct brw_vs_compile *c,
 544                         GLuint function,
 545                         struct brw_reg dst,
 546                         struct brw_reg arg0,
 547                         GLuint precision)
 548 {
 549    /* There are various odd behaviours with SEND on the simulator.  In
 550     * addition there are documented issues with the fact that the GEN4
 551     * processor doesn't do dependency control properly on SEND
 552     * results.  So, on balance, this kludge to get around failures
 553     * with writemasked math results looks like it might be necessary
 554     * whether that turns out to be a simulator bug or not:
 555     */
 556    struct brw_compile *p = &c->func;
 557    struct intel_context *intel = &p->brw->intel;
 558    struct brw_reg tmp = dst;
 559    GLboolean need_tmp = (intel->gen < 6 &&
 560                          (dst.dw1.bits.writemask != 0xf ||
 561                           dst.file != BRW_GENERAL_REGISTER_FILE));
 562
 563    if (need_tmp)
 564       tmp = get_tmp(c);
 565
 566    brw_math(p,
 567             tmp,
 568             function,
 569             BRW_MATH_SATURATE_NONE,
 570             2,
 571             arg0,
 572             BRW_MATH_DATA_SCALAR,
 573             precision);
 574
 575    if (need_tmp) {
 576       brw_MOV(p, dst, tmp);
 577       release_tmp(c, tmp);
 578    }
 579 }
 580
 581
 582 static void emit_math2( struct brw_vs_compile *c,
 583                         GLuint function,
 584                         struct brw_reg dst,
 585                         struct brw_reg arg0,
 586                         struct brw_reg arg1,
 587                         GLuint precision)
 588 {
 589    struct brw_compile *p = &c->func;
 590    struct intel_context *intel = &p->brw->intel;
 591    struct brw_reg tmp = dst;
 592    GLboolean need_tmp = (intel->gen < 6 &&
 593                          (dst.dw1.bits.writemask != 0xf ||
 594                           dst.file != BRW_GENERAL_REGISTER_FILE));
 595
 596    if (need_tmp)
 597       tmp = get_tmp(c);
 598
 599    brw_MOV(p, brw_message_reg(3), arg1);
 600
 601    brw_math(p,
 602             tmp,
 603             function,
 604             BRW_MATH_SATURATE_NONE,
 605             2,
 606             arg0,
 607             BRW_MATH_DATA_SCALAR,
 608             precision);
 609
 610    if (need_tmp) {
 611       brw_MOV(p, dst, tmp);
 612       release_tmp(c, tmp);
 613    }
 614 }
 615
 616
 617 static void emit_exp_noalias( struct brw_vs_compile *c,
 618                               struct brw_reg dst,
 619                               struct brw_reg arg0 )
 620 {
 621    struct brw_compile *p = &c->func;
 622
 623
 624    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 625       struct brw_reg tmp = get_tmp(c);
 626       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 627
 628       /* tmp_d = floor(arg0.x) */
 629       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 630
 631       /* result[0] = 2.0 ^ tmp */
 632
 633       /* Adjust exponent for floating point:
 634        * exp += 127
 635        */
 636       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 637
 638       /* Install exponent and sign.
 639        * Excess drops off the edge:
 640        */
 641       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 642               tmp_d, brw_imm_d(23));
 643
 644       release_tmp(c, tmp);
 645    }
 646
 647    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 648       /* result[1] = arg0.x - floor(arg0.x) */
 649       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 650    }
 651
 652    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 653       /* As with the LOG instruction, we might be better off just
 654        * doing a taylor expansion here, seeing as we have to do all
 655        * the prep work.
 656        *
 657        * If mathbox partial precision is too low, consider also:
 658        * result[3] = result[0] * EXP(result[1])
 659        */
 660       emit_math1(c,
 661                  BRW_MATH_FUNCTION_EXP,
 662                  brw_writemask(dst, WRITEMASK_Z),
 663                  brw_swizzle1(arg0, 0),
 664                  BRW_MATH_PRECISION_FULL);
 665    }
 666
 667    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 668       /* result[3] = 1.0; */
 669       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 670    }
 671 }
 672
 673
 674 static void emit_log_noalias( struct brw_vs_compile *c,
 675                               struct brw_reg dst,
 676                               struct brw_reg arg0 )
 677 {
 678    struct brw_compile *p = &c->func;
 679    struct brw_reg tmp = dst;
 680    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 681    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 682    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 683                          dst.file != BRW_GENERAL_REGISTER_FILE);
 684
 685    if (need_tmp) {
 686       tmp = get_tmp(c);
 687       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 688    }
 689
 690    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 691     * according to spec:
 692     *
 693     * These almost look likey they could be joined up, but not really
 694     * practical:
 695     *
 696     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 697     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 698     */
 699    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 700       brw_AND(p,
 701               brw_writemask(tmp_ud, WRITEMASK_X),
 702               brw_swizzle1(arg0_ud, 0),
 703               brw_imm_ud((1U<<31)-1));
 704
 705       brw_SHR(p,
 706               brw_writemask(tmp_ud, WRITEMASK_X),
 707               tmp_ud,
 708               brw_imm_ud(23));
 709
 710       brw_ADD(p,
 711               brw_writemask(tmp, WRITEMASK_X),
 712               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 713               brw_imm_d(-127));
 714    }
 715
 716    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 717       brw_AND(p,
 718               brw_writemask(tmp_ud, WRITEMASK_Y),
 719               brw_swizzle1(arg0_ud, 0),
 720               brw_imm_ud((1<<23)-1));
 721
 722       brw_OR(p,
 723              brw_writemask(tmp_ud, WRITEMASK_Y),
 724              tmp_ud,
 725              brw_imm_ud(127<<23));
 726    }
 727
 728    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 729       /* result[2] = result[0] + LOG2(result[1]); */
 730
 731       /* Why bother?  The above is just a hint how to do this with a
 732        * taylor series.  Maybe we *should* use a taylor series as by
 733        * the time all the above has been done it's almost certainly
 734        * quicker than calling the mathbox, even with low precision.
 735        *
 736        * Options are:
 737        *    - result[0] + mathbox.LOG2(result[1])
 738        *    - mathbox.LOG2(arg0.x)
 739        *    - result[0] + inline_taylor_approx(result[1])
 740        */
 741       emit_math1(c,
 742                  BRW_MATH_FUNCTION_LOG,
 743                  brw_writemask(tmp, WRITEMASK_Z),
 744                  brw_swizzle1(tmp, 1),
 745                  BRW_MATH_PRECISION_FULL);
 746
 747       brw_ADD(p,
 748               brw_writemask(tmp, WRITEMASK_Z),
 749               brw_swizzle1(tmp, 2),
 750               brw_swizzle1(tmp, 0));
 751    }
 752
 753    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 754       /* result[3] = 1.0; */
 755       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 756    }
 757
 758    if (need_tmp) {
 759       brw_MOV(p, dst, tmp);
 760       release_tmp(c, tmp);
 761    }
 762 }
 763
 764
 765 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 766  */
 767 static void emit_dst_noalias( struct brw_vs_compile *c,
 768                               struct brw_reg dst,
 769                               struct brw_reg arg0,
 770                               struct brw_reg arg1)
 771 {
 772    struct brw_compile *p = &c->func;
 773
 774    /* There must be a better way to do this:
 775     */
 776    if (dst.dw1.bits.writemask & WRITEMASK_X)
 777       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 778    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 779       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 780    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 781       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 782    if (dst.dw1.bits.writemask & WRITEMASK_W)
 783       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 784 }
 785
 786
 787 static void emit_xpd( struct brw_compile *p,
 788                       struct brw_reg dst,
 789                       struct brw_reg t,
 790                       struct brw_reg u)
 791 {
 792    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 793    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 794 }
 795
 796
 797 static void emit_lit_noalias( struct brw_vs_compile *c,
 798                               struct brw_reg dst,
 799                               struct brw_reg arg0 )
 800 {
 801    struct brw_compile *p = &c->func;
 802    struct brw_instruction *if_insn;
 803    struct brw_reg tmp = dst;
 804    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 805
 806    if (need_tmp)
 807       tmp = get_tmp(c);
 808
 809    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 810    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 811
 812    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 813     * to get all channels active inside the IF.  In the clipping code
 814     * we run with NoMask, so it's not an option and we can use
 815     * BRW_EXECUTE_1 for all comparisions.
 816     */
 817    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 818    if_insn = brw_IF(p, BRW_EXECUTE_8);
 819    {
 820       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 821
 822       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 823       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 824       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 825
 826       emit_math2(c,
 827                  BRW_MATH_FUNCTION_POW,
 828                  brw_writemask(dst, WRITEMASK_Z),
 829                  brw_swizzle1(tmp, 2),
 830                  brw_swizzle1(arg0, 3),
 831                  BRW_MATH_PRECISION_PARTIAL);
 832    }
 833
 834    brw_ENDIF(p, if_insn);
 835
 836    release_tmp(c, tmp);
 837 }
 838
 839 static void emit_lrp_noalias(struct brw_vs_compile *c,
 840                              struct brw_reg dst,
 841                              struct brw_reg arg0,
 842                              struct brw_reg arg1,
 843                              struct brw_reg arg2)
 844 {
 845    struct brw_compile *p = &c->func;
 846
 847    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 848    brw_MUL(p, brw_null_reg(), dst, arg2);
 849    brw_MAC(p, dst, arg0, arg1);
 850 }
 851
 852 /** 3 or 4-component vector normalization */
 853 static void emit_nrm( struct brw_vs_compile *c,
 854                       struct brw_reg dst,
 855                       struct brw_reg arg0,
 856                       int num_comps)
 857 {
 858    struct brw_compile *p = &c->func;
 859    struct brw_reg tmp = get_tmp(c);
 860
 861    /* tmp = dot(arg0, arg0) */
 862    if (num_comps == 3)
 863       brw_DP3(p, tmp, arg0, arg0);
 864    else
 865       brw_DP4(p, tmp, arg0, arg0);
 866
 867    /* tmp = 1 / sqrt(tmp) */
 868    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 869
 870    /* dst = arg0 * tmp */
 871    brw_MUL(p, dst, arg0, tmp);
 872
 873    release_tmp(c, tmp);
 874 }
 875
 876
 877 static struct brw_reg
 878 get_constant(struct brw_vs_compile *c,
 879              const struct prog_instruction *inst,
 880              GLuint argIndex)
 881 {
 882    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 883    struct brw_compile *p = &c->func;
 884    struct brw_reg const_reg = c->current_const[argIndex].reg;
 885
 886    assert(argIndex < 3);
 887
 888    if (c->current_const[argIndex].index != src->Index) {
 889       /* Keep track of the last constant loaded in this slot, for reuse. */
 890       c->current_const[argIndex].index = src->Index;
 891
 892 #if 0
 893       printf("  fetch const[%d] for arg %d into reg %d\n",
 894              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 895 #endif
 896       /* need to fetch the constant now */
 897       brw_dp_READ_4_vs(p,
 898                        const_reg,                     /* writeback dest */
 899                        16 * src->Index,               /* byte offset */
 900                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 901                        );
 902    }
 903
 904    /* replicate lower four floats into upper half (to get XYZWXYZW) */
 905    const_reg = stride(const_reg, 0, 4, 0);
 906    const_reg.subnr = 0;
 907
 908    return const_reg;
 909 }
 910
 911 static struct brw_reg
 912 get_reladdr_constant(struct brw_vs_compile *c,
 913                      const struct prog_instruction *inst,
 914                      GLuint argIndex)
 915 {
 916    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 917    struct brw_compile *p = &c->func;
 918    struct brw_reg const_reg = c->current_const[argIndex].reg;
 919    struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 920    struct brw_reg byte_addr_reg = get_tmp(c);
 921
 922    assert(argIndex < 3);
 923
 924    /* Can't reuse a reladdr constant load. */
 925    c->current_const[argIndex].index = -1;
 926
 927  #if 0
 928    printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
 929           src->Index, argIndex, c->current_const[argIndex].reg.nr);
 930 #endif
 931
 932    brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
 933
 934    /* fetch the first vec4 */
 935    brw_dp_READ_4_vs_relative(p,
 936                              const_reg,                     /* writeback dest */
 937                              byte_addr_reg,                 /* address register */
 938                              16 * src->Index,               /* byte offset */
 939                              SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 940                              );
 941
 942    return const_reg;
 943 }
 944
 945
 946
 947 /* TODO: relative addressing!
 948  */
 949 static struct brw_reg get_reg( struct brw_vs_compile *c,
 950                                gl_register_file file,
 951                                GLuint index )
 952 {
 953    switch (file) {
 954    case PROGRAM_TEMPORARY:
 955    case PROGRAM_INPUT:
 956    case PROGRAM_OUTPUT:
 957       assert(c->regs[file][index].nr != 0);
 958       return c->regs[file][index];
 959    case PROGRAM_STATE_VAR:
 960    case PROGRAM_CONSTANT:
 961    case PROGRAM_UNIFORM:
 962       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 963       return c->regs[PROGRAM_STATE_VAR][index];
 964    case PROGRAM_ADDRESS:
 965       assert(index == 0);
 966       return c->regs[file][index];
 967
 968    case PROGRAM_UNDEFINED:                      /* undef values */
 969       return brw_null_reg();
 970
 971    case PROGRAM_LOCAL_PARAM:
 972    case PROGRAM_ENV_PARAM:
 973    case PROGRAM_WRITE_ONLY:
 974    default:
 975       assert(0);
 976       return brw_null_reg();
 977    }
 978 }
 979
 980
 981 /**
 982  * Indirect addressing:  get reg[[arg] + offset].
 983  */
 984 static struct brw_reg deref( struct brw_vs_compile *c,
 985                              struct brw_reg arg,
 986                              GLint offset,
 987                              GLuint reg_size )
 988 {
 989    struct brw_compile *p = &c->func;
 990    struct brw_reg tmp = get_tmp(c);
 991    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 992    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
 993    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
 994    struct brw_reg indirect = brw_vec4_indirect(0,0);
 995    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
 996
 997    /* Set the vertical stride on the register access so that the first
 998     * 4 components come from a0.0 and the second 4 from a0.1.
 999     */
1000    indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1001
1002    {
1003       brw_push_insn_state(p);
1004       brw_set_access_mode(p, BRW_ALIGN_1);
1005
1006       brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1007       brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1008
1009       brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1010       brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1011
1012       brw_MOV(p, tmp, indirect);
1013
1014       brw_pop_insn_state(p);
1015    }
1016
1017    /* NOTE: tmp not released */
1018    return tmp;
1019 }
1020
1021 static void
1022 move_to_reladdr_dst(struct brw_vs_compile *c,
1023                     const struct prog_instruction *inst,
1024                     struct brw_reg val)
1025 {
1026    struct brw_compile *p = &c->func;
1027    int reg_size = 32;
1028    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1029    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1030    struct brw_reg temp_base = c->regs[inst->DstReg.File][0];
1031    GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr;
1032    struct brw_reg indirect = brw_vec4_indirect(0,0);
1033    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1034
1035    byte_offset += inst->DstReg.Index * reg_size;
1036
1037    brw_push_insn_state(p);
1038    brw_set_access_mode(p, BRW_ALIGN_1);
1039
1040    brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1041    brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1042    brw_MOV(p, indirect, val);
1043
1044    brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1045    brw_ADD(p, brw_address_reg(0), acc,
1046            brw_imm_uw(byte_offset + reg_size / 2));
1047    brw_MOV(p, indirect, suboffset(val, 4));
1048
1049    brw_pop_insn_state(p);
1050 }
1051
1052 /**
1053  * Get brw reg corresponding to the instruction's [argIndex] src reg.
1054  * TODO: relative addressing!
1055  */
1056 static struct brw_reg
1057 get_src_reg( struct brw_vs_compile *c,
1058              const struct prog_instruction *inst,
1059              GLuint argIndex )
1060 {
1061    const GLuint file = inst->SrcReg[argIndex].File;
1062    const GLint index = inst->SrcReg[argIndex].Index;
1063    const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1064
1065    if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1066       const struct prog_src_register *src = &inst->SrcReg[argIndex];
1067
1068       if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1069                                         SWIZZLE_ZERO,
1070                                         SWIZZLE_ZERO,
1071                                         SWIZZLE_ZERO)) {
1072           return brw_imm_f(0.0f);
1073       } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1074                                                SWIZZLE_ONE,
1075                                                SWIZZLE_ONE,
1076                                                SWIZZLE_ONE)) {
1077          if (src->Negate)
1078             return brw_imm_f(-1.0F);
1079          else
1080             return brw_imm_f(1.0F);
1081       } else if (src->File == PROGRAM_CONSTANT) {
1082          const struct gl_program_parameter_list *params;
1083          float f;
1084          int component = -1;
1085
1086          switch (src->Swizzle) {
1087          case SWIZZLE_XXXX:
1088             component = 0;
1089             break;
1090          case SWIZZLE_YYYY:
1091             component = 1;
1092             break;
1093          case SWIZZLE_ZZZZ:
1094             component = 2;
1095             break;
1096          case SWIZZLE_WWWW:
1097             component = 3;
1098             break;
1099          }
1100
1101          if (component >= 0) {
1102             params = c->vp->program.Base.Parameters;
1103             f = params->ParameterValues[src->Index][component];
1104
1105             if (src->Abs)
1106                f = fabs(f);
1107             if (src->Negate)
1108                f = -f;
1109             return brw_imm_f(f);
1110          }
1111       }
1112    }
1113
1114    switch (file) {
1115    case PROGRAM_TEMPORARY:
1116    case PROGRAM_INPUT:
1117    case PROGRAM_OUTPUT:
1118       if (relAddr) {
1119          return deref(c, c->regs[file][0], index, 32);
1120       }
1121       else {
1122          assert(c->regs[file][index].nr != 0);
1123          return c->regs[file][index];
1124       }
1125
1126    case PROGRAM_STATE_VAR:
1127    case PROGRAM_CONSTANT:
1128    case PROGRAM_UNIFORM:
1129    case PROGRAM_ENV_PARAM:
1130    case PROGRAM_LOCAL_PARAM:
1131       if (c->vp->use_const_buffer) {
1132          if (!relAddr && c->constant_map[index] != -1) {
1133             assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1134             return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1135          } else if (relAddr)
1136             return get_reladdr_constant(c, inst, argIndex);
1137          else
1138             return get_constant(c, inst, argIndex);
1139       }
1140       else if (relAddr) {
1141          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
1142       }
1143       else {
1144          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1145          return c->regs[PROGRAM_STATE_VAR][index];
1146       }
1147    case PROGRAM_ADDRESS:
1148       assert(index == 0);
1149       return c->regs[file][index];
1150
1151    case PROGRAM_UNDEFINED:
1152       /* this is a normal case since we loop over all three src args */
1153       return brw_null_reg();
1154
1155    case PROGRAM_WRITE_ONLY:
1156    default:
1157       assert(0);
1158       return brw_null_reg();
1159    }
1160 }
1161
1162 /**
1163  * Return the brw reg for the given instruction's src argument.
1164  * Will return mangled results for SWZ op.  The emit_swz() function
1165  * ignores this result and recalculates taking extended swizzles into
1166  * account.
1167  */
1168 static struct brw_reg get_arg( struct brw_vs_compile *c,
1169                                const struct prog_instruction *inst,
1170                                GLuint argIndex )
1171 {
1172    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1173    struct brw_reg reg;
1174
1175    if (src->File == PROGRAM_UNDEFINED)
1176       return brw_null_reg();
1177
1178    reg = get_src_reg(c, inst, argIndex);
1179
1180    /* Convert 3-bit swizzle to 2-bit.
1181     */
1182    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1183                                        GET_SWZ(src->Swizzle, 1),
1184                                        GET_SWZ(src->Swizzle, 2),
1185                                        GET_SWZ(src->Swizzle, 3));
1186
1187    /* Note this is ok for non-swizzle instructions:
1188     */
1189    reg.negate = src->Negate ? 1 : 0;
1190
1191    return reg;
1192 }
1193
1194
1195 /**
1196  * Get brw register for the given program dest register.
1197  */
1198 static struct brw_reg get_dst( struct brw_vs_compile *c,
1199                                struct prog_dst_register dst )
1200 {
1201    struct brw_reg reg;
1202
1203    switch (dst.File) {
1204    case PROGRAM_TEMPORARY:
1205    case PROGRAM_OUTPUT:
1206       /* register-indirect addressing is only 1x1, not VxH, for
1207        * destination regs.  So, for RelAddr we'll return a temporary
1208        * for the dest and do a move of the result to the RelAddr
1209        * register after the instruction emit.
1210        */
1211       if (dst.RelAddr) {
1212          reg = get_tmp(c);
1213       } else {
1214          assert(c->regs[dst.File][dst.Index].nr != 0);
1215          reg = c->regs[dst.File][dst.Index];
1216       }
1217       break;
1218    case PROGRAM_ADDRESS:
1219       assert(dst.Index == 0);
1220       reg = c->regs[dst.File][dst.Index];
1221       break;
1222    case PROGRAM_UNDEFINED:
1223       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1224       reg = brw_null_reg();
1225       break;
1226    default:
1227       assert(0);
1228       reg = brw_null_reg();
1229    }
1230
1231    reg.dw1.bits.writemask = dst.WriteMask;
1232
1233    return reg;
1234 }
1235
1236
1237 static void emit_swz( struct brw_vs_compile *c,
1238                       struct brw_reg dst,
1239                       const struct prog_instruction *inst)
1240 {
1241    const GLuint argIndex = 0;
1242    const struct prog_src_register src = inst->SrcReg[argIndex];
1243    struct brw_compile *p = &c->func;
1244    GLuint zeros_mask = 0;
1245    GLuint ones_mask = 0;
1246    GLuint src_mask = 0;
1247    GLubyte src_swz[4];
1248    GLboolean need_tmp = (src.Negate &&
1249                          dst.file != BRW_GENERAL_REGISTER_FILE);
1250    struct brw_reg tmp = dst;
1251    GLuint i;
1252
1253    if (need_tmp)
1254       tmp = get_tmp(c);
1255
1256    for (i = 0; i < 4; i++) {
1257       if (dst.dw1.bits.writemask & (1<<i)) {
1258          GLubyte s = GET_SWZ(src.Swizzle, i);
1259          switch (s) {
1260          case SWIZZLE_X:
1261          case SWIZZLE_Y:
1262          case SWIZZLE_Z:
1263          case SWIZZLE_W:
1264             src_mask |= 1<<i;
1265             src_swz[i] = s;
1266             break;
1267          case SWIZZLE_ZERO:
1268             zeros_mask |= 1<<i;
1269             break;
1270          case SWIZZLE_ONE:
1271             ones_mask |= 1<<i;
1272             break;
1273          }
1274       }
1275    }
1276
1277    /* Do src first, in case dst aliases src:
1278     */
1279    if (src_mask) {
1280       struct brw_reg arg0;
1281
1282       arg0 = get_src_reg(c, inst, argIndex);
1283
1284       arg0 = brw_swizzle(arg0,
1285                          src_swz[0], src_swz[1],
1286                          src_swz[2], src_swz[3]);
1287
1288       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1289    }
1290
1291    if (zeros_mask)
1292       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1293
1294    if (ones_mask)
1295       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1296
1297    if (src.Negate)
1298       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1299
1300    if (need_tmp) {
1301       brw_MOV(p, dst, tmp);
1302       release_tmp(c, tmp);
1303    }
1304 }
1305
1306
1307 /**
1308  * Post-vertex-program processing.  Send the results to the URB.
1309  */
1310 static void emit_vertex_write( struct brw_vs_compile *c)
1311 {
1312    struct brw_compile *p = &c->func;
1313    struct brw_context *brw = p->brw;
1314    struct intel_context *intel = &brw->intel;
1315    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1316    struct brw_reg ndc;
1317    int eot;
1318    GLuint len_vertex_header = 2;
1319
1320    if (c->key.copy_edgeflag) {
1321       brw_MOV(p,
1322               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1323               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1324    }
1325
1326    if (intel->gen < 6) {
1327       /* Build ndc coords */
1328       ndc = get_tmp(c);
1329       /* ndc = 1.0 / pos.w */
1330       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1331       /* ndc.xyz = pos * ndc */
1332       brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1333    }
1334
1335    /* Update the header for point size, user clipping flags, and -ve rhw
1336     * workaround.
1337     */
1338    if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1339        c->key.nr_userclip || brw->has_negative_rhw_bug)
1340    {
1341       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1342       GLuint i;
1343
1344       brw_MOV(p, header1, brw_imm_ud(0));
1345
1346       brw_set_access_mode(p, BRW_ALIGN_16);
1347
1348       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1349          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1350          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1351          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1352       }
1353
1354       for (i = 0; i < c->key.nr_userclip; i++) {
1355          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1356          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1357          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1358          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1359       }
1360
1361       /* i965 clipping workaround:
1362        * 1) Test for -ve rhw
1363        * 2) If set,
1364        *      set ndc = (0,0,0,0)
1365        *      set ucp[6] = 1
1366        *
1367        * Later, clipping will detect ucp[6] and ensure the primitive is
1368        * clipped against all fixed planes.
1369        */
1370       if (brw->has_negative_rhw_bug) {
1371          brw_CMP(p,
1372                  vec8(brw_null_reg()),
1373                  BRW_CONDITIONAL_L,
1374                  brw_swizzle1(ndc, 3),
1375                  brw_imm_f(0));
1376
1377          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1378          brw_MOV(p, ndc, brw_imm_f(0));
1379          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1380       }
1381
1382       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1383       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1384       brw_set_access_mode(p, BRW_ALIGN_16);
1385
1386       release_tmp(c, header1);
1387    }
1388    else {
1389       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1390    }
1391
1392    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1393     * of zeros followed by two sets of NDC coordinates:
1394     */
1395    brw_set_access_mode(p, BRW_ALIGN_1);
1396
1397    /* The VUE layout is documented in Volume 2a. */
1398    if (intel->gen >= 6) {
1399       /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1400        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1401        * dword 4-7 (m2) is the 4D space position
1402        * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1403        * enabled.  We don't use it, so skip it.
1404        * m3 is the first vertex element data we fill, which is the vertex
1405        * position.
1406        */
1407       brw_MOV(p, brw_message_reg(2), pos);
1408       brw_MOV(p, brw_message_reg(3), pos);
1409       len_vertex_header = 2;
1410    } else if (intel->gen == 5) {
1411       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1412        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1413        * dword 4-7 (m2) is the ndc position (set above)
1414        * dword 8-11 (m3) of the vertex header is the 4D space position
1415        * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1416        * m6 is a pad so that the vertex element data is aligned
1417        * m7 is the first vertex data we fill, which is the vertex position.
1418        */
1419       brw_MOV(p, brw_message_reg(2), ndc);
1420       brw_MOV(p, brw_message_reg(3), pos);
1421       brw_MOV(p, brw_message_reg(7), pos);
1422       len_vertex_header = 6;
1423    } else {
1424       /* There are 8 dwords in VUE header pre-Ironlake:
1425        * dword 0-3 (m1) is indices, point width, clip flags.
1426        * dword 4-7 (m2) is ndc position (set above)
1427        *
1428        * dword 8-11 (m3) is the first vertex data, which we always have be the
1429        * vertex position.
1430        */
1431       brw_MOV(p, brw_message_reg(2), ndc);
1432       brw_MOV(p, brw_message_reg(3), pos);
1433       len_vertex_header = 2;
1434    }
1435
1436    eot = (c->first_overflow_output == 0);
1437
1438    brw_urb_WRITE(p,
1439                  brw_null_reg(), /* dest */
1440                  0,             /* starting mrf reg nr */
1441                  c->r0,         /* src */
1442                  0,             /* allocate */
1443                  1,             /* used */
1444                  MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1445                  0,             /* response len */
1446                  eot,           /* eot */
1447                  eot,           /* writes complete */
1448                  0,             /* urb destination offset */
1449                  BRW_URB_SWIZZLE_INTERLEAVE);
1450
1451    if (c->first_overflow_output > 0) {
1452       /* Not all of the vertex outputs/results fit into the MRF.
1453        * Move the overflowed attributes from the GRF to the MRF and
1454        * issue another brw_urb_WRITE().
1455        */
1456       GLuint i, mrf = 1;
1457       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1458          if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1459             /* move from GRF to MRF */
1460             brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1461             mrf++;
1462          }
1463       }
1464
1465       brw_urb_WRITE(p,
1466                     brw_null_reg(), /* dest */
1467                     0,              /* starting mrf reg nr */
1468                     c->r0,          /* src */
1469                     0,              /* allocate */
1470                     1,              /* used */
1471                     mrf,            /* msg len */
1472                     0,              /* response len */
1473                     1,              /* eot */
1474                     1,              /* writes complete */
1475                     14 / 2,  /* urb destination offset */
1476                     BRW_URB_SWIZZLE_INTERLEAVE);
1477    }
1478 }
1479
1480 static GLboolean
1481 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1482 {
1483    struct brw_compile *p = &c->func;
1484    struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1485
1486    if (p->nr_insn == 0)
1487       return GL_FALSE;
1488
1489    if (val.address_mode != BRW_ADDRESS_DIRECT)
1490       return GL_FALSE;
1491
1492    switch (prev_insn->header.opcode) {
1493    case BRW_OPCODE_MOV:
1494    case BRW_OPCODE_MAC:
1495    case BRW_OPCODE_MUL:
1496       if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1497           prev_insn->header.execution_size == val.width &&
1498           prev_insn->bits1.da1.dest_reg_file == val.file &&
1499           prev_insn->bits1.da1.dest_reg_type == val.type &&
1500           prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1501           prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1502           prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1503           prev_insn->bits1.da16.dest_writemask == 0xf)
1504          return GL_TRUE;
1505       else
1506          return GL_FALSE;
1507    default:
1508       return GL_FALSE;
1509    }
1510 }
1511
1512 static uint32_t
1513 get_predicate(const struct prog_instruction *inst)
1514 {
1515    if (inst->DstReg.CondMask == COND_TR)
1516       return BRW_PREDICATE_NONE;
1517
1518    /* All of GLSL only produces predicates for COND_NE and one channel per
1519     * vector.  Fail badly if someone starts doing something else, as it might
1520     * mean infinite looping or something.
1521     *
1522     * We'd like to support all the condition codes, but our hardware doesn't
1523     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1524     * those, the instruction may update the condition codes or not, then any
1525     * later instruction may use one of those condition codes.  For gen4, the
1526     * instruction may update the flags register based on one of the condition
1527     * codes output by the instruction, and then further instructions may
1528     * predicate on that.  We can probably support this, but it won't
1529     * necessarily be easy.
1530     */
1531    assert(inst->DstReg.CondMask == COND_NE);
1532
1533    switch (inst->DstReg.CondSwizzle) {
1534    case SWIZZLE_XXXX:
1535       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1536    case SWIZZLE_YYYY:
1537       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1538    case SWIZZLE_ZZZZ:
1539       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1540    case SWIZZLE_WWWW:
1541       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1542    default:
1543       _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1544                     inst->DstReg.CondMask);
1545       return BRW_PREDICATE_NORMAL;
1546    }
1547 }
1548
1549 /* Emit the vertex program instructions here.
1550  */
1551 void brw_vs_emit(struct brw_vs_compile *c )
1552 {
1553 #define MAX_IF_DEPTH 32
1554 #define MAX_LOOP_DEPTH 32
1555    struct brw_compile *p = &c->func;
1556    struct brw_context *brw = p->brw;
1557    struct intel_context *intel = &brw->intel;
1558    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1559    GLuint insn, if_depth = 0, loop_depth = 0;
1560    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1561    const struct brw_indirect stack_index = brw_indirect(0, 0);
1562    GLuint index;
1563    GLuint file;
1564
1565    if (INTEL_DEBUG & DEBUG_VS) {
1566       printf("vs-mesa:\n");
1567       _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1568                                GL_TRUE);
1569       printf("\n");
1570    }
1571
1572    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1573    brw_set_access_mode(p, BRW_ALIGN_16);
1574
1575    for (insn = 0; insn < nr_insns; insn++) {
1576        GLuint i;
1577        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1578
1579        /* Message registers can't be read, so copy the output into GRF
1580         * register if they are used in source registers
1581         */
1582        for (i = 0; i < 3; i++) {
1583            struct prog_src_register *src = &inst->SrcReg[i];
1584            GLuint index = src->Index;
1585            GLuint file = src->File;
1586            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1587                c->output_regs[index].used_in_src = GL_TRUE;
1588        }
1589
1590        switch (inst->Opcode) {
1591        case OPCODE_CAL:
1592        case OPCODE_RET:
1593           c->needs_stack = GL_TRUE;
1594           break;
1595        default:
1596           break;
1597        }
1598    }
1599
1600    /* Static register allocation
1601     */
1602    brw_vs_alloc_regs(c);
1603
1604    if (c->needs_stack)
1605       brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1606
1607    for (insn = 0; insn < nr_insns; insn++) {
1608
1609       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1610       struct brw_reg args[3], dst;
1611       GLuint i;
1612
1613 #if 0
1614       printf("%d: ", insn);
1615       _mesa_print_instruction(inst);
1616 #endif
1617
1618       /* Get argument regs.  SWZ is special and does this itself.
1619        */
1620       if (inst->Opcode != OPCODE_SWZ)
1621           for (i = 0; i < 3; i++) {
1622               const struct prog_src_register *src = &inst->SrcReg[i];
1623               index = src->Index;
1624               file = src->File;
1625               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1626                   args[i] = c->output_regs[index].reg;
1627               else
1628                   args[i] = get_arg(c, inst, i);
1629           }
1630
1631       /* Get dest regs.  Note that it is possible for a reg to be both
1632        * dst and arg, given the static allocation of registers.  So
1633        * care needs to be taken emitting multi-operation instructions.
1634        */
1635       index = inst->DstReg.Index;
1636       file = inst->DstReg.File;
1637       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1638           dst = c->output_regs[index].reg;
1639       else
1640           dst = get_dst(c, inst->DstReg);
1641
1642       if (inst->SaturateMode != SATURATE_OFF) {
1643          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1644                        inst->SaturateMode);
1645       }
1646
1647       switch (inst->Opcode) {
1648       case OPCODE_ABS:
1649          brw_MOV(p, dst, brw_abs(args[0]));
1650          break;
1651       case OPCODE_ADD:
1652          brw_ADD(p, dst, args[0], args[1]);
1653          break;
1654       case OPCODE_COS:
1655          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1656          break;
1657       case OPCODE_DP3:
1658          brw_DP3(p, dst, args[0], args[1]);
1659          break;
1660       case OPCODE_DP4:
1661          brw_DP4(p, dst, args[0], args[1]);
1662          break;
1663       case OPCODE_DPH:
1664          brw_DPH(p, dst, args[0], args[1]);
1665          break;
1666       case OPCODE_NRM3:
1667          emit_nrm(c, dst, args[0], 3);
1668          break;
1669       case OPCODE_NRM4:
1670          emit_nrm(c, dst, args[0], 4);
1671          break;
1672       case OPCODE_DST:
1673          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1674          break;
1675       case OPCODE_EXP:
1676          unalias1(c, dst, args[0], emit_exp_noalias);
1677          break;
1678       case OPCODE_EX2:
1679          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1680          break;
1681       case OPCODE_ARL:
1682          brw_RNDD(p, dst, args[0]);
1683          break;
1684       case OPCODE_FLR:
1685          brw_RNDD(p, dst, args[0]);
1686          break;
1687       case OPCODE_FRC:
1688          brw_FRC(p, dst, args[0]);
1689          break;
1690       case OPCODE_LOG:
1691          unalias1(c, dst, args[0], emit_log_noalias);
1692          break;
1693       case OPCODE_LG2:
1694          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1695          break;
1696       case OPCODE_LIT:
1697          unalias1(c, dst, args[0], emit_lit_noalias);
1698          break;
1699       case OPCODE_LRP:
1700          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1701          break;
1702       case OPCODE_MAD:
1703          if (!accumulator_contains(c, args[2]))
1704             brw_MOV(p, brw_acc_reg(), args[2]);
1705          brw_MAC(p, dst, args[0], args[1]);
1706          break;
1707       case OPCODE_CMP:
1708          emit_cmp(p, dst, args[0], args[1], args[2]);
1709          break;
1710       case OPCODE_MAX:
1711          emit_max(p, dst, args[0], args[1]);
1712          break;
1713       case OPCODE_MIN:
1714          emit_min(p, dst, args[0], args[1]);
1715          break;
1716       case OPCODE_MOV:
1717          brw_MOV(p, dst, args[0]);
1718          break;
1719       case OPCODE_MUL:
1720          brw_MUL(p, dst, args[0], args[1]);
1721          break;
1722       case OPCODE_POW:
1723          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1724          break;
1725       case OPCODE_RCP:
1726          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1727          break;
1728       case OPCODE_RSQ:
1729          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1730          break;
1731
1732       case OPCODE_SEQ:
1733          unalias2(c, dst, args[0], args[1], emit_seq);
1734          break;
1735       case OPCODE_SIN:
1736          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1737          break;
1738       case OPCODE_SNE:
1739          unalias2(c, dst, args[0], args[1], emit_sne);
1740          break;
1741       case OPCODE_SGE:
1742          unalias2(c, dst, args[0], args[1], emit_sge);
1743          break;
1744       case OPCODE_SGT:
1745          unalias2(c, dst, args[0], args[1], emit_sgt);
1746          break;
1747       case OPCODE_SLT:
1748          unalias2(c, dst, args[0], args[1], emit_slt);
1749          break;
1750       case OPCODE_SLE:
1751          unalias2(c, dst, args[0], args[1], emit_sle);
1752          break;
1753       case OPCODE_SSG:
1754          unalias1(c, dst, args[0], emit_sign);
1755          break;
1756       case OPCODE_SUB:
1757          brw_ADD(p, dst, args[0], negate(args[1]));
1758          break;
1759       case OPCODE_SWZ:
1760          /* The args[0] value can't be used here as it won't have
1761           * correctly encoded the full swizzle:
1762           */
1763          emit_swz(c, dst, inst);
1764          break;
1765       case OPCODE_TRUNC:
1766          /* round toward zero */
1767          brw_RNDZ(p, dst, args[0]);
1768          break;
1769       case OPCODE_XPD:
1770          emit_xpd(p, dst, args[0], args[1]);
1771          break;
1772       case OPCODE_IF:
1773          assert(if_depth < MAX_IF_DEPTH);
1774          if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1775          /* Note that brw_IF smashes the predicate_control field. */
1776          if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1777          if_depth++;
1778          break;
1779       case OPCODE_ELSE:
1780          assert(if_depth > 0);
1781          if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1782          break;
1783       case OPCODE_ENDIF:
1784          assert(if_depth > 0);
1785          brw_ENDIF(p, if_inst[--if_depth]);
1786          break;
1787       case OPCODE_BGNLOOP:
1788          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1789          break;
1790       case OPCODE_BRK:
1791          brw_set_predicate_control(p, get_predicate(inst));
1792          brw_BREAK(p);
1793          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1794          break;
1795       case OPCODE_CONT:
1796          brw_set_predicate_control(p, get_predicate(inst));
1797          brw_CONT(p);
1798          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1799          break;
1800       case OPCODE_ENDLOOP:
1801          {
1802             struct brw_instruction *inst0, *inst1;
1803             GLuint br = 1;
1804
1805             loop_depth--;
1806
1807             if (intel->gen == 5)
1808                br = 2;
1809
1810             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1811             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1812             while (inst0 > loop_inst[loop_depth]) {
1813                inst0--;
1814                if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1815                    inst0->bits3.if_else.jump_count == 0) {
1816                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1817                   inst0->bits3.if_else.pop_count = 0;
1818                }
1819                else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1820                         inst0->bits3.if_else.jump_count == 0) {
1821                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1822                   inst0->bits3.if_else.pop_count = 0;
1823                }
1824             }
1825          }
1826          break;
1827       case OPCODE_BRA:
1828          brw_set_predicate_control(p, get_predicate(inst));
1829          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1830          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1831          break;
1832       case OPCODE_CAL:
1833          brw_set_access_mode(p, BRW_ALIGN_1);
1834          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1835          brw_set_access_mode(p, BRW_ALIGN_16);
1836          brw_ADD(p, get_addr_reg(stack_index),
1837                          get_addr_reg(stack_index), brw_imm_d(4));
1838          brw_save_call(p, inst->Comment, p->nr_insn);
1839          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1840          break;
1841       case OPCODE_RET:
1842          brw_ADD(p, get_addr_reg(stack_index),
1843                          get_addr_reg(stack_index), brw_imm_d(-4));
1844          brw_set_access_mode(p, BRW_ALIGN_1);
1845          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1846          brw_set_access_mode(p, BRW_ALIGN_16);
1847          break;
1848       case OPCODE_END:
1849          emit_vertex_write(c);
1850          break;
1851       case OPCODE_PRINT:
1852          /* no-op */
1853          break;
1854       case OPCODE_BGNSUB:
1855          brw_save_label(p, inst->Comment, p->nr_insn);
1856          break;
1857       case OPCODE_ENDSUB:
1858          /* no-op */
1859          break;
1860       default:
1861          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1862                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1863                                     _mesa_opcode_string(inst->Opcode) :
1864                                     "unknown");
1865       }
1866
1867       /* Set the predication update on the last instruction of the native
1868        * instruction sequence.
1869        *
1870        * This would be problematic if it was set on a math instruction,
1871        * but that shouldn't be the case with the current GLSL compiler.
1872        */
1873       if (inst->CondUpdate) {
1874          struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1875
1876          assert(hw_insn->header.destreg__conditionalmod == 0);
1877          hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1878       }
1879
1880       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1881           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1882           && c->output_regs[inst->DstReg.Index].used_in_src) {
1883          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1884       }
1885
1886       /* Result color clamping.
1887        *
1888        * When destination register is an output register and
1889        * it's primary/secondary front/back color, we have to clamp
1890        * the result to [0,1]. This is done by enabling the
1891        * saturation bit for the last instruction.
1892        *
1893        * We don't use brw_set_saturate() as it modifies
1894        * p->current->header.saturate, which affects all the subsequent
1895        * instructions. Instead, we directly modify the header
1896        * of the last (already stored) instruction.
1897        */
1898       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1899          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1900              || (inst->DstReg.Index == VERT_RESULT_COL1)
1901              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1902              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1903             p->store[p->nr_insn-1].header.saturate = 1;
1904          }
1905       }
1906
1907       if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) {
1908          /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1909           * compute-to-mrf and the fact that we are allocating
1910           * registers for only the used PROGRAM_OUTPUTs.
1911           */
1912          move_to_reladdr_dst(c, inst, dst);
1913       }
1914
1915       release_tmps(c);
1916    }
1917
1918    brw_resolve_cals(p);
1919
1920    brw_optimize(p);
1921
1922    if (INTEL_DEBUG & DEBUG_VS) {
1923       int i;
1924
1925       printf("vs-native:\n");
1926       for (i = 0; i < p->nr_insn; i++)
1927          brw_disasm(stdout, &p->store[i], intel->gen);
1928       printf("\n");
1929    }
1930 }