src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "shader/program.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  42 {
  43    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  44
  45    if (++c->last_tmp > c->prog_data.total_grf)
  46       c->prog_data.total_grf = c->last_tmp;
  47
  48    return tmp;
  49 }
  50
  51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  52 {
  53    if (tmp.nr == c->last_tmp-1)
  54       c->last_tmp--;
  55 }
  56
  57 static void release_tmps( struct brw_vs_compile *c )
  58 {
  59    c->last_tmp = c->first_tmp;
  60 }
  61
  62
  63 /**
  64  * Preallocate GRF register before code emit.
  65  * Do things as simply as possible.  Allocate and populate all regs
  66  * ahead of time.
  67  */
  68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  69 {
  70    struct intel_context *intel = &c->func.brw->intel;
  71    GLuint i, reg = 0, mrf;
  72    int attributes_in_vue;
  73
  74    /* Determine whether to use a real constant buffer or use a block
  75     * of GRF registers for constants.  The later is faster but only
  76     * works if everything fits in the GRF.
  77     * XXX this heuristic/check may need some fine tuning...
  78     */
  79    if (c->vp->program.Base.Parameters->NumParameters +
  80        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
  81       c->vp->use_const_buffer = GL_TRUE;
  82    else
  83       c->vp->use_const_buffer = GL_FALSE;
  84
  85    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
  86
  87    /* r0 -- reserved as usual
  88     */
  89    c->r0 = brw_vec8_grf(reg, 0);
  90    reg++;
  91
  92    /* User clip planes from curbe:
  93     */
  94    if (c->key.nr_userclip) {
  95       for (i = 0; i < c->key.nr_userclip; i++) {
  96          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  97       }
  98
  99       /* Deal with curbe alignment:
 100        */
 101       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
 102    }
 103
 104    /* Vertex program parameters from curbe:
 105     */
 106    if (c->vp->use_const_buffer) {
 107       int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
 108       int constant = 0;
 109
 110       /* We've got more constants than we can load with the push
 111        * mechanism.  This is often correlated with reladdr loads where
 112        * we should probably be using a pull mechanism anyway to avoid
 113        * excessive reading.  However, the pull mechanism is slow in
 114        * general.  So, we try to allocate as many non-reladdr-loaded
 115        * constants through the push buffer as we can before giving up.
 116        */
 117       memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
 118       for (i = 0;
 119            i < c->vp->program.Base.NumInstructions && constant < max_constant;
 120            i++) {
 121          struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
 122          int arg;
 123
 124          for (arg = 0; arg < 3 && constant < max_constant; arg++) {
 125             if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
 126                  inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
 127                  inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
 128                  inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
 129                  inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
 130                 inst->SrcReg[arg].RelAddr)
 131                continue;
 132
 133             if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
 134                c->constant_map[inst->SrcReg[arg].Index] = constant++;
 135             }
 136          }
 137       }
 138
 139       for (i = 0; i < constant; i++) {
 140          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
 141                                                               (i%2) * 4),
 142                                                  0, 4, 1);
 143       }
 144       reg += (constant + 1) / 2;
 145       c->prog_data.curb_read_length = reg - 1;
 146       /* XXX 0 causes a bug elsewhere... */
 147       c->prog_data.nr_params = MAX2(constant * 4, 4);
 148    }
 149    else {
 150       /* use a section of the GRF for constants */
 151       GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
 152       for (i = 0; i < nr_params; i++) {
 153          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 154       }
 155       reg += (nr_params + 1) / 2;
 156       c->prog_data.curb_read_length = reg - 1;
 157
 158       c->prog_data.nr_params = nr_params * 4;
 159    }
 160
 161    /* Allocate input regs:
 162     */
 163    c->nr_inputs = 0;
 164    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 165       if (c->prog_data.inputs_read & (1 << i)) {
 166          c->nr_inputs++;
 167          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 168          reg++;
 169       }
 170    }
 171    /* If there are no inputs, we'll still be reading one attribute's worth
 172     * because it's required -- see urb_read_length setting.
 173     */
 174    if (c->nr_inputs == 0)
 175       reg++;
 176
 177    /* Allocate outputs.  The non-position outputs go straight into message regs.
 178     */
 179    c->nr_outputs = 0;
 180    c->first_output = reg;
 181    c->first_overflow_output = 0;
 182
 183    if (intel->gen >= 6)
 184       mrf = 6;
 185    else if (intel->is_ironlake)
 186       mrf = 8;
 187    else
 188       mrf = 4;
 189
 190    for (i = 0; i < VERT_RESULT_MAX; i++) {
 191       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 192          c->nr_outputs++;
 193          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 194          if (i == VERT_RESULT_HPOS) {
 195             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 196             reg++;
 197          }
 198          else if (i == VERT_RESULT_PSIZ) {
 199             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 200             reg++;
 201             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 202          }
 203          else {
 204             if (mrf < 16) {
 205                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 206                mrf++;
 207             }
 208             else {
 209                /* too many vertex results to fit in MRF, use GRF for overflow */
 210                if (!c->first_overflow_output)
 211                   c->first_overflow_output = i;
 212                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 213                reg++;
 214             }
 215          }
 216       }
 217    }
 218
 219    /* Allocate program temporaries:
 220     */
 221    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 222       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 223       reg++;
 224    }
 225
 226    /* Address reg(s).  Don't try to use the internal address reg until
 227     * deref time.
 228     */
 229    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 230       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 231                                              reg,
 232                                              0,
 233                                              BRW_REGISTER_TYPE_D,
 234                                              BRW_VERTICAL_STRIDE_8,
 235                                              BRW_WIDTH_8,
 236                                              BRW_HORIZONTAL_STRIDE_1,
 237                                              BRW_SWIZZLE_XXXX,
 238                                              WRITEMASK_X);
 239       reg++;
 240    }
 241
 242    if (c->vp->use_const_buffer) {
 243       for (i = 0; i < 3; i++) {
 244          c->current_const[i].index = -1;
 245          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 246          reg++;
 247       }
 248    }
 249
 250    for (i = 0; i < 128; i++) {
 251       if (c->output_regs[i].used_in_src) {
 252          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 253          reg++;
 254       }
 255    }
 256
 257    if (c->needs_stack) {
 258       c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 259       reg += 2;
 260    }
 261
 262    /* Some opcodes need an internal temporary:
 263     */
 264    c->first_tmp = reg;
 265    c->last_tmp = reg;           /* for allocation purposes */
 266
 267    /* Each input reg holds data from two vertices.  The
 268     * urb_read_length is the number of registers read from *each*
 269     * vertex urb, so is half the amount:
 270     */
 271    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 272    /* Setting this field to 0 leads to undefined behavior according to the
 273     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 274     * sitting in them, even if it's padding.
 275     */
 276    if (c->prog_data.urb_read_length == 0)
 277       c->prog_data.urb_read_length = 1;
 278
 279    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 280     * them to fit the biggest thing they need to.
 281     */
 282    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 283
 284    if (intel->gen >= 6)
 285       c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8;
 286    else if (intel->is_ironlake)
 287       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
 288    else
 289       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 290
 291    c->prog_data.total_grf = reg;
 292
 293    if (INTEL_DEBUG & DEBUG_VS) {
 294       printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 295       printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 296       printf("%s reg = %d\n", __FUNCTION__, reg);
 297    }
 298 }
 299
 300
 301 /**
 302  * If an instruction uses a temp reg both as a src and the dest, we
 303  * sometimes need to allocate an intermediate temporary.
 304  */
 305 static void unalias1( struct brw_vs_compile *c,
 306                       struct brw_reg dst,
 307                       struct brw_reg arg0,
 308                       void (*func)( struct brw_vs_compile *,
 309                                     struct brw_reg,
 310                                     struct brw_reg ))
 311 {
 312    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 313       struct brw_compile *p = &c->func;
 314       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 315       func(c, tmp, arg0);
 316       brw_MOV(p, dst, tmp);
 317       release_tmp(c, tmp);
 318    }
 319    else {
 320       func(c, dst, arg0);
 321    }
 322 }
 323
 324 /**
 325  * \sa unalias2
 326  * Checkes if 2-operand instruction needs an intermediate temporary.
 327  */
 328 static void unalias2( struct brw_vs_compile *c,
 329                       struct brw_reg dst,
 330                       struct brw_reg arg0,
 331                       struct brw_reg arg1,
 332                       void (*func)( struct brw_vs_compile *,
 333                                     struct brw_reg,
 334                                     struct brw_reg,
 335                                     struct brw_reg ))
 336 {
 337    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 338        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 339       struct brw_compile *p = &c->func;
 340       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 341       func(c, tmp, arg0, arg1);
 342       brw_MOV(p, dst, tmp);
 343       release_tmp(c, tmp);
 344    }
 345    else {
 346       func(c, dst, arg0, arg1);
 347    }
 348 }
 349
 350 /**
 351  * \sa unalias2
 352  * Checkes if 3-operand instruction needs an intermediate temporary.
 353  */
 354 static void unalias3( struct brw_vs_compile *c,
 355                       struct brw_reg dst,
 356                       struct brw_reg arg0,
 357                       struct brw_reg arg1,
 358                       struct brw_reg arg2,
 359                       void (*func)( struct brw_vs_compile *,
 360                                     struct brw_reg,
 361                                     struct brw_reg,
 362                                     struct brw_reg,
 363                                     struct brw_reg ))
 364 {
 365    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 366        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 367        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 368       struct brw_compile *p = &c->func;
 369       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 370       func(c, tmp, arg0, arg1, arg2);
 371       brw_MOV(p, dst, tmp);
 372       release_tmp(c, tmp);
 373    }
 374    else {
 375       func(c, dst, arg0, arg1, arg2);
 376    }
 377 }
 378
 379 static void emit_sop( struct brw_vs_compile *c,
 380                       struct brw_reg dst,
 381                       struct brw_reg arg0,
 382                       struct brw_reg arg1,
 383                       GLuint cond)
 384 {
 385    struct brw_compile *p = &c->func;
 386
 387    brw_MOV(p, dst, brw_imm_f(0.0f));
 388    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 389    brw_MOV(p, dst, brw_imm_f(1.0f));
 390    brw_set_predicate_control_flag_value(p, 0xff);
 391 }
 392
 393 static void emit_seq( struct brw_vs_compile *c,
 394                       struct brw_reg dst,
 395                       struct brw_reg arg0,
 396                       struct brw_reg arg1 )
 397 {
 398    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 399 }
 400
 401 static void emit_sne( struct brw_vs_compile *c,
 402                       struct brw_reg dst,
 403                       struct brw_reg arg0,
 404                       struct brw_reg arg1 )
 405 {
 406    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 407 }
 408 static void emit_slt( struct brw_vs_compile *c,
 409                       struct brw_reg dst,
 410                       struct brw_reg arg0,
 411                       struct brw_reg arg1 )
 412 {
 413    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 414 }
 415
 416 static void emit_sle( struct brw_vs_compile *c,
 417                       struct brw_reg dst,
 418                       struct brw_reg arg0,
 419                       struct brw_reg arg1 )
 420 {
 421    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 422 }
 423
 424 static void emit_sgt( struct brw_vs_compile *c,
 425                       struct brw_reg dst,
 426                       struct brw_reg arg0,
 427                       struct brw_reg arg1 )
 428 {
 429    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 430 }
 431
 432 static void emit_sge( struct brw_vs_compile *c,
 433                       struct brw_reg dst,
 434                       struct brw_reg arg0,
 435                       struct brw_reg arg1 )
 436 {
 437   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 438 }
 439
 440 static void emit_cmp( struct brw_compile *p,
 441                       struct brw_reg dst,
 442                       struct brw_reg arg0,
 443                       struct brw_reg arg1,
 444                       struct brw_reg arg2 )
 445 {
 446    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 447    brw_SEL(p, dst, arg1, arg2);
 448    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 449 }
 450
 451 static void emit_max( struct brw_compile *p,
 452                       struct brw_reg dst,
 453                       struct brw_reg arg0,
 454                       struct brw_reg arg1 )
 455 {
 456    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 457    brw_SEL(p, dst, arg1, arg0);
 458    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 459 }
 460
 461 static void emit_min( struct brw_compile *p,
 462                       struct brw_reg dst,
 463                       struct brw_reg arg0,
 464                       struct brw_reg arg1 )
 465 {
 466    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 467    brw_SEL(p, dst, arg0, arg1);
 468    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 469 }
 470
 471
 472 static void emit_math1( struct brw_vs_compile *c,
 473                         GLuint function,
 474                         struct brw_reg dst,
 475                         struct brw_reg arg0,
 476                         GLuint precision)
 477 {
 478    /* There are various odd behaviours with SEND on the simulator.  In
 479     * addition there are documented issues with the fact that the GEN4
 480     * processor doesn't do dependency control properly on SEND
 481     * results.  So, on balance, this kludge to get around failures
 482     * with writemasked math results looks like it might be necessary
 483     * whether that turns out to be a simulator bug or not:
 484     */
 485    struct brw_compile *p = &c->func;
 486    struct intel_context *intel = &p->brw->intel;
 487    struct brw_reg tmp = dst;
 488    GLboolean need_tmp = (intel->gen < 6 &&
 489                          (dst.dw1.bits.writemask != 0xf ||
 490                           dst.file != BRW_GENERAL_REGISTER_FILE));
 491
 492    if (need_tmp)
 493       tmp = get_tmp(c);
 494
 495    brw_math(p,
 496             tmp,
 497             function,
 498             BRW_MATH_SATURATE_NONE,
 499             2,
 500             arg0,
 501             BRW_MATH_DATA_SCALAR,
 502             precision);
 503
 504    if (need_tmp) {
 505       brw_MOV(p, dst, tmp);
 506       release_tmp(c, tmp);
 507    }
 508 }
 509
 510
 511 static void emit_math2( struct brw_vs_compile *c,
 512                         GLuint function,
 513                         struct brw_reg dst,
 514                         struct brw_reg arg0,
 515                         struct brw_reg arg1,
 516                         GLuint precision)
 517 {
 518    struct brw_compile *p = &c->func;
 519    struct intel_context *intel = &p->brw->intel;
 520    struct brw_reg tmp = dst;
 521    GLboolean need_tmp = (intel->gen < 6 &&
 522                          (dst.dw1.bits.writemask != 0xf ||
 523                           dst.file != BRW_GENERAL_REGISTER_FILE));
 524
 525    if (need_tmp)
 526       tmp = get_tmp(c);
 527
 528    brw_MOV(p, brw_message_reg(3), arg1);
 529
 530    brw_math(p,
 531             tmp,
 532             function,
 533             BRW_MATH_SATURATE_NONE,
 534             2,
 535             arg0,
 536             BRW_MATH_DATA_SCALAR,
 537             precision);
 538
 539    if (need_tmp) {
 540       brw_MOV(p, dst, tmp);
 541       release_tmp(c, tmp);
 542    }
 543 }
 544
 545
 546 static void emit_exp_noalias( struct brw_vs_compile *c,
 547                               struct brw_reg dst,
 548                               struct brw_reg arg0 )
 549 {
 550    struct brw_compile *p = &c->func;
 551
 552
 553    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 554       struct brw_reg tmp = get_tmp(c);
 555       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 556
 557       /* tmp_d = floor(arg0.x) */
 558       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 559
 560       /* result[0] = 2.0 ^ tmp */
 561
 562       /* Adjust exponent for floating point:
 563        * exp += 127
 564        */
 565       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 566
 567       /* Install exponent and sign.
 568        * Excess drops off the edge:
 569        */
 570       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 571               tmp_d, brw_imm_d(23));
 572
 573       release_tmp(c, tmp);
 574    }
 575
 576    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 577       /* result[1] = arg0.x - floor(arg0.x) */
 578       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 579    }
 580
 581    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 582       /* As with the LOG instruction, we might be better off just
 583        * doing a taylor expansion here, seeing as we have to do all
 584        * the prep work.
 585        *
 586        * If mathbox partial precision is too low, consider also:
 587        * result[3] = result[0] * EXP(result[1])
 588        */
 589       emit_math1(c,
 590                  BRW_MATH_FUNCTION_EXP,
 591                  brw_writemask(dst, WRITEMASK_Z),
 592                  brw_swizzle1(arg0, 0),
 593                  BRW_MATH_PRECISION_FULL);
 594    }
 595
 596    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 597       /* result[3] = 1.0; */
 598       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 599    }
 600 }
 601
 602
 603 static void emit_log_noalias( struct brw_vs_compile *c,
 604                               struct brw_reg dst,
 605                               struct brw_reg arg0 )
 606 {
 607    struct brw_compile *p = &c->func;
 608    struct brw_reg tmp = dst;
 609    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 610    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 611    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 612                          dst.file != BRW_GENERAL_REGISTER_FILE);
 613
 614    if (need_tmp) {
 615       tmp = get_tmp(c);
 616       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 617    }
 618
 619    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 620     * according to spec:
 621     *
 622     * These almost look likey they could be joined up, but not really
 623     * practical:
 624     *
 625     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 626     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 627     */
 628    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 629       brw_AND(p,
 630               brw_writemask(tmp_ud, WRITEMASK_X),
 631               brw_swizzle1(arg0_ud, 0),
 632               brw_imm_ud((1U<<31)-1));
 633
 634       brw_SHR(p,
 635               brw_writemask(tmp_ud, WRITEMASK_X),
 636               tmp_ud,
 637               brw_imm_ud(23));
 638
 639       brw_ADD(p,
 640               brw_writemask(tmp, WRITEMASK_X),
 641               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 642               brw_imm_d(-127));
 643    }
 644
 645    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 646       brw_AND(p,
 647               brw_writemask(tmp_ud, WRITEMASK_Y),
 648               brw_swizzle1(arg0_ud, 0),
 649               brw_imm_ud((1<<23)-1));
 650
 651       brw_OR(p,
 652              brw_writemask(tmp_ud, WRITEMASK_Y),
 653              tmp_ud,
 654              brw_imm_ud(127<<23));
 655    }
 656
 657    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 658       /* result[2] = result[0] + LOG2(result[1]); */
 659
 660       /* Why bother?  The above is just a hint how to do this with a
 661        * taylor series.  Maybe we *should* use a taylor series as by
 662        * the time all the above has been done it's almost certainly
 663        * quicker than calling the mathbox, even with low precision.
 664        *
 665        * Options are:
 666        *    - result[0] + mathbox.LOG2(result[1])
 667        *    - mathbox.LOG2(arg0.x)
 668        *    - result[0] + inline_taylor_approx(result[1])
 669        */
 670       emit_math1(c,
 671                  BRW_MATH_FUNCTION_LOG,
 672                  brw_writemask(tmp, WRITEMASK_Z),
 673                  brw_swizzle1(tmp, 1),
 674                  BRW_MATH_PRECISION_FULL);
 675
 676       brw_ADD(p,
 677               brw_writemask(tmp, WRITEMASK_Z),
 678               brw_swizzle1(tmp, 2),
 679               brw_swizzle1(tmp, 0));
 680    }
 681
 682    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 683       /* result[3] = 1.0; */
 684       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 685    }
 686
 687    if (need_tmp) {
 688       brw_MOV(p, dst, tmp);
 689       release_tmp(c, tmp);
 690    }
 691 }
 692
 693
 694 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 695  */
 696 static void emit_dst_noalias( struct brw_vs_compile *c,
 697                               struct brw_reg dst,
 698                               struct brw_reg arg0,
 699                               struct brw_reg arg1)
 700 {
 701    struct brw_compile *p = &c->func;
 702
 703    /* There must be a better way to do this:
 704     */
 705    if (dst.dw1.bits.writemask & WRITEMASK_X)
 706       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 707    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 708       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 709    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 710       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 711    if (dst.dw1.bits.writemask & WRITEMASK_W)
 712       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 713 }
 714
 715
 716 static void emit_xpd( struct brw_compile *p,
 717                       struct brw_reg dst,
 718                       struct brw_reg t,
 719                       struct brw_reg u)
 720 {
 721    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 722    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 723 }
 724
 725
 726 static void emit_lit_noalias( struct brw_vs_compile *c,
 727                               struct brw_reg dst,
 728                               struct brw_reg arg0 )
 729 {
 730    struct brw_compile *p = &c->func;
 731    struct brw_instruction *if_insn;
 732    struct brw_reg tmp = dst;
 733    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 734
 735    if (need_tmp)
 736       tmp = get_tmp(c);
 737
 738    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 739    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 740
 741    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 742     * to get all channels active inside the IF.  In the clipping code
 743     * we run with NoMask, so it's not an option and we can use
 744     * BRW_EXECUTE_1 for all comparisions.
 745     */
 746    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 747    if_insn = brw_IF(p, BRW_EXECUTE_8);
 748    {
 749       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 750
 751       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 752       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 753       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 754
 755       emit_math2(c,
 756                  BRW_MATH_FUNCTION_POW,
 757                  brw_writemask(dst, WRITEMASK_Z),
 758                  brw_swizzle1(tmp, 2),
 759                  brw_swizzle1(arg0, 3),
 760                  BRW_MATH_PRECISION_PARTIAL);
 761    }
 762
 763    brw_ENDIF(p, if_insn);
 764
 765    release_tmp(c, tmp);
 766 }
 767
 768 static void emit_lrp_noalias(struct brw_vs_compile *c,
 769                              struct brw_reg dst,
 770                              struct brw_reg arg0,
 771                              struct brw_reg arg1,
 772                              struct brw_reg arg2)
 773 {
 774    struct brw_compile *p = &c->func;
 775
 776    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 777    brw_MUL(p, brw_null_reg(), dst, arg2);
 778    brw_MAC(p, dst, arg0, arg1);
 779 }
 780
 781 /** 3 or 4-component vector normalization */
 782 static void emit_nrm( struct brw_vs_compile *c,
 783                       struct brw_reg dst,
 784                       struct brw_reg arg0,
 785                       int num_comps)
 786 {
 787    struct brw_compile *p = &c->func;
 788    struct brw_reg tmp = get_tmp(c);
 789
 790    /* tmp = dot(arg0, arg0) */
 791    if (num_comps == 3)
 792       brw_DP3(p, tmp, arg0, arg0);
 793    else
 794       brw_DP4(p, tmp, arg0, arg0);
 795
 796    /* tmp = 1 / sqrt(tmp) */
 797    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 798
 799    /* dst = arg0 * tmp */
 800    brw_MUL(p, dst, arg0, tmp);
 801
 802    release_tmp(c, tmp);
 803 }
 804
 805
 806 static struct brw_reg
 807 get_constant(struct brw_vs_compile *c,
 808              const struct prog_instruction *inst,
 809              GLuint argIndex)
 810 {
 811    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 812    struct brw_compile *p = &c->func;
 813    struct brw_reg const_reg = c->current_const[argIndex].reg;
 814
 815    assert(argIndex < 3);
 816
 817    if (c->current_const[argIndex].index != src->Index) {
 818       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 819
 820       /* Keep track of the last constant loaded in this slot, for reuse. */
 821       c->current_const[argIndex].index = src->Index;
 822
 823 #if 0
 824       printf("  fetch const[%d] for arg %d into reg %d\n",
 825              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 826 #endif
 827       /* need to fetch the constant now */
 828       brw_dp_READ_4_vs(p,
 829                        const_reg,                     /* writeback dest */
 830                        0,                             /* oword */
 831                        0,                             /* relative indexing? */
 832                        addrReg,                       /* address register */
 833                        16 * src->Index,               /* byte offset */
 834                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 835                        );
 836    }
 837
 838    /* replicate lower four floats into upper half (to get XYZWXYZW) */
 839    const_reg = stride(const_reg, 0, 4, 0);
 840    const_reg.subnr = 0;
 841
 842    return const_reg;
 843 }
 844
 845 static struct brw_reg
 846 get_reladdr_constant(struct brw_vs_compile *c,
 847                      const struct prog_instruction *inst,
 848                      GLuint argIndex)
 849 {
 850    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 851    struct brw_compile *p = &c->func;
 852    struct brw_reg const_reg = c->current_const[argIndex].reg;
 853    struct brw_reg const2_reg;
 854    struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 855
 856    assert(argIndex < 3);
 857
 858    /* Can't reuse a reladdr constant load. */
 859    c->current_const[argIndex].index = -1;
 860
 861  #if 0
 862    printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
 863           src->Index, argIndex, c->current_const[argIndex].reg.nr);
 864 #endif
 865
 866    /* fetch the first vec4 */
 867    brw_dp_READ_4_vs(p,
 868                     const_reg,                     /* writeback dest */
 869                     0,                             /* oword */
 870                     1,                             /* relative indexing? */
 871                     addrReg,                       /* address register */
 872                     16 * src->Index,               /* byte offset */
 873                     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 874                     );
 875    /* second vec4 */
 876    const2_reg = get_tmp(c);
 877
 878    /* use upper half of address reg for second read */
 879    addrReg = stride(addrReg, 0, 4, 0);
 880    addrReg.subnr = 16;
 881
 882    brw_dp_READ_4_vs(p,
 883                     const2_reg,              /* writeback dest */
 884                     1,                       /* oword */
 885                     1,                       /* relative indexing? */
 886                     addrReg,                 /* address register */
 887                     16 * src->Index,         /* byte offset */
 888                     SURF_INDEX_VERT_CONST_BUFFER
 889                     );
 890
 891    /* merge the two Owords into the constant register */
 892    /* const_reg[7..4] = const2_reg[7..4] */
 893    brw_MOV(p,
 894            suboffset(stride(const_reg, 0, 4, 1), 4),
 895            suboffset(stride(const2_reg, 0, 4, 1), 4));
 896    release_tmp(c, const2_reg);
 897
 898    return const_reg;
 899 }
 900
 901
 902
 903 /* TODO: relative addressing!
 904  */
 905 static struct brw_reg get_reg( struct brw_vs_compile *c,
 906                                gl_register_file file,
 907                                GLuint index )
 908 {
 909    switch (file) {
 910    case PROGRAM_TEMPORARY:
 911    case PROGRAM_INPUT:
 912    case PROGRAM_OUTPUT:
 913       assert(c->regs[file][index].nr != 0);
 914       return c->regs[file][index];
 915    case PROGRAM_STATE_VAR:
 916    case PROGRAM_CONSTANT:
 917    case PROGRAM_UNIFORM:
 918       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 919       return c->regs[PROGRAM_STATE_VAR][index];
 920    case PROGRAM_ADDRESS:
 921       assert(index == 0);
 922       return c->regs[file][index];
 923
 924    case PROGRAM_UNDEFINED:                      /* undef values */
 925       return brw_null_reg();
 926
 927    case PROGRAM_LOCAL_PARAM:
 928    case PROGRAM_ENV_PARAM:
 929    case PROGRAM_WRITE_ONLY:
 930    default:
 931       assert(0);
 932       return brw_null_reg();
 933    }
 934 }
 935
 936
 937 /**
 938  * Indirect addressing:  get reg[[arg] + offset].
 939  */
 940 static struct brw_reg deref( struct brw_vs_compile *c,
 941                              struct brw_reg arg,
 942                              GLint offset)
 943 {
 944    struct brw_compile *p = &c->func;
 945    struct brw_reg tmp = vec4(get_tmp(c));
 946    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 947    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 948    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 949    struct brw_reg indirect = brw_vec4_indirect(0,0);
 950
 951    {
 952       brw_push_insn_state(p);
 953       brw_set_access_mode(p, BRW_ALIGN_1);
 954
 955       /* This is pretty clunky - load the address register twice and
 956        * fetch each 4-dword value in turn.  There must be a way to do
 957        * this in a single pass, but I couldn't get it to work.
 958        */
 959       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 960       brw_MOV(p, tmp, indirect);
 961
 962       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 963       brw_MOV(p, suboffset(tmp, 4), indirect);
 964
 965       brw_pop_insn_state(p);
 966    }
 967
 968    /* NOTE: tmp not released */
 969    return vec8(tmp);
 970 }
 971
 972
 973 /**
 974  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 975  * TODO: relative addressing!
 976  */
 977 static struct brw_reg
 978 get_src_reg( struct brw_vs_compile *c,
 979              const struct prog_instruction *inst,
 980              GLuint argIndex )
 981 {
 982    const GLuint file = inst->SrcReg[argIndex].File;
 983    const GLint index = inst->SrcReg[argIndex].Index;
 984    const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
 985
 986    switch (file) {
 987    case PROGRAM_TEMPORARY:
 988    case PROGRAM_INPUT:
 989    case PROGRAM_OUTPUT:
 990       if (relAddr) {
 991          return deref(c, c->regs[file][0], index);
 992       }
 993       else {
 994          assert(c->regs[file][index].nr != 0);
 995          return c->regs[file][index];
 996       }
 997
 998    case PROGRAM_STATE_VAR:
 999    case PROGRAM_CONSTANT:
1000    case PROGRAM_UNIFORM:
1001    case PROGRAM_ENV_PARAM:
1002    case PROGRAM_LOCAL_PARAM:
1003       if (c->vp->use_const_buffer) {
1004          if (!relAddr && c->constant_map[index] != -1) {
1005             assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1006             return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1007          } else if (relAddr)
1008             return get_reladdr_constant(c, inst, argIndex);
1009          else
1010             return get_constant(c, inst, argIndex);
1011       }
1012       else if (relAddr) {
1013          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
1014       }
1015       else {
1016          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1017          return c->regs[PROGRAM_STATE_VAR][index];
1018       }
1019    case PROGRAM_ADDRESS:
1020       assert(index == 0);
1021       return c->regs[file][index];
1022
1023    case PROGRAM_UNDEFINED:
1024       /* this is a normal case since we loop over all three src args */
1025       return brw_null_reg();
1026
1027    case PROGRAM_WRITE_ONLY:
1028    default:
1029       assert(0);
1030       return brw_null_reg();
1031    }
1032 }
1033
1034
1035 static void emit_arl( struct brw_vs_compile *c,
1036                       struct brw_reg dst,
1037                       struct brw_reg arg0 )
1038 {
1039    struct brw_compile *p = &c->func;
1040    struct brw_reg tmp = dst;
1041    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1042
1043    if (need_tmp)
1044       tmp = get_tmp(c);
1045
1046    brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
1047    brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
1048
1049    if (need_tmp)
1050       release_tmp(c, tmp);
1051 }
1052
1053
1054 /**
1055  * Return the brw reg for the given instruction's src argument.
1056  * Will return mangled results for SWZ op.  The emit_swz() function
1057  * ignores this result and recalculates taking extended swizzles into
1058  * account.
1059  */
1060 static struct brw_reg get_arg( struct brw_vs_compile *c,
1061                                const struct prog_instruction *inst,
1062                                GLuint argIndex )
1063 {
1064    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1065    struct brw_reg reg;
1066
1067    if (src->File == PROGRAM_UNDEFINED)
1068       return brw_null_reg();
1069
1070    reg = get_src_reg(c, inst, argIndex);
1071
1072    /* Convert 3-bit swizzle to 2-bit.
1073     */
1074    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1075                                        GET_SWZ(src->Swizzle, 1),
1076                                        GET_SWZ(src->Swizzle, 2),
1077                                        GET_SWZ(src->Swizzle, 3));
1078
1079    /* Note this is ok for non-swizzle instructions:
1080     */
1081    reg.negate = src->Negate ? 1 : 0;
1082
1083    return reg;
1084 }
1085
1086
1087 /**
1088  * Get brw register for the given program dest register.
1089  */
1090 static struct brw_reg get_dst( struct brw_vs_compile *c,
1091                                struct prog_dst_register dst )
1092 {
1093    struct brw_reg reg;
1094
1095    switch (dst.File) {
1096    case PROGRAM_TEMPORARY:
1097    case PROGRAM_OUTPUT:
1098       assert(c->regs[dst.File][dst.Index].nr != 0);
1099       reg = c->regs[dst.File][dst.Index];
1100       break;
1101    case PROGRAM_ADDRESS:
1102       assert(dst.Index == 0);
1103       reg = c->regs[dst.File][dst.Index];
1104       break;
1105    case PROGRAM_UNDEFINED:
1106       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1107       reg = brw_null_reg();
1108       break;
1109    default:
1110       assert(0);
1111       reg = brw_null_reg();
1112    }
1113
1114    reg.dw1.bits.writemask = dst.WriteMask;
1115
1116    return reg;
1117 }
1118
1119
1120 static void emit_swz( struct brw_vs_compile *c,
1121                       struct brw_reg dst,
1122                       const struct prog_instruction *inst)
1123 {
1124    const GLuint argIndex = 0;
1125    const struct prog_src_register src = inst->SrcReg[argIndex];
1126    struct brw_compile *p = &c->func;
1127    GLuint zeros_mask = 0;
1128    GLuint ones_mask = 0;
1129    GLuint src_mask = 0;
1130    GLubyte src_swz[4];
1131    GLboolean need_tmp = (src.Negate &&
1132                          dst.file != BRW_GENERAL_REGISTER_FILE);
1133    struct brw_reg tmp = dst;
1134    GLuint i;
1135
1136    if (need_tmp)
1137       tmp = get_tmp(c);
1138
1139    for (i = 0; i < 4; i++) {
1140       if (dst.dw1.bits.writemask & (1<<i)) {
1141          GLubyte s = GET_SWZ(src.Swizzle, i);
1142          switch (s) {
1143          case SWIZZLE_X:
1144          case SWIZZLE_Y:
1145          case SWIZZLE_Z:
1146          case SWIZZLE_W:
1147             src_mask |= 1<<i;
1148             src_swz[i] = s;
1149             break;
1150          case SWIZZLE_ZERO:
1151             zeros_mask |= 1<<i;
1152             break;
1153          case SWIZZLE_ONE:
1154             ones_mask |= 1<<i;
1155             break;
1156          }
1157       }
1158    }
1159
1160    /* Do src first, in case dst aliases src:
1161     */
1162    if (src_mask) {
1163       struct brw_reg arg0;
1164
1165       arg0 = get_src_reg(c, inst, argIndex);
1166
1167       arg0 = brw_swizzle(arg0,
1168                          src_swz[0], src_swz[1],
1169                          src_swz[2], src_swz[3]);
1170
1171       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1172    }
1173
1174    if (zeros_mask)
1175       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1176
1177    if (ones_mask)
1178       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1179
1180    if (src.Negate)
1181       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1182
1183    if (need_tmp) {
1184       brw_MOV(p, dst, tmp);
1185       release_tmp(c, tmp);
1186    }
1187 }
1188
1189
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Steps, in emission order:
 *  - optionally copy the edge flag input to the edge flag output;
 *  - on gen < 6, compute NDC coordinates from the HPOS output;
 *  - build the header dword block in m1 (point size, user clip flags and
 *    the negative-rhw workaround bit), or zero it when none apply;
 *  - fill in the generation-specific position slots of the vertex header;
 *  - issue the URB write, plus a second one if some outputs overflowed
 *    the message registers.
 *
 * \param c  vertex shader compile state
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   /* NOTE(review): ndc is only assigned when gen < 6, and the temporary
    * it takes is not released in this function.
    */
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   /* Pass the edge flag attribute straight through to the output. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      /* Start from an all-zero header; everything below lands in .w */
      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         /* header1.w = (psiz.x * 2^11) masked to the 11 bits at 8..18 */
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One flag bit per user clip plane: dot the position with the
       * plane under a "less than" conditional modifier, then OR in bit i.
       * Presumably the OR is predicated on the DP4's flag result (the
       * predicate is explicitly reset afterwards) - confirm against the
       * brw_eu emit helpers.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      /* NOTE(review): this reads ndc, which is only set up for gen < 6;
       * presumably has_negative_rhw_bug is never set on gen >= 6 - confirm.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* Copy the finished header into m1 (done in align1 mode). */
      brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* Nothing to report in the header: m1 is simply zeroed. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   if (intel->gen >= 6) {
      /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
       * m5 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), pos);
      brw_MOV(p, offset(m0, 5), pos);
      len_vertex_header = 4;
   } else if (intel->is_ironlake) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      brw_MOV(p, offset(m0, 7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      len_vertex_header = 2;
   }

   /* The first write ends the thread only if nothing overflowed. */
   eot = (c->first_overflow_output == 0);

   /* Message length is clamped to BRW_MAX_MRF-1 registers. */
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,             /* starting mrf reg nr */
                 c->r0,         /* src */
                 0,             /* allocate */
                 1,             /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,             /* response len */
                 eot,           /* eot */
                 eot,           /* writes complete */
                 0,             /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      /* Second and final write: always marks end-of-thread.  The msg len
       * of mrf+1 presumably accounts for one header register in addition
       * to the copied outputs - confirm.
       */
      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1363
1364
1365 /**
1366  * Called after code generation to resolve subroutine calls and the
1367  * END instruction.
1368  * \param end_inst  points to brw code for END instruction
1369  * \param last_inst  points to last instruction emitted before vertex write
1370  */
1371 static void
1372 post_vs_emit( struct brw_vs_compile *c,
1373               struct brw_instruction *end_inst,
1374               struct brw_instruction *last_inst )
1375 {
1376    GLint offset;
1377
1378    brw_resolve_cals(&c->func);
1379
1380    /* patch up the END code to jump past subroutines, etc */
1381    offset = last_inst - end_inst;
1382    if (offset > 1) {
1383       brw_set_src1(end_inst, brw_imm_d(offset * 16));
1384    } else {
1385       end_inst->header.opcode = BRW_OPCODE_NOP;
1386    }
1387 }
1388
1389 static GLboolean
1390 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1391 {
1392    struct brw_compile *p = &c->func;
1393    struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1394
1395    if (p->nr_insn == 0)
1396       return GL_FALSE;
1397
1398    if (val.address_mode != BRW_ADDRESS_DIRECT)
1399       return GL_FALSE;
1400
1401    switch (prev_insn->header.opcode) {
1402    case BRW_OPCODE_MOV:
1403    case BRW_OPCODE_MAC:
1404    case BRW_OPCODE_MUL:
1405       if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1406           prev_insn->header.execution_size == val.width &&
1407           prev_insn->bits1.da1.dest_reg_file == val.file &&
1408           prev_insn->bits1.da1.dest_reg_type == val.type &&
1409           prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1410           prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1411           prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1412           prev_insn->bits1.da16.dest_writemask == 0xf)
1413          return GL_TRUE;
1414       else
1415          return GL_FALSE;
1416    default:
1417       return GL_FALSE;
1418    }
1419 }
1420
1421 static uint32_t
1422 get_predicate(const struct prog_instruction *inst)
1423 {
1424    if (inst->DstReg.CondMask == COND_TR)
1425       return BRW_PREDICATE_NONE;
1426
1427    /* All of GLSL only produces predicates for COND_NE and one channel per
1428     * vector.  Fail badly if someone starts doing something else, as it might
1429     * mean infinite looping or something.
1430     *
1431     * We'd like to support all the condition codes, but our hardware doesn't
1432     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1433     * those, the instruction may update the condition codes or not, then any
1434     * later instruction may use one of those condition codes.  For gen4, the
1435     * instruction may update the flags register based on one of the condition
1436     * codes output by the instruction, and then further instructions may
1437     * predicate on that.  We can probably support this, but it won't
1438     * necessarily be easy.
1439     */
1440    assert(inst->DstReg.CondMask == COND_NE);
1441
1442    switch (inst->DstReg.CondSwizzle) {
1443    case SWIZZLE_XXXX:
1444       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1445    case SWIZZLE_YYYY:
1446       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1447    case SWIZZLE_ZZZZ:
1448       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1449    case SWIZZLE_WWWW:
1450       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1451    default:
1452       _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1453                     inst->DstReg.CondMask);
1454       return BRW_PREDICATE_NORMAL;
1455    }
1456 }
1457
1458 /* Emit the vertex program instructions here.
1459  */
1460 void brw_vs_emit(struct brw_vs_compile *c )
1461 {
1462 #define MAX_IF_DEPTH 32
1463 #define MAX_LOOP_DEPTH 32
1464    struct brw_compile *p = &c->func;
1465    struct brw_context *brw = p->brw;
1466    struct intel_context *intel = &brw->intel;
1467    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1468    GLuint insn, if_depth = 0, loop_depth = 0;
1469    GLuint end_offset = 0;
1470    struct brw_instruction *end_inst, *last_inst;
1471    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1472    const struct brw_indirect stack_index = brw_indirect(0, 0);
1473    GLuint index;
1474    GLuint file;
1475
1476    if (INTEL_DEBUG & DEBUG_VS) {
1477       printf("vs-mesa:\n");
1478       _mesa_print_program(&c->vp->program.Base);
1479       printf("\n");
1480    }
1481
1482    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1483    brw_set_access_mode(p, BRW_ALIGN_16);
1484
1485    for (insn = 0; insn < nr_insns; insn++) {
1486        GLuint i;
1487        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1488
1489        /* Message registers can't be read, so copy the output into GRF
1490         * register if they are used in source registers
1491         */
1492        for (i = 0; i < 3; i++) {
1493            struct prog_src_register *src = &inst->SrcReg[i];
1494            GLuint index = src->Index;
1495            GLuint file = src->File;
1496            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1497                c->output_regs[index].used_in_src = GL_TRUE;
1498        }
1499
1500        switch (inst->Opcode) {
1501        case OPCODE_CAL:
1502        case OPCODE_RET:
1503           c->needs_stack = GL_TRUE;
1504           break;
1505        default:
1506           break;
1507        }
1508    }
1509
1510    /* Static register allocation
1511     */
1512    brw_vs_alloc_regs(c);
1513
1514    if (c->needs_stack)
1515       brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1516
1517    for (insn = 0; insn < nr_insns; insn++) {
1518
1519       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1520       struct brw_reg args[3], dst;
1521       GLuint i;
1522
1523 #if 0
1524       printf("%d: ", insn);
1525       _mesa_print_instruction(inst);
1526 #endif
1527
1528       /* Get argument regs.  SWZ is special and does this itself.
1529        */
1530       if (inst->Opcode != OPCODE_SWZ)
1531           for (i = 0; i < 3; i++) {
1532               const struct prog_src_register *src = &inst->SrcReg[i];
1533               index = src->Index;
1534               file = src->File;
1535               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1536                   args[i] = c->output_regs[index].reg;
1537               else
1538                   args[i] = get_arg(c, inst, i);
1539           }
1540
1541       /* Get dest regs.  Note that it is possible for a reg to be both
1542        * dst and arg, given the static allocation of registers.  So
1543        * care needs to be taken emitting multi-operation instructions.
1544        */
1545       index = inst->DstReg.Index;
1546       file = inst->DstReg.File;
1547       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1548           dst = c->output_regs[index].reg;
1549       else
1550           dst = get_dst(c, inst->DstReg);
1551
1552       if (inst->SaturateMode != SATURATE_OFF) {
1553          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1554                        inst->SaturateMode);
1555       }
1556
1557       switch (inst->Opcode) {
1558       case OPCODE_ABS:
1559          brw_MOV(p, dst, brw_abs(args[0]));
1560          break;
1561       case OPCODE_ADD:
1562          brw_ADD(p, dst, args[0], args[1]);
1563          break;
1564       case OPCODE_COS:
1565          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1566          break;
1567       case OPCODE_DP3:
1568          brw_DP3(p, dst, args[0], args[1]);
1569          break;
1570       case OPCODE_DP4:
1571          brw_DP4(p, dst, args[0], args[1]);
1572          break;
1573       case OPCODE_DPH:
1574          brw_DPH(p, dst, args[0], args[1]);
1575          break;
1576       case OPCODE_NRM3:
1577          emit_nrm(c, dst, args[0], 3);
1578          break;
1579       case OPCODE_NRM4:
1580          emit_nrm(c, dst, args[0], 4);
1581          break;
1582       case OPCODE_DST:
1583          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1584          break;
1585       case OPCODE_EXP:
1586          unalias1(c, dst, args[0], emit_exp_noalias);
1587          break;
1588       case OPCODE_EX2:
1589          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1590          break;
1591       case OPCODE_ARL:
1592          emit_arl(c, dst, args[0]);
1593          break;
1594       case OPCODE_FLR:
1595          brw_RNDD(p, dst, args[0]);
1596          break;
1597       case OPCODE_FRC:
1598          brw_FRC(p, dst, args[0]);
1599          break;
1600       case OPCODE_LOG:
1601          unalias1(c, dst, args[0], emit_log_noalias);
1602          break;
1603       case OPCODE_LG2:
1604          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1605          break;
1606       case OPCODE_LIT:
1607          unalias1(c, dst, args[0], emit_lit_noalias);
1608          break;
1609       case OPCODE_LRP:
1610          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1611          break;
1612       case OPCODE_MAD:
1613          if (!accumulator_contains(c, args[2]))
1614             brw_MOV(p, brw_acc_reg(), args[2]);
1615          brw_MAC(p, dst, args[0], args[1]);
1616          break;
1617       case OPCODE_CMP:
1618          emit_cmp(p, dst, args[0], args[1], args[2]);
1619          break;
1620       case OPCODE_MAX:
1621          emit_max(p, dst, args[0], args[1]);
1622          break;
1623       case OPCODE_MIN:
1624          emit_min(p, dst, args[0], args[1]);
1625          break;
1626       case OPCODE_MOV:
1627          brw_MOV(p, dst, args[0]);
1628          break;
1629       case OPCODE_MUL:
1630          brw_MUL(p, dst, args[0], args[1]);
1631          break;
1632       case OPCODE_POW:
1633          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1634          break;
1635       case OPCODE_RCP:
1636          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1637          break;
1638       case OPCODE_RSQ:
1639          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1640          break;
1641
1642       case OPCODE_SEQ:
1643          unalias2(c, dst, args[0], args[1], emit_seq);
1644          break;
1645       case OPCODE_SIN:
1646          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1647          break;
1648       case OPCODE_SNE:
1649          unalias2(c, dst, args[0], args[1], emit_sne);
1650          break;
1651       case OPCODE_SGE:
1652          unalias2(c, dst, args[0], args[1], emit_sge);
1653          break;
1654       case OPCODE_SGT:
1655          unalias2(c, dst, args[0], args[1], emit_sgt);
1656          break;
1657       case OPCODE_SLT:
1658          unalias2(c, dst, args[0], args[1], emit_slt);
1659          break;
1660       case OPCODE_SLE:
1661          unalias2(c, dst, args[0], args[1], emit_sle);
1662          break;
1663       case OPCODE_SUB:
1664          brw_ADD(p, dst, args[0], negate(args[1]));
1665          break;
1666       case OPCODE_SWZ:
1667          /* The args[0] value can't be used here as it won't have
1668           * correctly encoded the full swizzle:
1669           */
1670          emit_swz(c, dst, inst);
1671          break;
1672       case OPCODE_TRUNC:
1673          /* round toward zero */
1674          brw_RNDZ(p, dst, args[0]);
1675          break;
1676       case OPCODE_XPD:
1677          emit_xpd(p, dst, args[0], args[1]);
1678          break;
1679       case OPCODE_IF:
1680          assert(if_depth < MAX_IF_DEPTH);
1681          if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1682          /* Note that brw_IF smashes the predicate_control field. */
1683          if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1684          if_depth++;
1685          break;
1686       case OPCODE_ELSE:
1687          if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1688          break;
1689       case OPCODE_ENDIF:
1690          assert(if_depth > 0);
1691          brw_ENDIF(p, if_inst[--if_depth]);
1692          break;
1693       case OPCODE_BGNLOOP:
1694          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1695          break;
1696       case OPCODE_BRK:
1697          brw_set_predicate_control(p, get_predicate(inst));
1698          brw_BREAK(p);
1699          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1700          break;
1701       case OPCODE_CONT:
1702          brw_set_predicate_control(p, get_predicate(inst));
1703          brw_CONT(p);
1704          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1705          break;
1706       case OPCODE_ENDLOOP:
1707          {
1708             struct brw_instruction *inst0, *inst1;
1709             GLuint br = 1;
1710
1711             loop_depth--;
1712
1713             if (intel->is_ironlake)
1714                br = 2;
1715
1716             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1717             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1718             while (inst0 > loop_inst[loop_depth]) {
1719                inst0--;
1720                if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1721                    inst0->bits3.if_else.jump_count == 0) {
1722                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1723                   inst0->bits3.if_else.pop_count = 0;
1724                }
1725                else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1726                         inst0->bits3.if_else.jump_count == 0) {
1727                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1728                   inst0->bits3.if_else.pop_count = 0;
1729                }
1730             }
1731          }
1732          break;
1733       case OPCODE_BRA:
1734          brw_set_predicate_control(p, get_predicate(inst));
1735          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1736          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1737          break;
1738       case OPCODE_CAL:
1739          brw_set_access_mode(p, BRW_ALIGN_1);
1740          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1741          brw_set_access_mode(p, BRW_ALIGN_16);
1742          brw_ADD(p, get_addr_reg(stack_index),
1743                          get_addr_reg(stack_index), brw_imm_d(4));
1744          brw_save_call(p, inst->Comment, p->nr_insn);
1745          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1746          break;
1747       case OPCODE_RET:
1748          brw_ADD(p, get_addr_reg(stack_index),
1749                          get_addr_reg(stack_index), brw_imm_d(-4));
1750          brw_set_access_mode(p, BRW_ALIGN_1);
1751          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1752          brw_set_access_mode(p, BRW_ALIGN_16);
1753          break;
1754       case OPCODE_END:
1755          end_offset = p->nr_insn;
1756          /* this instruction will get patched later to jump past subroutine
1757           * code, etc.
1758           */
1759          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1760          break;
1761       case OPCODE_PRINT:
1762          /* no-op */
1763          break;
1764       case OPCODE_BGNSUB:
1765          brw_save_label(p, inst->Comment, p->nr_insn);
1766          break;
1767       case OPCODE_ENDSUB:
1768          /* no-op */
1769          break;
1770       default:
1771          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1772                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1773                                     _mesa_opcode_string(inst->Opcode) :
1774                                     "unknown");
1775       }
1776
1777       /* Set the predication update on the last instruction of the native
1778        * instruction sequence.
1779        *
1780        * This would be problematic if it was set on a math instruction,
1781        * but that shouldn't be the case with the current GLSL compiler.
1782        */
1783       if (inst->CondUpdate) {
1784          struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1785
1786          assert(hw_insn->header.destreg__conditionalmod == 0);
1787          hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1788       }
1789
1790       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1791           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1792           && c->output_regs[inst->DstReg.Index].used_in_src) {
1793          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1794       }
1795
1796       /* Result color clamping.
1797        *
1798        * When the destination register is an output register and
1799        * it is a primary/secondary front/back color, we have to clamp
1800        * the result to [0,1]. This is done by enabling the
1801        * saturation bit for the last instruction.
1802        *
1803        * We don't use brw_set_saturate() as it modifies
1804        * p->current->header.saturate, which affects all the subsequent
1805        * instructions. Instead, we directly modify the header
1806        * of the last (already stored) instruction.
1807        */
1808       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1809          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1810              || (inst->DstReg.Index == VERT_RESULT_COL1)
1811              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1812              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1813             p->store[p->nr_insn-1].header.saturate = 1;
1814          }
1815       }
1816
1817       release_tmps(c);
1818    }
1819
1820    end_inst = &p->store[end_offset];
1821    last_inst = &p->store[p->nr_insn];
1822
1823    /* The END instruction will be patched to jump to this code */
1824    emit_vertex_write(c);
1825
1826    post_vs_emit(c, end_inst, last_inst);
1827
1828    if (INTEL_DEBUG & DEBUG_VS) {
1829       int i;
1830
1831       printf("vs-native:\n");
1832       for (i = 0; i < p->nr_insn; i++)
1833          brw_disasm(stderr, &p->store[i]);
1834       printf("\n");
1835    }
1836 }