Merge remote branch 'origin/master' into gallium_draw_llvm
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
/**
 * Release all scratch temporaries at once by resetting the allocator
 * back to the first scratch register (see brw_vs_alloc_regs()).
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
61
62
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order: r0 (fixed), curbe data (user clip planes and pushed
 * constants), vertex attribute inputs, outputs that overflow the MRFs,
 * program temporaries, address register(s), constant-buffer staging
 * registers, output shadow registers, the call stack, and finally the
 * scratch area managed by get_tmp()/release_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two vec4 planes per GRF, starting 3 registers in. */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* Budget for how many constants may still be pushed through the
       * curbe; anything beyond this is pulled at run time.
       */
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	    /* Skip non-constant register files and any reladdr access;
	     * those can only be satisfied by the pull path.
	     */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two vec4 constants fit per GRF. */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
							      (i%2) * 4),
						 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for outputs; the generation-dependent skip of
    * lower MRF numbers presumably leaves room for the URB write header
    * -- confirm against emit_vertex_write().
    */
   if (intel->gen >= 6)
      mrf = 6;
   else if (intel->is_ironlake)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    /* Position stays in a GRF (not an MRF). */
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    if (mrf < 16) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       /* too many vertex results to fit in MRF, use GRF for overflow */
	       if (!c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
					    reg,
					    0,
					    BRW_REGISTER_TYPE_D,
					    BRW_VERTICAL_STRIDE_8,
					    BRW_WIDTH_8,
					    BRW_HORIZONTAL_STRIDE_1,
					    BRW_SWIZZLE_XXXX,
					    WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One staging register per possible source argument slot for
       * pulled constants (see get_constant()/get_reladdr_constant()).
       */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Shadow GRFs for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Two GRFs for the subroutine-call return stack. */
   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* Entry size is in units of 8 regs on gen6 and 4 regs earlier; the
    * added constant presumably accounts for the per-generation VUE
    * header -- confirm against the URB docs.
    */
   if (intel->gen >= 6)
      c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8;
   else if (intel->is_ironlake)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
299
300
301 /**
302 * If an instruction uses a temp reg both as a src and the dest, we
303 * sometimes need to allocate an intermediate temporary.
304 */
305 static void unalias1( struct brw_vs_compile *c,
306 struct brw_reg dst,
307 struct brw_reg arg0,
308 void (*func)( struct brw_vs_compile *,
309 struct brw_reg,
310 struct brw_reg ))
311 {
312 if (dst.file == arg0.file && dst.nr == arg0.nr) {
313 struct brw_compile *p = &c->func;
314 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
315 func(c, tmp, arg0);
316 brw_MOV(p, dst, tmp);
317 release_tmp(c, tmp);
318 }
319 else {
320 func(c, dst, arg0);
321 }
322 }
323
324 /**
325 * \sa unalias2
326 * Checkes if 2-operand instruction needs an intermediate temporary.
327 */
328 static void unalias2( struct brw_vs_compile *c,
329 struct brw_reg dst,
330 struct brw_reg arg0,
331 struct brw_reg arg1,
332 void (*func)( struct brw_vs_compile *,
333 struct brw_reg,
334 struct brw_reg,
335 struct brw_reg ))
336 {
337 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
338 (dst.file == arg1.file && dst.nr == arg1.nr)) {
339 struct brw_compile *p = &c->func;
340 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
341 func(c, tmp, arg0, arg1);
342 brw_MOV(p, dst, tmp);
343 release_tmp(c, tmp);
344 }
345 else {
346 func(c, dst, arg0, arg1);
347 }
348 }
349
350 /**
351 * \sa unalias2
352 * Checkes if 3-operand instruction needs an intermediate temporary.
353 */
354 static void unalias3( struct brw_vs_compile *c,
355 struct brw_reg dst,
356 struct brw_reg arg0,
357 struct brw_reg arg1,
358 struct brw_reg arg2,
359 void (*func)( struct brw_vs_compile *,
360 struct brw_reg,
361 struct brw_reg,
362 struct brw_reg,
363 struct brw_reg ))
364 {
365 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
366 (dst.file == arg1.file && dst.nr == arg1.nr) ||
367 (dst.file == arg2.file && dst.nr == arg2.nr)) {
368 struct brw_compile *p = &c->func;
369 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
370 func(c, tmp, arg0, arg1, arg2);
371 brw_MOV(p, dst, tmp);
372 release_tmp(c, tmp);
373 }
374 else {
375 func(c, dst, arg0, arg1, arg2);
376 }
377 }
378
/**
 * Common helper for the SEQ/SNE/SLT/SLE/SGT/SGE opcodes: produce a
 * per-channel 1.0 / 0.0 result from comparing arg0 and arg1.
 *
 * The CMP writes only the flag register (null destination); the
 * predicated SEL then resolves each channel between its sources and
 * the 1.0f immediate.  Afterwards the predicate flag value is forced
 * back to all-ones so subsequent instructions are unaffected.
 *
 * NOTE(review): the CMP receives its operands as (arg1, arg0), i.e.
 * swapped relative to the caller's order -- confirm the exact channel
 * polarity against brw_SEL's predication semantics.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_CMP(p, brw_null_reg(), cond, arg1, arg0);
   brw_SEL(p, dst, brw_null_reg(), brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
391
/** SEQ (set-on-equal): delegates to emit_sop with BRW_CONDITIONAL_EQ. */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
399
/** SNE (set-on-not-equal): delegates to emit_sop with BRW_CONDITIONAL_NEQ. */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/** SLT (set-on-less-than): delegates to emit_sop with BRW_CONDITIONAL_L. */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
414
/** SLE (set-on-less-or-equal): delegates to emit_sop with BRW_CONDITIONAL_LE. */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
422
/** SGT (set-on-greater-than): delegates to emit_sop with BRW_CONDITIONAL_G. */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
430
/** SGE (set-on-greater-or-equal): delegates to emit_sop with BRW_CONDITIONAL_GE. */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
438
/**
 * Emit code for OPCODE_CMP: per channel, dst = (arg0 < 0) ? arg1 : arg2.
 * The CMP only updates the flag register (null destination); the
 * predicated SEL then picks arg1/arg2, and predication is reset so
 * later instructions are unaffected.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
449
/**
 * Emit code for OPCODE_MAX: per channel, dst = max(arg0, arg1).
 * CMP sets the flag where arg0 < arg1; SEL then takes arg1 for those
 * channels and arg0 otherwise.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
459
/**
 * Emit code for OPCODE_MIN: per channel, dst = min(arg0, arg1).
 * CMP sets the flag where arg0 < arg1; SEL then takes arg0 for those
 * channels and arg1 otherwise.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
469
470
/**
 * Emit a unary mathbox operation (EXP, LOG, RSQ, ...) computing
 * dst = function(arg0) at the requested precision.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   /* Pre-gen6, a partially-writemasked or non-GRF destination must be
    * staged through a scratch register (see comment above).
    */
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
508
509
/**
 * Emit a binary mathbox operation (e.g. POW) computing
 * dst = function(arg0, arg1) at the requested precision.
 * Needs the same pre-gen6 staging kludge as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg tmp = dst;
   /* See the SEND/writemask comment in emit_math1(). */
   GLboolean need_tmp = (intel->gen < 6 &&
			 (dst.dw1.bits.writemask != 0xf ||
			  dst.file != BRW_GENERAL_REGISTER_FILE));

   if (need_tmp)
      tmp = get_tmp(c);

   /* The second operand travels to the mathbox in message register 3. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
543
544
/**
 * Emit code for OPCODE_EXP, computing each enabled channel of dst from
 * arg0.x:
 *   x = 2^floor(arg0.x)  (built directly from the IEEE-754 bit layout)
 *   y = arg0.x - floor(arg0.x)
 *   z = 2^arg0.x         (full-precision mathbox EXP)
 *   w = 1.0
 * Caller guarantees dst does not alias arg0 ("noalias").
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
600
601
/**
 * Emit code for OPCODE_LOG, computing each enabled channel of dst from
 * arg0.x by picking apart the IEEE-754 bit layout:
 *   x = unbiased exponent of |arg0.x|
 *   y = mantissa of arg0.x, rebuilt as a float in [1, 2)
 *   z = log2(|arg0.x|) = x + mathbox LOG(y)
 *   w = 1.0
 * Caller guarantees dst does not alias arg0 ("noalias"); a scratch
 * register is still needed when dst is partially masked or not a GRF.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, then shift the exponent field down and
       * remove the bias.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and install a zero (biased 127)
       * exponent, yielding a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
691
692
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx  r1
 */
/**
 * Emit code for OPCODE_DST: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w),
 * one instruction per enabled channel.  Caller guarantees dst does not
 * alias either argument (see comment above).
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
713
714
/**
 * Emit a cross product:  dst = t X u.
 * The MUL (null destination) leaves t.yzx * u.zxy in the implicit
 * accumulator; MAC then adds -t.zxy * u.yzx, which is the standard
 * cross-product expansion.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
723
724
/**
 * Emit code for OPCODE_LIT, producing lighting coefficients from arg0:
 * the default result is dst = (1, 0, 0, 1); when arg0.x > 0,
 * dst.y = arg0.x and dst.z = POW(clamped arg0.y, arg0.w).
 * Caller guarantees dst does not alias arg0 ("noalias").
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   /* tmp.z is re-read as a source below, which a non-GRF destination
    * can't support, so stage it in a scratch GRF in that case.
    */
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Default result, before the conditional section below. */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Predicated MOV writes tmp.z only where arg0.y > 0, clamping
       * the POW base; elsewhere tmp.z keeps its earlier value.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): called even when tmp aliases dst; release_tmp()
    * only frees the newest scratch register, but confirm dst.nr can
    * never coincide with it here.
    */
   release_tmp(c, tmp);
}
766
/**
 * Emit code for OPCODE_LRP: dst = arg0 * arg1 + (1 - arg0) * arg2,
 * computed as dst = 1 - arg0; acc = dst * arg2; dst = acc + arg0*arg1.
 * Relies on the implicit accumulator carrying the MUL result into the
 * MAC.  Caller guarantees dst aliases no argument ("noalias").
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
779
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|.
 * \param num_comps  3 selects a DP3 length computation, otherwise DP4.
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
803
804
/**
 * Return a register holding the constant referenced by src arg
 * [argIndex] of \p inst, pulling it from the constant buffer with a
 * data-port read unless this argument slot's staging register already
 * holds that constant (one-entry cache per slot, see current_const).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                        /* writeback dest */
                       0,                                /* oword */
                       0,                                /* relative indexing? */
                       addrReg,                          /* address register */
                       16 * src->Index,                  /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER      /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
843
/**
 * Like get_constant(), but for a relative-addressed (address-register
 * indexed) constant.  Issues two relative oword reads -- one using the
 * low half of a0, one using the high half (presumably one per vertex
 * of the paired VS thread; confirm) -- and merges the results into the
 * staging register.  Never cached, since the runtime index can differ
 * per use.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg const2_reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* fetch the first vec4 */
   brw_dp_READ_4_vs(p,
		    const_reg,                        /* writeback dest */
		    0,                                /* oword */
		    1,                                /* relative indexing? */
		    addrReg,                          /* address register */
		    16 * src->Index,                  /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER      /* binding table index */
		    );
   /* second vec4 */
   const2_reg = get_tmp(c);

   /* use upper half of address reg for second read */
   addrReg = stride(addrReg, 0, 4, 0);
   addrReg.subnr = 16;

   brw_dp_READ_4_vs(p,
		    const2_reg,              /* writeback dest */
		    1,                       /* oword */
		    1,                       /* relative indexing? */
		    addrReg,                 /* address register */
		    16 * src->Index,         /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER
		    );

   /* merge the two Owords into the constant register */
   /* const_reg[7..4] = const2_reg[7..4] */
   brw_MOV(p,
	   suboffset(stride(const_reg, 0, 4, 1), 4),
	   suboffset(stride(const2_reg, 0, 4, 1), 4));
   release_tmp(c, const2_reg);

   return const_reg;
}
899
900
901
902 /* TODO: relative addressing!
903 */
904 static struct brw_reg get_reg( struct brw_vs_compile *c,
905 gl_register_file file,
906 GLuint index )
907 {
908 switch (file) {
909 case PROGRAM_TEMPORARY:
910 case PROGRAM_INPUT:
911 case PROGRAM_OUTPUT:
912 assert(c->regs[file][index].nr != 0);
913 return c->regs[file][index];
914 case PROGRAM_STATE_VAR:
915 case PROGRAM_CONSTANT:
916 case PROGRAM_UNIFORM:
917 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
918 return c->regs[PROGRAM_STATE_VAR][index];
919 case PROGRAM_ADDRESS:
920 assert(index == 0);
921 return c->regs[file][index];
922
923 case PROGRAM_UNDEFINED: /* undef values */
924 return brw_null_reg();
925
926 case PROGRAM_LOCAL_PARAM:
927 case PROGRAM_ENV_PARAM:
928 case PROGRAM_WRITE_ONLY:
929 default:
930 assert(0);
931 return brw_null_reg();
932 }
933 }
934
935
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Loads the hardware address register with the byte address of the
 * target GRF and copies the data into a fresh temp via an
 * indirect-addressed MOV, one 4-dword half at a time.  The temp is
 * deliberately NOT released; the caller consumes it as a source.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* 32 bytes per GRF; each vec4 element occupies 16 bytes. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
970
971
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * Handles direct, relative-addressed and constant-buffer (push or
 * pull) operands.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* Address-register-relative access into this file. */
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         if (!relAddr && c->constant_map[index] != -1) {
            /* Constant was promoted into the push (curbe) area. */
            assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
            return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
         } else if (relAddr)
            /* Pull with a run-time index from the constant buffer. */
            return get_reladdr_constant(c, inst, argIndex);
         else
            /* Pull with a fixed index from the constant buffer. */
            return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1032
1033
1034 static void emit_arl( struct brw_vs_compile *c,
1035 struct brw_reg dst,
1036 struct brw_reg arg0 )
1037 {
1038 struct brw_compile *p = &c->func;
1039 struct brw_reg tmp = dst;
1040 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1041
1042 if (need_tmp)
1043 tmp = get_tmp(c);
1044
1045 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
1046 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
1047
1048 if (need_tmp)
1049 release_tmp(c, tmp);
1050 }
1051
1052
/**
 * Return the brw reg for the given instruction's src argument.
 * Will return mangled results for SWZ op.  The emit_swz() function
 * ignores this result and recalculates taking extended swizzles into
 * account.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct prog_instruction *inst,
                               GLuint argIndex )
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_reg reg;

   if (src->File == PROGRAM_UNDEFINED)
      return brw_null_reg();

   reg = get_src_reg(c, inst, argIndex);

   /* Convert 3-bit swizzle to 2-bit.
    */
   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
				       GET_SWZ(src->Swizzle, 1),
				       GET_SWZ(src->Swizzle, 2),
				       GET_SWZ(src->Swizzle, 3));

   /* Note this is ok for non-swizzle instructions:
    * NOTE(review): Negate is collapsed to all-or-nothing here;
    * per-channel negation is only handled by emit_swz().
    */
   reg.negate = src->Negate ? 1 : 0;

   return reg;
}
1084
1085
1086 /**
1087 * Get brw register for the given program dest register.
1088 */
1089 static struct brw_reg get_dst( struct brw_vs_compile *c,
1090 struct prog_dst_register dst )
1091 {
1092 struct brw_reg reg;
1093
1094 switch (dst.File) {
1095 case PROGRAM_TEMPORARY:
1096 case PROGRAM_OUTPUT:
1097 assert(c->regs[dst.File][dst.Index].nr != 0);
1098 reg = c->regs[dst.File][dst.Index];
1099 break;
1100 case PROGRAM_ADDRESS:
1101 assert(dst.Index == 0);
1102 reg = c->regs[dst.File][dst.Index];
1103 break;
1104 case PROGRAM_UNDEFINED:
1105 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1106 reg = brw_null_reg();
1107 break;
1108 default:
1109 assert(0);
1110 reg = brw_null_reg();
1111 }
1112
1113 reg.dw1.bits.writemask = dst.WriteMask;
1114
1115 return reg;
1116 }
1117
1118
/**
 * Emit code for the SWZ (extended swizzle) instruction.
 *
 * Each destination component may select any source channel, the constant
 * zero, or the constant one, each optionally negated.  The hardware's
 * 2-bit swizzle field can't encode SWIZZLE_ZERO/SWIZZLE_ONE, so the
 * result is assembled from up to four MOVs with disjoint writemasks.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate pass below reads tmp as its own source, which requires a
    * GRF destination; otherwise stage the result in a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written component: real source channel, constant
    * zero, or constant one.
    */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate has one bit per component, the same layout as a
    * writemask, so it selects exactly the components to negate.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1187
1188
1189 /**
1190 * Post-vertex-program processing. Send the results to the URB.
1191 */
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the vertex URB entry header (NDC coords, point size, user clip
 * flags) appropriate for the hardware generation, then issues one or two
 * URB write messages: a second write is needed only when the outputs
 * overflow the MRF (c->first_overflow_output != 0).
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;

   /* Copy the vertex's edge flag output when fixed-function requires it. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   /* Gen6+ doesn't use NDC coords in the VUE header (see below). */
   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || brw->has_negative_rhw_bug)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      /* Point size goes in the W channel of the header, as a fixed-point
       * value (scaled by 1<<11, masked to 11 bits at bit 8 — presumably
       * the header's point-width field; see the hardware docs).
       */
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set one flag bit per user clip plane the position is outside of.
       * The DP4 result isn't stored; it only sets the flag register via
       * the conditional-mod, which predicates the following OR.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size, user clipping or workaround: zero the header. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);

   if (intel->gen >= 6) {
      /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
       * m5 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), pos);
      brw_MOV(p, offset(m0, 5), pos);
      len_vertex_header = 4;
   } else if (intel->is_ironlake) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      brw_MOV(p, offset(m0, 7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, offset(m0, 2), ndc);
      brw_MOV(p, offset(m0, 3), pos);
      len_vertex_header = 2;
   }

   /* Only terminate the thread here if everything fit in one message. */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here. Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1362
1363
1364 /**
1365 * Called after code generation to resolve subroutine calls and the
1366 * END instruction.
1367 * \param end_inst points to brw code for END instruction
1368 * \param last_inst points to last instruction emitted before vertex write
1369 */
1370 static void
1371 post_vs_emit( struct brw_vs_compile *c,
1372 struct brw_instruction *end_inst,
1373 struct brw_instruction *last_inst )
1374 {
1375 GLint offset;
1376
1377 brw_resolve_cals(&c->func);
1378
1379 /* patch up the END code to jump past subroutines, etc */
1380 offset = last_inst - end_inst;
1381 if (offset > 1) {
1382 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1383 } else {
1384 end_inst->header.opcode = BRW_OPCODE_NOP;
1385 }
1386 }
1387
1388 static GLboolean
1389 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1390 {
1391 struct brw_compile *p = &c->func;
1392 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1393
1394 if (p->nr_insn == 0)
1395 return GL_FALSE;
1396
1397 if (val.address_mode != BRW_ADDRESS_DIRECT)
1398 return GL_FALSE;
1399
1400 switch (prev_insn->header.opcode) {
1401 case BRW_OPCODE_MOV:
1402 case BRW_OPCODE_MAC:
1403 case BRW_OPCODE_MUL:
1404 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1405 prev_insn->header.execution_size == val.width &&
1406 prev_insn->bits1.da1.dest_reg_file == val.file &&
1407 prev_insn->bits1.da1.dest_reg_type == val.type &&
1408 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1409 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1410 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1411 prev_insn->bits1.da16.dest_writemask == 0xf)
1412 return GL_TRUE;
1413 else
1414 return GL_FALSE;
1415 default:
1416 return GL_FALSE;
1417 }
1418 }
1419
1420 static uint32_t
1421 get_predicate(const struct prog_instruction *inst)
1422 {
1423 if (inst->DstReg.CondMask == COND_TR)
1424 return BRW_PREDICATE_NONE;
1425
1426 /* All of GLSL only produces predicates for COND_NE and one channel per
1427 * vector. Fail badly if someone starts doing something else, as it might
1428 * mean infinite looping or something.
1429 *
1430 * We'd like to support all the condition codes, but our hardware doesn't
1431 * quite match the Mesa IR, which is modeled after the NV extensions. For
1432 * those, the instruction may update the condition codes or not, then any
1433 * later instruction may use one of those condition codes. For gen4, the
1434 * instruction may update the flags register based on one of the condition
1435 * codes output by the instruction, and then further instructions may
1436 * predicate on that. We can probably support this, but it won't
1437 * necessarily be easy.
1438 */
1439 assert(inst->DstReg.CondMask == COND_NE);
1440
1441 switch (inst->DstReg.CondSwizzle) {
1442 case SWIZZLE_XXXX:
1443 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1444 case SWIZZLE_YYYY:
1445 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1446 case SWIZZLE_ZZZZ:
1447 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1448 case SWIZZLE_WWWW:
1449 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1450 default:
1451 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1452 inst->DstReg.CondMask);
1453 return BRW_PREDICATE_NORMAL;
1454 }
1455 }
1456
1457 /* Emit the vertex program instructions here.
1458 */
1459 void brw_vs_emit(struct brw_vs_compile *c )
1460 {
1461 #define MAX_IF_DEPTH 32
1462 #define MAX_LOOP_DEPTH 32
1463 struct brw_compile *p = &c->func;
1464 struct brw_context *brw = p->brw;
1465 struct intel_context *intel = &brw->intel;
1466 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1467 GLuint insn, if_depth = 0, loop_depth = 0;
1468 GLuint end_offset = 0;
1469 struct brw_instruction *end_inst, *last_inst;
1470 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1471 const struct brw_indirect stack_index = brw_indirect(0, 0);
1472 GLuint index;
1473 GLuint file;
1474
1475 if (INTEL_DEBUG & DEBUG_VS) {
1476 printf("vs-mesa:\n");
1477 _mesa_print_program(&c->vp->program.Base);
1478 printf("\n");
1479 }
1480
1481 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1482 brw_set_access_mode(p, BRW_ALIGN_16);
1483
1484 for (insn = 0; insn < nr_insns; insn++) {
1485 GLuint i;
1486 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1487
1488 /* Message registers can't be read, so copy the output into GRF
1489 * register if they are used in source registers
1490 */
1491 for (i = 0; i < 3; i++) {
1492 struct prog_src_register *src = &inst->SrcReg[i];
1493 GLuint index = src->Index;
1494 GLuint file = src->File;
1495 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1496 c->output_regs[index].used_in_src = GL_TRUE;
1497 }
1498
1499 switch (inst->Opcode) {
1500 case OPCODE_CAL:
1501 case OPCODE_RET:
1502 c->needs_stack = GL_TRUE;
1503 break;
1504 default:
1505 break;
1506 }
1507 }
1508
1509 /* Static register allocation
1510 */
1511 brw_vs_alloc_regs(c);
1512
1513 if (c->needs_stack)
1514 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1515
1516 for (insn = 0; insn < nr_insns; insn++) {
1517
1518 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1519 struct brw_reg args[3], dst;
1520 GLuint i;
1521
1522 #if 0
1523 printf("%d: ", insn);
1524 _mesa_print_instruction(inst);
1525 #endif
1526
1527 /* Get argument regs. SWZ is special and does this itself.
1528 */
1529 if (inst->Opcode != OPCODE_SWZ)
1530 for (i = 0; i < 3; i++) {
1531 const struct prog_src_register *src = &inst->SrcReg[i];
1532 index = src->Index;
1533 file = src->File;
1534 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1535 args[i] = c->output_regs[index].reg;
1536 else
1537 args[i] = get_arg(c, inst, i);
1538 }
1539
1540 /* Get dest regs. Note that it is possible for a reg to be both
1541 * dst and arg, given the static allocation of registers. So
1542 * care needs to be taken emitting multi-operation instructions.
1543 */
1544 index = inst->DstReg.Index;
1545 file = inst->DstReg.File;
1546 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1547 dst = c->output_regs[index].reg;
1548 else
1549 dst = get_dst(c, inst->DstReg);
1550
1551 if (inst->SaturateMode != SATURATE_OFF) {
1552 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1553 inst->SaturateMode);
1554 }
1555
1556 switch (inst->Opcode) {
1557 case OPCODE_ABS:
1558 brw_MOV(p, dst, brw_abs(args[0]));
1559 break;
1560 case OPCODE_ADD:
1561 brw_ADD(p, dst, args[0], args[1]);
1562 break;
1563 case OPCODE_COS:
1564 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1565 break;
1566 case OPCODE_DP3:
1567 brw_DP3(p, dst, args[0], args[1]);
1568 break;
1569 case OPCODE_DP4:
1570 brw_DP4(p, dst, args[0], args[1]);
1571 break;
1572 case OPCODE_DPH:
1573 brw_DPH(p, dst, args[0], args[1]);
1574 break;
1575 case OPCODE_NRM3:
1576 emit_nrm(c, dst, args[0], 3);
1577 break;
1578 case OPCODE_NRM4:
1579 emit_nrm(c, dst, args[0], 4);
1580 break;
1581 case OPCODE_DST:
1582 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1583 break;
1584 case OPCODE_EXP:
1585 unalias1(c, dst, args[0], emit_exp_noalias);
1586 break;
1587 case OPCODE_EX2:
1588 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1589 break;
1590 case OPCODE_ARL:
1591 emit_arl(c, dst, args[0]);
1592 break;
1593 case OPCODE_FLR:
1594 brw_RNDD(p, dst, args[0]);
1595 break;
1596 case OPCODE_FRC:
1597 brw_FRC(p, dst, args[0]);
1598 break;
1599 case OPCODE_LOG:
1600 unalias1(c, dst, args[0], emit_log_noalias);
1601 break;
1602 case OPCODE_LG2:
1603 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1604 break;
1605 case OPCODE_LIT:
1606 unalias1(c, dst, args[0], emit_lit_noalias);
1607 break;
1608 case OPCODE_LRP:
1609 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1610 break;
1611 case OPCODE_MAD:
1612 if (!accumulator_contains(c, args[2]))
1613 brw_MOV(p, brw_acc_reg(), args[2]);
1614 brw_MAC(p, dst, args[0], args[1]);
1615 break;
1616 case OPCODE_CMP:
1617 emit_cmp(p, dst, args[0], args[1], args[2]);
1618 break;
1619 case OPCODE_MAX:
1620 emit_max(p, dst, args[0], args[1]);
1621 break;
1622 case OPCODE_MIN:
1623 emit_min(p, dst, args[0], args[1]);
1624 break;
1625 case OPCODE_MOV:
1626 brw_MOV(p, dst, args[0]);
1627 break;
1628 case OPCODE_MUL:
1629 brw_MUL(p, dst, args[0], args[1]);
1630 break;
1631 case OPCODE_POW:
1632 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1633 break;
1634 case OPCODE_RCP:
1635 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1636 break;
1637 case OPCODE_RSQ:
1638 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1639 break;
1640
1641 case OPCODE_SEQ:
1642 unalias2(c, dst, args[0], args[1], emit_seq);
1643 break;
1644 case OPCODE_SIN:
1645 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1646 break;
1647 case OPCODE_SNE:
1648 unalias2(c, dst, args[0], args[1], emit_sne);
1649 break;
1650 case OPCODE_SGE:
1651 unalias2(c, dst, args[0], args[1], emit_sge);
1652 break;
1653 case OPCODE_SGT:
1654 unalias2(c, dst, args[0], args[1], emit_sgt);
1655 break;
1656 case OPCODE_SLT:
1657 unalias2(c, dst, args[0], args[1], emit_slt);
1658 break;
1659 case OPCODE_SLE:
1660 unalias2(c, dst, args[0], args[1], emit_sle);
1661 break;
1662 case OPCODE_SUB:
1663 brw_ADD(p, dst, args[0], negate(args[1]));
1664 break;
1665 case OPCODE_SWZ:
1666 /* The args[0] value can't be used here as it won't have
1667 * correctly encoded the full swizzle:
1668 */
1669 emit_swz(c, dst, inst);
1670 break;
1671 case OPCODE_TRUNC:
1672 /* round toward zero */
1673 brw_RNDZ(p, dst, args[0]);
1674 break;
1675 case OPCODE_XPD:
1676 emit_xpd(p, dst, args[0], args[1]);
1677 break;
1678 case OPCODE_IF:
1679 assert(if_depth < MAX_IF_DEPTH);
1680 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1681 /* Note that brw_IF smashes the predicate_control field. */
1682 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1683 if_depth++;
1684 break;
1685 case OPCODE_ELSE:
1686 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1687 break;
1688 case OPCODE_ENDIF:
1689 assert(if_depth > 0);
1690 brw_ENDIF(p, if_inst[--if_depth]);
1691 break;
1692 case OPCODE_BGNLOOP:
1693 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1694 break;
1695 case OPCODE_BRK:
1696 brw_set_predicate_control(p, get_predicate(inst));
1697 brw_BREAK(p);
1698 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1699 break;
1700 case OPCODE_CONT:
1701 brw_set_predicate_control(p, get_predicate(inst));
1702 brw_CONT(p);
1703 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1704 break;
1705 case OPCODE_ENDLOOP:
1706 {
1707 struct brw_instruction *inst0, *inst1;
1708 GLuint br = 1;
1709
1710 loop_depth--;
1711
1712 if (intel->is_ironlake)
1713 br = 2;
1714
1715 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1716 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1717 while (inst0 > loop_inst[loop_depth]) {
1718 inst0--;
1719 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
1720 inst0->bits3.if_else.jump_count == 0) {
1721 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1722 inst0->bits3.if_else.pop_count = 0;
1723 }
1724 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
1725 inst0->bits3.if_else.jump_count == 0) {
1726 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1727 inst0->bits3.if_else.pop_count = 0;
1728 }
1729 }
1730 }
1731 break;
1732 case OPCODE_BRA:
1733 brw_set_predicate_control(p, get_predicate(inst));
1734 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1735 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1736 break;
1737 case OPCODE_CAL:
1738 brw_set_access_mode(p, BRW_ALIGN_1);
1739 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1740 brw_set_access_mode(p, BRW_ALIGN_16);
1741 brw_ADD(p, get_addr_reg(stack_index),
1742 get_addr_reg(stack_index), brw_imm_d(4));
1743 brw_save_call(p, inst->Comment, p->nr_insn);
1744 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1745 break;
1746 case OPCODE_RET:
1747 brw_ADD(p, get_addr_reg(stack_index),
1748 get_addr_reg(stack_index), brw_imm_d(-4));
1749 brw_set_access_mode(p, BRW_ALIGN_1);
1750 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1751 brw_set_access_mode(p, BRW_ALIGN_16);
1752 break;
1753 case OPCODE_END:
1754 end_offset = p->nr_insn;
1755 /* this instruction will get patched later to jump past subroutine
1756 * code, etc.
1757 */
1758 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1759 break;
1760 case OPCODE_PRINT:
1761 /* no-op */
1762 break;
1763 case OPCODE_BGNSUB:
1764 brw_save_label(p, inst->Comment, p->nr_insn);
1765 break;
1766 case OPCODE_ENDSUB:
1767 /* no-op */
1768 break;
1769 default:
1770 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1771 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1772 _mesa_opcode_string(inst->Opcode) :
1773 "unknown");
1774 }
1775
1776 /* Set the predication update on the last instruction of the native
1777 * instruction sequence.
1778 *
1779 * This would be problematic if it was set on a math instruction,
1780 * but that shouldn't be the case with the current GLSL compiler.
1781 */
1782 if (inst->CondUpdate) {
1783 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1784
1785 assert(hw_insn->header.destreg__conditionalmod == 0);
1786 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1787 }
1788
1789 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1790 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1791 && c->output_regs[inst->DstReg.Index].used_in_src) {
1792 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1793 }
1794
1795 /* Result color clamping.
1796 *
1797 * When destination register is an output register and
1798 * it's primary/secondary front/back color, we have to clamp
1799 * the result to [0,1]. This is done by enabling the
1800 * saturation bit for the last instruction.
1801 *
1802 * We don't use brw_set_saturate() as it modifies
1803 * p->current->header.saturate, which affects all the subsequent
1804 * instructions. Instead, we directly modify the header
1805 * of the last (already stored) instruction.
1806 */
1807 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1808 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1809 || (inst->DstReg.Index == VERT_RESULT_COL1)
1810 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1811 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1812 p->store[p->nr_insn-1].header.saturate = 1;
1813 }
1814 }
1815
1816 release_tmps(c);
1817 }
1818
1819 end_inst = &p->store[end_offset];
1820 last_inst = &p->store[p->nr_insn];
1821
1822 /* The END instruction will be patched to jump to this code */
1823 emit_vertex_write(c);
1824
1825 post_vs_emit(c, end_inst, last_inst);
1826
1827 brw_optimize(p);
1828
1829 if (INTEL_DEBUG & DEBUG_VS) {
1830 int i;
1831
1832 printf("vs-native:\n");
1833 for (i = 0; i < p->nr_insn; i++)
1834 brw_disasm(stderr, &p->store[i]);
1835 printf("\n");
1836 }
1837 }