src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "shader/program.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  42 {
  43    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  44
  45    if (++c->last_tmp > c->prog_data.total_grf)
  46       c->prog_data.total_grf = c->last_tmp;
  47
  48    return tmp;
  49 }
  50
  51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  52 {
  53    if (tmp.nr == c->last_tmp-1)
  54       c->last_tmp--;
  55 }
  56
  57 static void release_tmps( struct brw_vs_compile *c )
  58 {
  59    c->last_tmp = c->first_tmp;
  60 }
  61
  62
  63 /**
  64  * Preallocate GRF register before code emit.
  65  * Do things as simply as possible.  Allocate and populate all regs
  66  * ahead of time.
  67  */
  68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  69 {
  70    GLuint i, reg = 0, mrf;
  71    int attributes_in_vue;
  72
  73 #if 0
  74    if (c->vp->program.Base.Parameters->NumParameters >= 6)
  75       c->vp->use_const_buffer = 1;
  76    else
  77 #endif
  78       c->vp->use_const_buffer = GL_FALSE;
  79    /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
  80
  81    /* r0 -- reserved as usual
  82     */
  83    c->r0 = brw_vec8_grf(reg, 0);
  84    reg++;
  85
  86    /* User clip planes from curbe:
  87     */
  88    if (c->key.nr_userclip) {
  89       for (i = 0; i < c->key.nr_userclip; i++) {
  90          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  91       }
  92
  93       /* Deal with curbe alignment:
  94        */
  95       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
  96    }
  97
  98    /* Vertex program parameters from curbe:
  99     */
 100    if (c->vp->use_const_buffer) {
 101       /* get constants from a real constant buffer */
 102       c->prog_data.curb_read_length = 0;
 103       c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
 104    }
 105    else {
 106       /* use a section of the GRF for constants */
 107       GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
 108       for (i = 0; i < nr_params; i++) {
 109          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 110       }
 111       reg += (nr_params + 1) / 2;
 112       c->prog_data.curb_read_length = reg - 1;
 113
 114       c->prog_data.nr_params = nr_params * 4;
 115    }
 116
 117    /* Allocate input regs:
 118     */
 119    c->nr_inputs = 0;
 120    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 121       if (c->prog_data.inputs_read & (1 << i)) {
 122          c->nr_inputs++;
 123          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 124          reg++;
 125       }
 126    }
 127    /* If there are no inputs, we'll still be reading one attribute's worth
 128     * because it's required -- see urb_read_length setting.
 129     */
 130    if (c->nr_inputs == 0)
 131       reg++;
 132
 133    /* Allocate outputs: TODO: could organize the non-position outputs
 134     * to go straight into message regs.
 135     */
 136    c->nr_outputs = 0;
 137    c->first_output = reg;
 138    mrf = 4;
 139    for (i = 0; i < VERT_RESULT_MAX; i++) {
 140       if (c->prog_data.outputs_written & (1 << i)) {
 141          c->nr_outputs++;
 142          if (i == VERT_RESULT_HPOS) {
 143             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 144             reg++;
 145          }
 146          else if (i == VERT_RESULT_PSIZ) {
 147             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 148             reg++;
 149             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 150          }
 151          else {
 152             c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 153             mrf++;
 154          }
 155       }
 156    }
 157
 158    /* Allocate program temporaries:
 159     */
 160    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 161       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 162       reg++;
 163    }
 164
 165    /* Address reg(s).  Don't try to use the internal address reg until
 166     * deref time.
 167     */
 168    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 169       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 170                                              reg,
 171                                              0,
 172                                              BRW_REGISTER_TYPE_D,
 173                                              BRW_VERTICAL_STRIDE_8,
 174                                              BRW_WIDTH_8,
 175                                              BRW_HORIZONTAL_STRIDE_1,
 176                                              BRW_SWIZZLE_XXXX,
 177                                              WRITEMASK_X);
 178       reg++;
 179    }
 180
 181    if (c->vp->use_const_buffer) {
 182       for (i = 0; i < 3; i++) {
 183          c->current_const[i].index = -1;
 184          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 185          reg++;
 186       }
 187    }
 188
 189    for (i = 0; i < 128; i++) {
 190       if (c->output_regs[i].used_in_src) {
 191          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 192          reg++;
 193       }
 194    }
 195
 196    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 197    reg += 2;
 198
 199    /* Some opcodes need an internal temporary:
 200     */
 201    c->first_tmp = reg;
 202    c->last_tmp = reg;           /* for allocation purposes */
 203
 204    /* Each input reg holds data from two vertices.  The
 205     * urb_read_length is the number of registers read from *each*
 206     * vertex urb, so is half the amount:
 207     */
 208    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 209    /* Setting this field to 0 leads to undefined behavior according to the
 210     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 211     * sitting in them, even if it's padding.
 212     */
 213    if (c->prog_data.urb_read_length == 0)
 214       c->prog_data.urb_read_length = 1;
 215
 216    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 217     * them to fit the biggest thing they need to.
 218     */
 219    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 220
 221    c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 222
 223    c->prog_data.total_grf = reg;
 224
 225    if (INTEL_DEBUG & DEBUG_VS) {
 226       _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 227       _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 228       _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
 229    }
 230 }
 231
 232
 233 /**
 234  * If an instruction uses a temp reg both as a src and the dest, we
 235  * sometimes need to allocate an intermediate temporary.
 236  */
 237 static void unalias1( struct brw_vs_compile *c,
 238                       struct brw_reg dst,
 239                       struct brw_reg arg0,
 240                       void (*func)( struct brw_vs_compile *,
 241                                     struct brw_reg,
 242                                     struct brw_reg ))
 243 {
 244    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 245       struct brw_compile *p = &c->func;
 246       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 247       func(c, tmp, arg0);
 248       brw_MOV(p, dst, tmp);
 249       release_tmp(c, tmp);
 250    }
 251    else {
 252       func(c, dst, arg0);
 253    }
 254 }
 255
 256 /**
 257  * \sa unalias2
 258  * Checkes if 2-operand instruction needs an intermediate temporary.
 259  */
 260 static void unalias2( struct brw_vs_compile *c,
 261                       struct brw_reg dst,
 262                       struct brw_reg arg0,
 263                       struct brw_reg arg1,
 264                       void (*func)( struct brw_vs_compile *,
 265                                     struct brw_reg,
 266                                     struct brw_reg,
 267                                     struct brw_reg ))
 268 {
 269    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 270        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 271       struct brw_compile *p = &c->func;
 272       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 273       func(c, tmp, arg0, arg1);
 274       brw_MOV(p, dst, tmp);
 275       release_tmp(c, tmp);
 276    }
 277    else {
 278       func(c, dst, arg0, arg1);
 279    }
 280 }
 281
 282 /**
 283  * \sa unalias2
 284  * Checkes if 3-operand instruction needs an intermediate temporary.
 285  */
 286 static void unalias3( struct brw_vs_compile *c,
 287                       struct brw_reg dst,
 288                       struct brw_reg arg0,
 289                       struct brw_reg arg1,
 290                       struct brw_reg arg2,
 291                       void (*func)( struct brw_vs_compile *,
 292                                     struct brw_reg,
 293                                     struct brw_reg,
 294                                     struct brw_reg,
 295                                     struct brw_reg ))
 296 {
 297    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 298        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 299        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 300       struct brw_compile *p = &c->func;
 301       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 302       func(c, tmp, arg0, arg1, arg2);
 303       brw_MOV(p, dst, tmp);
 304       release_tmp(c, tmp);
 305    }
 306    else {
 307       func(c, dst, arg0, arg1, arg2);
 308    }
 309 }
 310
 311 static void emit_sop( struct brw_compile *p,
 312                       struct brw_reg dst,
 313                       struct brw_reg arg0,
 314                       struct brw_reg arg1,
 315                       GLuint cond)
 316 {
 317    brw_MOV(p, dst, brw_imm_f(0.0f));
 318    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 319    brw_MOV(p, dst, brw_imm_f(1.0f));
 320    brw_set_predicate_control_flag_value(p, 0xff);
 321 }
 322
 323 static void emit_seq( struct brw_compile *p,
 324                       struct brw_reg dst,
 325                       struct brw_reg arg0,
 326                       struct brw_reg arg1 )
 327 {
 328    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 329 }
 330
 331 static void emit_sne( struct brw_compile *p,
 332                       struct brw_reg dst,
 333                       struct brw_reg arg0,
 334                       struct brw_reg arg1 )
 335 {
 336    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 337 }
 338 static void emit_slt( struct brw_compile *p,
 339                       struct brw_reg dst,
 340                       struct brw_reg arg0,
 341                       struct brw_reg arg1 )
 342 {
 343    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
 344 }
 345
 346 static void emit_sle( struct brw_compile *p,
 347                       struct brw_reg dst,
 348                       struct brw_reg arg0,
 349                       struct brw_reg arg1 )
 350 {
 351    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 352 }
 353
 354 static void emit_sgt( struct brw_compile *p,
 355                       struct brw_reg dst,
 356                       struct brw_reg arg0,
 357                       struct brw_reg arg1 )
 358 {
 359    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
 360 }
 361
 362 static void emit_sge( struct brw_compile *p,
 363                       struct brw_reg dst,
 364                       struct brw_reg arg0,
 365                       struct brw_reg arg1 )
 366 {
 367   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 368 }
 369
 370 static void emit_max( struct brw_compile *p,
 371                       struct brw_reg dst,
 372                       struct brw_reg arg0,
 373                       struct brw_reg arg1 )
 374 {
 375    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 376    brw_SEL(p, dst, arg1, arg0);
 377    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 378 }
 379
 380 static void emit_min( struct brw_compile *p,
 381                       struct brw_reg dst,
 382                       struct brw_reg arg0,
 383                       struct brw_reg arg1 )
 384 {
 385    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 386    brw_SEL(p, dst, arg0, arg1);
 387    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 388 }
 389
 390
 391 static void emit_math1( struct brw_vs_compile *c,
 392                         GLuint function,
 393                         struct brw_reg dst,
 394                         struct brw_reg arg0,
 395                         GLuint precision)
 396 {
 397    /* There are various odd behaviours with SEND on the simulator.  In
 398     * addition there are documented issues with the fact that the GEN4
 399     * processor doesn't do dependency control properly on SEND
 400     * results.  So, on balance, this kludge to get around failures
 401     * with writemasked math results looks like it might be necessary
 402     * whether that turns out to be a simulator bug or not:
 403     */
 404    struct brw_compile *p = &c->func;
 405    struct brw_reg tmp = dst;
 406    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 407                          dst.file != BRW_GENERAL_REGISTER_FILE);
 408
 409    if (need_tmp)
 410       tmp = get_tmp(c);
 411
 412    brw_math(p,
 413             tmp,
 414             function,
 415             BRW_MATH_SATURATE_NONE,
 416             2,
 417             arg0,
 418             BRW_MATH_DATA_SCALAR,
 419             precision);
 420
 421    if (need_tmp) {
 422       brw_MOV(p, dst, tmp);
 423       release_tmp(c, tmp);
 424    }
 425 }
 426
 427
 428 static void emit_math2( struct brw_vs_compile *c,
 429                         GLuint function,
 430                         struct brw_reg dst,
 431                         struct brw_reg arg0,
 432                         struct brw_reg arg1,
 433                         GLuint precision)
 434 {
 435    struct brw_compile *p = &c->func;
 436    struct brw_reg tmp = dst;
 437    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 438                          dst.file != BRW_GENERAL_REGISTER_FILE);
 439
 440    if (need_tmp)
 441       tmp = get_tmp(c);
 442
 443    brw_MOV(p, brw_message_reg(3), arg1);
 444
 445    brw_math(p,
 446             tmp,
 447             function,
 448             BRW_MATH_SATURATE_NONE,
 449             2,
 450             arg0,
 451             BRW_MATH_DATA_SCALAR,
 452             precision);
 453
 454    if (need_tmp) {
 455       brw_MOV(p, dst, tmp);
 456       release_tmp(c, tmp);
 457    }
 458 }
 459
 460
 461 static void emit_exp_noalias( struct brw_vs_compile *c,
 462                               struct brw_reg dst,
 463                               struct brw_reg arg0 )
 464 {
 465    struct brw_compile *p = &c->func;
 466
 467
 468    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 469       struct brw_reg tmp = get_tmp(c);
 470       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 471
 472       /* tmp_d = floor(arg0.x) */
 473       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 474
 475       /* result[0] = 2.0 ^ tmp */
 476
 477       /* Adjust exponent for floating point:
 478        * exp += 127
 479        */
 480       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 481
 482       /* Install exponent and sign.
 483        * Excess drops off the edge:
 484        */
 485       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 486               tmp_d, brw_imm_d(23));
 487
 488       release_tmp(c, tmp);
 489    }
 490
 491    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 492       /* result[1] = arg0.x - floor(arg0.x) */
 493       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 494    }
 495
 496    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 497       /* As with the LOG instruction, we might be better off just
 498        * doing a taylor expansion here, seeing as we have to do all
 499        * the prep work.
 500        *
 501        * If mathbox partial precision is too low, consider also:
 502        * result[3] = result[0] * EXP(result[1])
 503        */
 504       emit_math1(c,
 505                  BRW_MATH_FUNCTION_EXP,
 506                  brw_writemask(dst, WRITEMASK_Z),
 507                  brw_swizzle1(arg0, 0),
 508                  BRW_MATH_PRECISION_FULL);
 509    }
 510
 511    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 512       /* result[3] = 1.0; */
 513       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 514    }
 515 }
 516
 517
 518 static void emit_log_noalias( struct brw_vs_compile *c,
 519                               struct brw_reg dst,
 520                               struct brw_reg arg0 )
 521 {
 522    struct brw_compile *p = &c->func;
 523    struct brw_reg tmp = dst;
 524    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 525    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 526    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 527                          dst.file != BRW_GENERAL_REGISTER_FILE);
 528
 529    if (need_tmp) {
 530       tmp = get_tmp(c);
 531       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 532    }
 533
 534    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 535     * according to spec:
 536     *
 537     * These almost look likey they could be joined up, but not really
 538     * practical:
 539     *
 540     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 541     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 542     */
 543    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 544       brw_AND(p,
 545               brw_writemask(tmp_ud, WRITEMASK_X),
 546               brw_swizzle1(arg0_ud, 0),
 547               brw_imm_ud((1U<<31)-1));
 548
 549       brw_SHR(p,
 550               brw_writemask(tmp_ud, WRITEMASK_X),
 551               tmp_ud,
 552               brw_imm_ud(23));
 553
 554       brw_ADD(p,
 555               brw_writemask(tmp, WRITEMASK_X),
 556               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 557               brw_imm_d(-127));
 558    }
 559
 560    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 561       brw_AND(p,
 562               brw_writemask(tmp_ud, WRITEMASK_Y),
 563               brw_swizzle1(arg0_ud, 0),
 564               brw_imm_ud((1<<23)-1));
 565
 566       brw_OR(p,
 567              brw_writemask(tmp_ud, WRITEMASK_Y),
 568              tmp_ud,
 569              brw_imm_ud(127<<23));
 570    }
 571
 572    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 573       /* result[2] = result[0] + LOG2(result[1]); */
 574
 575       /* Why bother?  The above is just a hint how to do this with a
 576        * taylor series.  Maybe we *should* use a taylor series as by
 577        * the time all the above has been done it's almost certainly
 578        * quicker than calling the mathbox, even with low precision.
 579        *
 580        * Options are:
 581        *    - result[0] + mathbox.LOG2(result[1])
 582        *    - mathbox.LOG2(arg0.x)
 583        *    - result[0] + inline_taylor_approx(result[1])
 584        */
 585       emit_math1(c,
 586                  BRW_MATH_FUNCTION_LOG,
 587                  brw_writemask(tmp, WRITEMASK_Z),
 588                  brw_swizzle1(tmp, 1),
 589                  BRW_MATH_PRECISION_FULL);
 590
 591       brw_ADD(p,
 592               brw_writemask(tmp, WRITEMASK_Z),
 593               brw_swizzle1(tmp, 2),
 594               brw_swizzle1(tmp, 0));
 595    }
 596
 597    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 598       /* result[3] = 1.0; */
 599       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 600    }
 601
 602    if (need_tmp) {
 603       brw_MOV(p, dst, tmp);
 604       release_tmp(c, tmp);
 605    }
 606 }
 607
 608
 609 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 610  */
 611 static void emit_dst_noalias( struct brw_vs_compile *c,
 612                               struct brw_reg dst,
 613                               struct brw_reg arg0,
 614                               struct brw_reg arg1)
 615 {
 616    struct brw_compile *p = &c->func;
 617
 618    /* There must be a better way to do this:
 619     */
 620    if (dst.dw1.bits.writemask & WRITEMASK_X)
 621       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 622    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 623       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 624    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 625       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 626    if (dst.dw1.bits.writemask & WRITEMASK_W)
 627       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 628 }
 629
 630
 631 static void emit_xpd( struct brw_compile *p,
 632                       struct brw_reg dst,
 633                       struct brw_reg t,
 634                       struct brw_reg u)
 635 {
 636    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 637    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 638 }
 639
 640
 641 static void emit_lit_noalias( struct brw_vs_compile *c,
 642                               struct brw_reg dst,
 643                               struct brw_reg arg0 )
 644 {
 645    struct brw_compile *p = &c->func;
 646    struct brw_instruction *if_insn;
 647    struct brw_reg tmp = dst;
 648    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 649
 650    if (need_tmp)
 651       tmp = get_tmp(c);
 652
 653    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 654    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 655
 656    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 657     * to get all channels active inside the IF.  In the clipping code
 658     * we run with NoMask, so it's not an option and we can use
 659     * BRW_EXECUTE_1 for all comparisions.
 660     */
 661    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 662    if_insn = brw_IF(p, BRW_EXECUTE_8);
 663    {
 664       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 665
 666       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 667       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 668       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 669
 670       emit_math2(c,
 671                  BRW_MATH_FUNCTION_POW,
 672                  brw_writemask(dst, WRITEMASK_Z),
 673                  brw_swizzle1(tmp, 2),
 674                  brw_swizzle1(arg0, 3),
 675                  BRW_MATH_PRECISION_PARTIAL);
 676    }
 677
 678    brw_ENDIF(p, if_insn);
 679
 680    release_tmp(c, tmp);
 681 }
 682
 683 static void emit_lrp_noalias(struct brw_vs_compile *c,
 684                              struct brw_reg dst,
 685                              struct brw_reg arg0,
 686                              struct brw_reg arg1,
 687                              struct brw_reg arg2)
 688 {
 689    struct brw_compile *p = &c->func;
 690
 691    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 692    brw_MUL(p, brw_null_reg(), dst, arg2);
 693    brw_MAC(p, dst, arg0, arg1);
 694 }
 695
 696 /** 3 or 4-component vector normalization */
 697 static void emit_nrm( struct brw_vs_compile *c,
 698                       struct brw_reg dst,
 699                       struct brw_reg arg0,
 700                       int num_comps)
 701 {
 702    struct brw_compile *p = &c->func;
 703    struct brw_reg tmp = get_tmp(c);
 704
 705    /* tmp = dot(arg0, arg0) */
 706    if (num_comps == 3)
 707       brw_DP3(p, tmp, arg0, arg0);
 708    else
 709       brw_DP4(p, tmp, arg0, arg0);
 710
 711    /* tmp = 1 / sqrt(tmp) */
 712    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 713
 714    /* dst = arg0 * tmp */
 715    brw_MUL(p, dst, arg0, tmp);
 716
 717    release_tmp(c, tmp);
 718 }
 719
 720
 721 static struct brw_reg
 722 get_constant(struct brw_vs_compile *c,
 723              const struct prog_instruction *inst,
 724              GLuint argIndex)
 725 {
 726    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 727    struct brw_compile *p = &c->func;
 728    struct brw_reg const_reg;
 729    struct brw_reg const2_reg;
 730
 731    assert(argIndex < 3);
 732
 733    if (c->current_const[argIndex].index != src->Index || src->RelAddr) {
 734       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 735
 736       c->current_const[argIndex].index = src->Index;
 737
 738 #if 0
 739       printf("  fetch const[%d] for arg %d into reg %d\n",
 740              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 741 #endif
 742       /* need to fetch the constant now */
 743       brw_dp_READ_4_vs(p,
 744                        c->current_const[argIndex].reg,/* writeback dest */
 745                        0,                             /* oword */
 746                        src->RelAddr,                  /* relative indexing? */
 747                        addrReg,                       /* address register */
 748                        16 * src->Index,               /* byte offset */
 749                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 750                        );
 751
 752       if (src->RelAddr) {
 753          /* second read */
 754          const2_reg = get_tmp(c);
 755
 756          /* use upper half of address reg for second read */
 757          addrReg = stride(addrReg, 0, 4, 0);
 758          addrReg.subnr = 16;
 759
 760          brw_dp_READ_4_vs(p,
 761                           const2_reg,              /* writeback dest */
 762                           1,                       /* oword */
 763                           src->RelAddr,            /* relative indexing? */
 764                           addrReg,                 /* address register */
 765                           16 * src->Index,         /* byte offset */
 766                           SURF_INDEX_VERT_CONST_BUFFER
 767                           );
 768       }
 769    }
 770
 771    const_reg = c->current_const[argIndex].reg;
 772
 773    if (src->RelAddr) {
 774       /* merge the two Owords into the constant register */
 775       /* const_reg[7..4] = const2_reg[7..4] */
 776       brw_MOV(p,
 777               suboffset(stride(const_reg, 0, 4, 1), 4),
 778               suboffset(stride(const2_reg, 0, 4, 1), 4));
 779       release_tmp(c, const2_reg);
 780    }
 781    else {
 782       /* replicate lower four floats into upper half (to get XYZWXYZW) */
 783       const_reg = stride(const_reg, 0, 4, 0);
 784       const_reg.subnr = 0;
 785    }
 786
 787    return const_reg;
 788 }
 789
 790
 791
 792 /* TODO: relative addressing!
 793  */
 794 static struct brw_reg get_reg( struct brw_vs_compile *c,
 795                                gl_register_file file,
 796                                GLuint index )
 797 {
 798    switch (file) {
 799    case PROGRAM_TEMPORARY:
 800    case PROGRAM_INPUT:
 801    case PROGRAM_OUTPUT:
 802       assert(c->regs[file][index].nr != 0);
 803       return c->regs[file][index];
 804    case PROGRAM_STATE_VAR:
 805    case PROGRAM_CONSTANT:
 806    case PROGRAM_UNIFORM:
 807       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 808       return c->regs[PROGRAM_STATE_VAR][index];
 809    case PROGRAM_ADDRESS:
 810       assert(index == 0);
 811       return c->regs[file][index];
 812
 813    case PROGRAM_UNDEFINED:                      /* undef values */
 814       return brw_null_reg();
 815
 816    case PROGRAM_LOCAL_PARAM:
 817    case PROGRAM_ENV_PARAM:
 818    case PROGRAM_WRITE_ONLY:
 819    default:
 820       assert(0);
 821       return brw_null_reg();
 822    }
 823 }
 824
 825
 826 /**
 827  * Indirect addressing:  get reg[[arg] + offset].
 828  */
 829 static struct brw_reg deref( struct brw_vs_compile *c,
 830                              struct brw_reg arg,
 831                              GLint offset)
 832 {
 833    struct brw_compile *p = &c->func;
 834    struct brw_reg tmp = vec4(get_tmp(c));
 835    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 836    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 837    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 838    struct brw_reg indirect = brw_vec4_indirect(0,0);
 839
 840    {
 841       brw_push_insn_state(p);
 842       brw_set_access_mode(p, BRW_ALIGN_1);
 843
 844       /* This is pretty clunky - load the address register twice and
 845        * fetch each 4-dword value in turn.  There must be a way to do
 846        * this in a single pass, but I couldn't get it to work.
 847        */
 848       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 849       brw_MOV(p, tmp, indirect);
 850
 851       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 852       brw_MOV(p, suboffset(tmp, 4), indirect);
 853
 854       brw_pop_insn_state(p);
 855    }
 856
 857    /* NOTE: tmp not released */
 858    return vec8(tmp);
 859 }
 860
 861
 862 /**
 863  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 864  * TODO: relative addressing!
 865  */
 866 static struct brw_reg
 867 get_src_reg( struct brw_vs_compile *c,
 868              const struct prog_instruction *inst,
 869              GLuint argIndex )
 870 {
 871    const GLuint file = inst->SrcReg[argIndex].File;
 872    const GLint index = inst->SrcReg[argIndex].Index;
 873    const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
 874
 875    switch (file) {
 876    case PROGRAM_TEMPORARY:
 877    case PROGRAM_INPUT:
 878    case PROGRAM_OUTPUT:
 879       if (relAddr) {
 880          return deref(c, c->regs[file][0], index);
 881       }
 882       else {
 883          assert(c->regs[file][index].nr != 0);
 884          return c->regs[file][index];
 885       }
 886
 887    case PROGRAM_STATE_VAR:
 888    case PROGRAM_CONSTANT:
 889    case PROGRAM_UNIFORM:
 890       if (c->vp->use_const_buffer) {
 891          return get_constant(c, inst, argIndex);
 892       }
 893       else if (relAddr) {
 894          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
 895       }
 896       else {
 897          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 898          return c->regs[PROGRAM_STATE_VAR][index];
 899       }
 900    case PROGRAM_ADDRESS:
 901       assert(index == 0);
 902       return c->regs[file][index];
 903
 904    case PROGRAM_UNDEFINED:
 905       /* this is a normal case since we loop over all three src args */
 906       return brw_null_reg();
 907
 908    case PROGRAM_LOCAL_PARAM:
 909    case PROGRAM_ENV_PARAM:
 910    case PROGRAM_WRITE_ONLY:
 911    default:
 912       assert(0);
 913       return brw_null_reg();
 914    }
 915 }
 916
 917
 918 static void emit_arl( struct brw_vs_compile *c,
 919                       struct brw_reg dst,
 920                       struct brw_reg arg0 )
 921 {
 922    struct brw_compile *p = &c->func;
 923    struct brw_reg tmp = dst;
 924    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 925
 926    if (need_tmp)
 927       tmp = get_tmp(c);
 928
 929    brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
 930    brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 931
 932    if (need_tmp)
 933       release_tmp(c, tmp);
 934 }
 935
 936
 937 /**
 938  * Return the brw reg for the given instruction's src argument.
 939  * Will return mangled results for SWZ op.  The emit_swz() function
 940  * ignores this result and recalculates taking extended swizzles into
 941  * account.
 942  */
 943 static struct brw_reg get_arg( struct brw_vs_compile *c,
 944                                const struct prog_instruction *inst,
 945                                GLuint argIndex )
 946 {
 947    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 948    struct brw_reg reg;
 949
 950    if (src->File == PROGRAM_UNDEFINED)
 951       return brw_null_reg();
 952
 953    reg = get_src_reg(c, inst, argIndex);
 954
 955    /* Convert 3-bit swizzle to 2-bit.
 956     */
 957    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
 958                                        GET_SWZ(src->Swizzle, 1),
 959                                        GET_SWZ(src->Swizzle, 2),
 960                                        GET_SWZ(src->Swizzle, 3));
 961
 962    /* Note this is ok for non-swizzle instructions:
 963     */
 964    reg.negate = src->Negate ? 1 : 0;
 965
 966    return reg;
 967 }
 968
 969
 970 /**
 971  * Get brw register for the given program dest register.
 972  */
 973 static struct brw_reg get_dst( struct brw_vs_compile *c,
 974                                struct prog_dst_register dst )
 975 {
 976    struct brw_reg reg;
 977
 978    switch (dst.File) {
 979    case PROGRAM_TEMPORARY:
 980    case PROGRAM_OUTPUT:
 981       assert(c->regs[dst.File][dst.Index].nr != 0);
 982       reg = c->regs[dst.File][dst.Index];
 983       break;
 984    case PROGRAM_ADDRESS:
 985       assert(dst.Index == 0);
 986       reg = c->regs[dst.File][dst.Index];
 987       break;
 988    case PROGRAM_UNDEFINED:
 989       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
 990       reg = brw_null_reg();
 991       break;
 992    default:
 993       assert(0);
 994       reg = brw_null_reg();
 995    }
 996
 997    reg.dw1.bits.writemask = dst.WriteMask;
 998
 999    return reg;
1000 }
1001
1002
1003 static void emit_swz( struct brw_vs_compile *c,
1004                       struct brw_reg dst,
1005                       const struct prog_instruction *inst)
1006 {
1007    const GLuint argIndex = 0;
1008    const struct prog_src_register src = inst->SrcReg[argIndex];
1009    struct brw_compile *p = &c->func;
1010    GLuint zeros_mask = 0;
1011    GLuint ones_mask = 0;
1012    GLuint src_mask = 0;
1013    GLubyte src_swz[4];
1014    GLboolean need_tmp = (src.Negate &&
1015                          dst.file != BRW_GENERAL_REGISTER_FILE);
1016    struct brw_reg tmp = dst;
1017    GLuint i;
1018
1019    if (need_tmp)
1020       tmp = get_tmp(c);
1021
1022    for (i = 0; i < 4; i++) {
1023       if (dst.dw1.bits.writemask & (1<<i)) {
1024          GLubyte s = GET_SWZ(src.Swizzle, i);
1025          switch (s) {
1026          case SWIZZLE_X:
1027          case SWIZZLE_Y:
1028          case SWIZZLE_Z:
1029          case SWIZZLE_W:
1030             src_mask |= 1<<i;
1031             src_swz[i] = s;
1032             break;
1033          case SWIZZLE_ZERO:
1034             zeros_mask |= 1<<i;
1035             break;
1036          case SWIZZLE_ONE:
1037             ones_mask |= 1<<i;
1038             break;
1039          }
1040       }
1041    }
1042
1043    /* Do src first, in case dst aliases src:
1044     */
1045    if (src_mask) {
1046       struct brw_reg arg0;
1047
1048       arg0 = get_src_reg(c, inst, argIndex);
1049
1050       arg0 = brw_swizzle(arg0,
1051                          src_swz[0], src_swz[1],
1052                          src_swz[2], src_swz[3]);
1053
1054       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1055    }
1056
1057    if (zeros_mask)
1058       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1059
1060    if (ones_mask)
1061       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1062
1063    if (src.Negate)
1064       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1065
1066    if (need_tmp) {
1067       brw_MOV(p, dst, tmp);
1068       release_tmp(c, tmp);
1069    }
1070 }
1071
1072
1073 /**
1074  * Post-vertex-program processing.  Send the results to the URB.
1075  */
1076 static void emit_vertex_write( struct brw_vs_compile *c)
1077 {
1078    struct brw_compile *p = &c->func;
1079    struct brw_reg m0 = brw_message_reg(0);
1080    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1081    struct brw_reg ndc;
1082
1083    if (c->key.copy_edgeflag) {
1084       brw_MOV(p,
1085               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1086               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1087    }
1088
1089    /* Build ndc coords */
1090    ndc = get_tmp(c);
1091    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1092    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1093
1094    /* Update the header for point size, user clipping flags, and -ve rhw
1095     * workaround.
1096     */
1097    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1098        c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1099    {
1100       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1101       GLuint i;
1102
1103       brw_MOV(p, header1, brw_imm_ud(0));
1104
1105       brw_set_access_mode(p, BRW_ALIGN_16);
1106
1107       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1108          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1109          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1110          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1111       }
1112
1113       for (i = 0; i < c->key.nr_userclip; i++) {
1114          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1115          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1116          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1117          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1118       }
1119
1120       /* i965 clipping workaround:
1121        * 1) Test for -ve rhw
1122        * 2) If set,
1123        *      set ndc = (0,0,0,0)
1124        *      set ucp[6] = 1
1125        *
1126        * Later, clipping will detect ucp[6] and ensure the primitive is
1127        * clipped against all fixed planes.
1128        */
1129       if (!BRW_IS_G4X(p->brw)) {
1130          brw_CMP(p,
1131                  vec8(brw_null_reg()),
1132                  BRW_CONDITIONAL_L,
1133                  brw_swizzle1(ndc, 3),
1134                  brw_imm_f(0));
1135
1136          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1137          brw_MOV(p, ndc, brw_imm_f(0));
1138          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1139       }
1140
1141       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1142       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1143       brw_set_access_mode(p, BRW_ALIGN_16);
1144
1145       release_tmp(c, header1);
1146    }
1147    else {
1148       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1149    }
1150
1151    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1152     * of zeros followed by two sets of NDC coordinates:
1153     */
1154    brw_set_access_mode(p, BRW_ALIGN_1);
1155    brw_MOV(p, offset(m0, 2), ndc);
1156    brw_MOV(p, offset(m0, 3), pos);
1157
1158    brw_urb_WRITE(p,
1159                  brw_null_reg(), /* dest */
1160                  0,             /* starting mrf reg nr */
1161                  c->r0,         /* src */
1162                  0,             /* allocate */
1163                  1,             /* used */
1164                  c->nr_outputs + 3, /* msg len */
1165                  0,             /* response len */
1166                  1,             /* eot */
1167                  1,             /* writes complete */
1168                  0,             /* urb destination offset */
1169                  BRW_URB_SWIZZLE_INTERLEAVE);
1170 }
1171
1172
1173 /**
1174  * Called after code generation to resolve subroutine calls and the
1175  * END instruction.
1176  * \param end_inst  points to brw code for END instruction
1177  * \param last_inst  points to last instruction emitted before vertex write
1178  */
1179 static void
1180 post_vs_emit( struct brw_vs_compile *c,
1181               struct brw_instruction *end_inst,
1182               struct brw_instruction *last_inst )
1183 {
1184    GLint offset;
1185
1186    brw_resolve_cals(&c->func);
1187
1188    /* patch up the END code to jump past subroutines, etc */
1189    offset = last_inst - end_inst;
1190    brw_set_src1(end_inst, brw_imm_d(offset * 16));
1191 }
1192
1193
1194 /* Emit the vertex program instructions here.
1195  */
1196 void brw_vs_emit(struct brw_vs_compile *c )
1197 {
1198 #define MAX_IFSN 32
1199    struct brw_compile *p = &c->func;
1200    GLuint nr_insns = c->vp->program.Base.NumInstructions;
1201    GLuint insn, if_insn = 0;
1202    GLuint end_offset = 0;
1203    struct brw_instruction *end_inst, *last_inst;
1204    struct brw_instruction *if_inst[MAX_IFSN];
1205    struct brw_indirect stack_index = brw_indirect(0, 0);
1206
1207    GLuint index;
1208    GLuint file;
1209
1210    if (INTEL_DEBUG & DEBUG_VS) {
1211       _mesa_printf("vs-emit:\n");
1212       _mesa_print_program(&c->vp->program.Base);
1213       _mesa_printf("\n");
1214    }
1215
1216    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1217    brw_set_access_mode(p, BRW_ALIGN_16);
1218
1219    /* Message registers can't be read, so copy the output into GRF register
1220       if they are used in source registers */
1221    for (insn = 0; insn < nr_insns; insn++) {
1222        GLuint i;
1223        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1224        for (i = 0; i < 3; i++) {
1225            struct prog_src_register *src = &inst->SrcReg[i];
1226            GLuint index = src->Index;
1227            GLuint file = src->File;
1228            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1229                c->output_regs[index].used_in_src = GL_TRUE;
1230        }
1231    }
1232
1233    /* Static register allocation
1234     */
1235    brw_vs_alloc_regs(c);
1236    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1237
1238    for (insn = 0; insn < nr_insns; insn++) {
1239
1240       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1241       struct brw_reg args[3], dst;
1242       GLuint i;
1243
1244 #if 0
1245       printf("%d: ", insn);
1246       _mesa_print_instruction(inst);
1247 #endif
1248
1249       /* Get argument regs.  SWZ is special and does this itself.
1250        */
1251       if (inst->Opcode != OPCODE_SWZ)
1252           for (i = 0; i < 3; i++) {
1253               struct prog_src_register *src = &inst->SrcReg[i];
1254               index = src->Index;
1255               file = src->File;
1256               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1257                   args[i] = c->output_regs[index].reg;
1258               else
1259                   args[i] = get_arg(c, inst, i);
1260           }
1261
1262       /* Get dest regs.  Note that it is possible for a reg to be both
1263        * dst and arg, given the static allocation of registers.  So
1264        * care needs to be taken emitting multi-operation instructions.
1265        */
1266       index = inst->DstReg.Index;
1267       file = inst->DstReg.File;
1268       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1269           dst = c->output_regs[index].reg;
1270       else
1271           dst = get_dst(c, inst->DstReg);
1272
1273       if (inst->SaturateMode != SATURATE_OFF) {
1274          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1275                        inst->SaturateMode);
1276       }
1277
1278       switch (inst->Opcode) {
1279       case OPCODE_ABS:
1280          brw_MOV(p, dst, brw_abs(args[0]));
1281          break;
1282       case OPCODE_ADD:
1283          brw_ADD(p, dst, args[0], args[1]);
1284          break;
1285       case OPCODE_COS:
1286          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1287          break;
1288       case OPCODE_DP3:
1289          brw_DP3(p, dst, args[0], args[1]);
1290          break;
1291       case OPCODE_DP4:
1292          brw_DP4(p, dst, args[0], args[1]);
1293          break;
1294       case OPCODE_DPH:
1295          brw_DPH(p, dst, args[0], args[1]);
1296          break;
1297       case OPCODE_NRM3:
1298          emit_nrm(c, dst, args[0], 3);
1299          break;
1300       case OPCODE_NRM4:
1301          emit_nrm(c, dst, args[0], 4);
1302          break;
1303       case OPCODE_DST:
1304          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1305          break;
1306       case OPCODE_EXP:
1307          unalias1(c, dst, args[0], emit_exp_noalias);
1308          break;
1309       case OPCODE_EX2:
1310          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1311          break;
1312       case OPCODE_ARL:
1313          emit_arl(c, dst, args[0]);
1314          break;
1315       case OPCODE_FLR:
1316          brw_RNDD(p, dst, args[0]);
1317          break;
1318       case OPCODE_FRC:
1319          brw_FRC(p, dst, args[0]);
1320          break;
1321       case OPCODE_LOG:
1322          unalias1(c, dst, args[0], emit_log_noalias);
1323          break;
1324       case OPCODE_LG2:
1325          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1326          break;
1327       case OPCODE_LIT:
1328          unalias1(c, dst, args[0], emit_lit_noalias);
1329          break;
1330       case OPCODE_LRP:
1331          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1332          break;
1333       case OPCODE_MAD:
1334          brw_MOV(p, brw_acc_reg(), args[2]);
1335          brw_MAC(p, dst, args[0], args[1]);
1336          break;
1337       case OPCODE_MAX:
1338          emit_max(p, dst, args[0], args[1]);
1339          break;
1340       case OPCODE_MIN:
1341          emit_min(p, dst, args[0], args[1]);
1342          break;
1343       case OPCODE_MOV:
1344          brw_MOV(p, dst, args[0]);
1345          break;
1346       case OPCODE_MUL:
1347          brw_MUL(p, dst, args[0], args[1]);
1348          break;
1349       case OPCODE_POW:
1350          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1351          break;
1352       case OPCODE_RCP:
1353          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1354          break;
1355       case OPCODE_RSQ:
1356          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1357          break;
1358
1359       case OPCODE_SEQ:
1360          emit_seq(p, dst, args[0], args[1]);
1361          break;
1362       case OPCODE_SIN:
1363          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1364          break;
1365       case OPCODE_SNE:
1366          emit_sne(p, dst, args[0], args[1]);
1367          break;
1368       case OPCODE_SGE:
1369          emit_sge(p, dst, args[0], args[1]);
1370          break;
1371       case OPCODE_SGT:
1372          emit_sgt(p, dst, args[0], args[1]);
1373          break;
1374       case OPCODE_SLT:
1375          emit_slt(p, dst, args[0], args[1]);
1376          break;
1377       case OPCODE_SLE:
1378          emit_sle(p, dst, args[0], args[1]);
1379          break;
1380       case OPCODE_SUB:
1381          brw_ADD(p, dst, args[0], negate(args[1]));
1382          break;
1383       case OPCODE_SWZ:
1384          /* The args[0] value can't be used here as it won't have
1385           * correctly encoded the full swizzle:
1386           */
1387          emit_swz(c, dst, inst);
1388          break;
1389       case OPCODE_TRUNC:
1390          /* round toward zero */
1391          brw_RNDZ(p, dst, args[0]);
1392          break;
1393       case OPCODE_XPD:
1394          emit_xpd(p, dst, args[0], args[1]);
1395          break;
1396       case OPCODE_IF:
1397          assert(if_insn < MAX_IFSN);
1398          if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1399          break;
1400       case OPCODE_ELSE:
1401          if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1402          break;
1403       case OPCODE_ENDIF:
1404          assert(if_insn > 0);
1405          brw_ENDIF(p, if_inst[--if_insn]);
1406          break;
1407       case OPCODE_BRA:
1408          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1409          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1410          brw_set_predicate_control_flag_value(p, 0xff);
1411          break;
1412       case OPCODE_CAL:
1413          brw_set_access_mode(p, BRW_ALIGN_1);
1414          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1415          brw_set_access_mode(p, BRW_ALIGN_16);
1416          brw_ADD(p, get_addr_reg(stack_index),
1417                          get_addr_reg(stack_index), brw_imm_d(4));
1418          brw_save_call(p, inst->Comment, p->nr_insn);
1419          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1420          break;
1421       case OPCODE_RET:
1422          brw_ADD(p, get_addr_reg(stack_index),
1423                          get_addr_reg(stack_index), brw_imm_d(-4));
1424          brw_set_access_mode(p, BRW_ALIGN_1);
1425          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1426          brw_set_access_mode(p, BRW_ALIGN_16);
1427          break;
1428       case OPCODE_END:
1429          end_offset = p->nr_insn;
1430          /* this instruction will get patched later to jump past subroutine
1431           * code, etc.
1432           */
1433          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1434          break;
1435       case OPCODE_PRINT:
1436          /* no-op */
1437          break;
1438       case OPCODE_BGNSUB:
1439          brw_save_label(p, inst->Comment, p->nr_insn);
1440          break;
1441       case OPCODE_ENDSUB:
1442          /* no-op */
1443          break;
1444       default:
1445          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1446                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1447                                     _mesa_opcode_string(inst->Opcode) :
1448                                     "unknown");
1449       }
1450
1451       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1452           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1453           && c->output_regs[inst->DstReg.Index].used_in_src) {
1454          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1455       }
1456
1457       /* Result color clamping.
1458        *
1459        * When destination register is an output register and
1460        * it's primary/secondary front/back color, we have to clamp
1461        * the result to [0,1]. This is done by enabling the
1462        * saturation bit for the last instruction.
1463        *
1464        * We don't use brw_set_saturate() as it modifies
1465        * p->current->header.saturate, which affects all the subsequent
1466        * instructions. Instead, we directly modify the header
1467        * of the last (already stored) instruction.
1468        */
1469       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1470          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1471              || (inst->DstReg.Index == VERT_RESULT_COL1)
1472              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1473              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1474             p->store[p->nr_insn-1].header.saturate = 1;
1475          }
1476       }
1477
1478       release_tmps(c);
1479    }
1480
1481    end_inst = &p->store[end_offset];
1482    last_inst = &p->store[p->nr_insn];
1483
1484    /* The END instruction will be patched to jump to this code */
1485    emit_vertex_write(c);
1486
1487    post_vs_emit(c, end_inst, last_inst);
1488 }