src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "shader/program.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  42 {
  43    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  44
  45    if (++c->last_tmp > c->prog_data.total_grf)
  46       c->prog_data.total_grf = c->last_tmp;
  47
  48    return tmp;
  49 }
  50
  51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  52 {
  53    if (tmp.nr == c->last_tmp-1)
  54       c->last_tmp--;
  55 }
  56
  57 static void release_tmps( struct brw_vs_compile *c )
  58 {
  59    c->last_tmp = c->first_tmp;
  60 }
  61
  62
  63 /**
  64  * Preallocate GRF register before code emit.
  65  * Do things as simply as possible.  Allocate and populate all regs
  66  * ahead of time.
  67  */
  68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  69 {
  70    GLuint i, reg = 0, mrf;
  71    GLuint nr_params;
  72
  73 #if 0
  74    if (c->vp->program.Base.Parameters->NumParameters >= 6)
  75       c->use_const_buffer = 1;
  76    else
  77 #endif
  78       c->use_const_buffer = GL_FALSE;
  79    /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
  80
  81    /* r0 -- reserved as usual
  82     */
  83    c->r0 = brw_vec8_grf(reg, 0);
  84    reg++;
  85
  86    /* User clip planes from curbe:
  87     */
  88    if (c->key.nr_userclip) {
  89       for (i = 0; i < c->key.nr_userclip; i++) {
  90          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  91       }
  92
  93       /* Deal with curbe alignment:
  94        */
  95       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
  96    }
  97
  98    /* Vertex program parameters from curbe:
  99     */
 100    if (c->use_const_buffer) {
 101       /* get constants from a real constant buffer */
 102       c->prog_data.curb_read_length = 0;
 103    }
 104    else {
 105       /* use a section of the GRF for constants */
 106       nr_params = c->vp->program.Base.Parameters->NumParameters;
 107       for (i = 0; i < nr_params; i++) {
 108          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 109       }
 110       reg += (nr_params + 1) / 2;
 111       c->prog_data.curb_read_length = reg - 1;
 112    }
 113
 114    /* Allocate input regs:
 115     */
 116    c->nr_inputs = 0;
 117    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 118       if (c->prog_data.inputs_read & (1 << i)) {
 119          c->nr_inputs++;
 120          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 121          reg++;
 122       }
 123    }
 124
 125    /* Allocate outputs: TODO: could organize the non-position outputs
 126     * to go straight into message regs.
 127     */
 128    c->nr_outputs = 0;
 129    c->first_output = reg;
 130    mrf = 4;
 131    for (i = 0; i < VERT_RESULT_MAX; i++) {
 132       if (c->prog_data.outputs_written & (1 << i)) {
 133          c->nr_outputs++;
 134          if (i == VERT_RESULT_HPOS) {
 135             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 136             reg++;
 137          }
 138          else if (i == VERT_RESULT_PSIZ) {
 139             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 140             reg++;
 141             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 142          }
 143          else {
 144             c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 145             mrf++;
 146          }
 147       }
 148    }
 149
 150    /* Allocate program temporaries:
 151     */
 152    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 153       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 154       reg++;
 155    }
 156
 157    /* Address reg(s).  Don't try to use the internal address reg until
 158     * deref time.
 159     */
 160    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 161       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 162                                              reg,
 163                                              0,
 164                                              BRW_REGISTER_TYPE_D,
 165                                              BRW_VERTICAL_STRIDE_8,
 166                                              BRW_WIDTH_8,
 167                                              BRW_HORIZONTAL_STRIDE_1,
 168                                              BRW_SWIZZLE_XXXX,
 169                                              WRITEMASK_X);
 170       reg++;
 171    }
 172
 173    for (i = 0; i < 128; i++) {
 174       if (c->output_regs[i].used_in_src) {
 175          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 176          reg++;
 177       }
 178    }
 179
 180    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 181    reg += 2;
 182
 183    /* Some opcodes need an internal temporary:
 184     */
 185    c->first_tmp = reg;
 186    c->last_tmp = reg;           /* for allocation purposes */
 187
 188    /* Each input reg holds data from two vertices.  The
 189     * urb_read_length is the number of registers read from *each*
 190     * vertex urb, so is half the amount:
 191     */
 192    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 193
 194    c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
 195    c->prog_data.total_grf = reg;
 196
 197    if (c->use_const_buffer) {
 198        for (i = 0; i < 3; i++) {
 199           c->current_const[i].index = -1;
 200           c->current_const[i].reg = get_tmp(c);
 201        }
 202    }
 203
 204    if (INTEL_DEBUG & DEBUG_VS) {
 205       _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 206       _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 207       _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
 208    }
 209 }
 210
 211
 212 /**
 213  * If an instruction uses a temp reg both as a src and the dest, we
 214  * sometimes need to allocate an intermediate temporary.
 215  */
 216 static void unalias1( struct brw_vs_compile *c,
 217                       struct brw_reg dst,
 218                       struct brw_reg arg0,
 219                       void (*func)( struct brw_vs_compile *,
 220                                     struct brw_reg,
 221                                     struct brw_reg ))
 222 {
 223    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 224       struct brw_compile *p = &c->func;
 225       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 226       func(c, tmp, arg0);
 227       brw_MOV(p, dst, tmp);
 228       release_tmp(c, tmp);
 229    }
 230    else {
 231       func(c, dst, arg0);
 232    }
 233 }
 234
 235 /**
 236  * \sa unalias2
 237  * Checkes if 2-operand instruction needs an intermediate temporary.
 238  */
 239 static void unalias2( struct brw_vs_compile *c,
 240                       struct brw_reg dst,
 241                       struct brw_reg arg0,
 242                       struct brw_reg arg1,
 243                       void (*func)( struct brw_vs_compile *,
 244                                     struct brw_reg,
 245                                     struct brw_reg,
 246                                     struct brw_reg ))
 247 {
 248    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 249        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 250       struct brw_compile *p = &c->func;
 251       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 252       func(c, tmp, arg0, arg1);
 253       brw_MOV(p, dst, tmp);
 254       release_tmp(c, tmp);
 255    }
 256    else {
 257       func(c, dst, arg0, arg1);
 258    }
 259 }
 260
 261 /**
 262  * \sa unalias2
 263  * Checkes if 3-operand instruction needs an intermediate temporary.
 264  */
 265 static void unalias3( struct brw_vs_compile *c,
 266                       struct brw_reg dst,
 267                       struct brw_reg arg0,
 268                       struct brw_reg arg1,
 269                       struct brw_reg arg2,
 270                       void (*func)( struct brw_vs_compile *,
 271                                     struct brw_reg,
 272                                     struct brw_reg,
 273                                     struct brw_reg,
 274                                     struct brw_reg ))
 275 {
 276    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 277        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 278        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 279       struct brw_compile *p = &c->func;
 280       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 281       func(c, tmp, arg0, arg1, arg2);
 282       brw_MOV(p, dst, tmp);
 283       release_tmp(c, tmp);
 284    }
 285    else {
 286       func(c, dst, arg0, arg1, arg2);
 287    }
 288 }
 289
 290 static void emit_sop( struct brw_compile *p,
 291                       struct brw_reg dst,
 292                       struct brw_reg arg0,
 293                       struct brw_reg arg1,
 294                       GLuint cond)
 295 {
 296    brw_MOV(p, dst, brw_imm_f(0.0f));
 297    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 298    brw_MOV(p, dst, brw_imm_f(1.0f));
 299    brw_set_predicate_control_flag_value(p, 0xff);
 300 }
 301
 302 static void emit_seq( struct brw_compile *p,
 303                       struct brw_reg dst,
 304                       struct brw_reg arg0,
 305                       struct brw_reg arg1 )
 306 {
 307    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 308 }
 309
 310 static void emit_sne( struct brw_compile *p,
 311                       struct brw_reg dst,
 312                       struct brw_reg arg0,
 313                       struct brw_reg arg1 )
 314 {
 315    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 316 }
 317 static void emit_slt( struct brw_compile *p,
 318                       struct brw_reg dst,
 319                       struct brw_reg arg0,
 320                       struct brw_reg arg1 )
 321 {
 322    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
 323 }
 324
 325 static void emit_sle( struct brw_compile *p,
 326                       struct brw_reg dst,
 327                       struct brw_reg arg0,
 328                       struct brw_reg arg1 )
 329 {
 330    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 331 }
 332
 333 static void emit_sgt( struct brw_compile *p,
 334                       struct brw_reg dst,
 335                       struct brw_reg arg0,
 336                       struct brw_reg arg1 )
 337 {
 338    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
 339 }
 340
 341 static void emit_sge( struct brw_compile *p,
 342                       struct brw_reg dst,
 343                       struct brw_reg arg0,
 344                       struct brw_reg arg1 )
 345 {
 346   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 347 }
 348
 349 static void emit_max( struct brw_compile *p,
 350                       struct brw_reg dst,
 351                       struct brw_reg arg0,
 352                       struct brw_reg arg1 )
 353 {
 354    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 355    brw_SEL(p, dst, arg1, arg0);
 356    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 357 }
 358
 359 static void emit_min( struct brw_compile *p,
 360                       struct brw_reg dst,
 361                       struct brw_reg arg0,
 362                       struct brw_reg arg1 )
 363 {
 364    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 365    brw_SEL(p, dst, arg0, arg1);
 366    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 367 }
 368
 369
 370 static void emit_math1( struct brw_vs_compile *c,
 371                         GLuint function,
 372                         struct brw_reg dst,
 373                         struct brw_reg arg0,
 374                         GLuint precision)
 375 {
 376    /* There are various odd behaviours with SEND on the simulator.  In
 377     * addition there are documented issues with the fact that the GEN4
 378     * processor doesn't do dependency control properly on SEND
 379     * results.  So, on balance, this kludge to get around failures
 380     * with writemasked math results looks like it might be necessary
 381     * whether that turns out to be a simulator bug or not:
 382     */
 383    struct brw_compile *p = &c->func;
 384    struct brw_reg tmp = dst;
 385    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 386                          dst.file != BRW_GENERAL_REGISTER_FILE);
 387
 388    if (need_tmp)
 389       tmp = get_tmp(c);
 390
 391    brw_math(p,
 392             tmp,
 393             function,
 394             BRW_MATH_SATURATE_NONE,
 395             2,
 396             arg0,
 397             BRW_MATH_DATA_SCALAR,
 398             precision);
 399
 400    if (need_tmp) {
 401       brw_MOV(p, dst, tmp);
 402       release_tmp(c, tmp);
 403    }
 404 }
 405
 406
 407 static void emit_math2( struct brw_vs_compile *c,
 408                         GLuint function,
 409                         struct brw_reg dst,
 410                         struct brw_reg arg0,
 411                         struct brw_reg arg1,
 412                         GLuint precision)
 413 {
 414    struct brw_compile *p = &c->func;
 415    struct brw_reg tmp = dst;
 416    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 417                          dst.file != BRW_GENERAL_REGISTER_FILE);
 418
 419    if (need_tmp)
 420       tmp = get_tmp(c);
 421
 422    brw_MOV(p, brw_message_reg(3), arg1);
 423
 424    brw_math(p,
 425             tmp,
 426             function,
 427             BRW_MATH_SATURATE_NONE,
 428             2,
 429             arg0,
 430             BRW_MATH_DATA_SCALAR,
 431             precision);
 432
 433    if (need_tmp) {
 434       brw_MOV(p, dst, tmp);
 435       release_tmp(c, tmp);
 436    }
 437 }
 438
 439
 440 static void emit_exp_noalias( struct brw_vs_compile *c,
 441                               struct brw_reg dst,
 442                               struct brw_reg arg0 )
 443 {
 444    struct brw_compile *p = &c->func;
 445
 446
 447    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 448       struct brw_reg tmp = get_tmp(c);
 449       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 450
 451       /* tmp_d = floor(arg0.x) */
 452       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 453
 454       /* result[0] = 2.0 ^ tmp */
 455
 456       /* Adjust exponent for floating point:
 457        * exp += 127
 458        */
 459       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 460
 461       /* Install exponent and sign.
 462        * Excess drops off the edge:
 463        */
 464       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 465               tmp_d, brw_imm_d(23));
 466
 467       release_tmp(c, tmp);
 468    }
 469
 470    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 471       /* result[1] = arg0.x - floor(arg0.x) */
 472       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 473    }
 474
 475    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 476       /* As with the LOG instruction, we might be better off just
 477        * doing a taylor expansion here, seeing as we have to do all
 478        * the prep work.
 479        *
 480        * If mathbox partial precision is too low, consider also:
 481        * result[3] = result[0] * EXP(result[1])
 482        */
 483       emit_math1(c,
 484                  BRW_MATH_FUNCTION_EXP,
 485                  brw_writemask(dst, WRITEMASK_Z),
 486                  brw_swizzle1(arg0, 0),
 487                  BRW_MATH_PRECISION_FULL);
 488    }
 489
 490    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 491       /* result[3] = 1.0; */
 492       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 493    }
 494 }
 495
 496
 497 static void emit_log_noalias( struct brw_vs_compile *c,
 498                               struct brw_reg dst,
 499                               struct brw_reg arg0 )
 500 {
 501    struct brw_compile *p = &c->func;
 502    struct brw_reg tmp = dst;
 503    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 504    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 505    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 506                          dst.file != BRW_GENERAL_REGISTER_FILE);
 507
 508    if (need_tmp) {
 509       tmp = get_tmp(c);
 510       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 511    }
 512
 513    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 514     * according to spec:
 515     *
 516     * These almost look likey they could be joined up, but not really
 517     * practical:
 518     *
 519     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 520     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 521     */
 522    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 523       brw_AND(p,
 524               brw_writemask(tmp_ud, WRITEMASK_X),
 525               brw_swizzle1(arg0_ud, 0),
 526               brw_imm_ud((1U<<31)-1));
 527
 528       brw_SHR(p,
 529               brw_writemask(tmp_ud, WRITEMASK_X),
 530               tmp_ud,
 531               brw_imm_ud(23));
 532
 533       brw_ADD(p,
 534               brw_writemask(tmp, WRITEMASK_X),
 535               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 536               brw_imm_d(-127));
 537    }
 538
 539    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 540       brw_AND(p,
 541               brw_writemask(tmp_ud, WRITEMASK_Y),
 542               brw_swizzle1(arg0_ud, 0),
 543               brw_imm_ud((1<<23)-1));
 544
 545       brw_OR(p,
 546              brw_writemask(tmp_ud, WRITEMASK_Y),
 547              tmp_ud,
 548              brw_imm_ud(127<<23));
 549    }
 550
 551    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 552       /* result[2] = result[0] + LOG2(result[1]); */
 553
 554       /* Why bother?  The above is just a hint how to do this with a
 555        * taylor series.  Maybe we *should* use a taylor series as by
 556        * the time all the above has been done it's almost certainly
 557        * quicker than calling the mathbox, even with low precision.
 558        *
 559        * Options are:
 560        *    - result[0] + mathbox.LOG2(result[1])
 561        *    - mathbox.LOG2(arg0.x)
 562        *    - result[0] + inline_taylor_approx(result[1])
 563        */
 564       emit_math1(c,
 565                  BRW_MATH_FUNCTION_LOG,
 566                  brw_writemask(tmp, WRITEMASK_Z),
 567                  brw_swizzle1(tmp, 1),
 568                  BRW_MATH_PRECISION_FULL);
 569
 570       brw_ADD(p,
 571               brw_writemask(tmp, WRITEMASK_Z),
 572               brw_swizzle1(tmp, 2),
 573               brw_swizzle1(tmp, 0));
 574    }
 575
 576    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 577       /* result[3] = 1.0; */
 578       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 579    }
 580
 581    if (need_tmp) {
 582       brw_MOV(p, dst, tmp);
 583       release_tmp(c, tmp);
 584    }
 585 }
 586
 587
 588 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 589  */
 590 static void emit_dst_noalias( struct brw_vs_compile *c,
 591                               struct brw_reg dst,
 592                               struct brw_reg arg0,
 593                               struct brw_reg arg1)
 594 {
 595    struct brw_compile *p = &c->func;
 596
 597    /* There must be a better way to do this:
 598     */
 599    if (dst.dw1.bits.writemask & WRITEMASK_X)
 600       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 601    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 602       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 603    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 604       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 605    if (dst.dw1.bits.writemask & WRITEMASK_W)
 606       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 607 }
 608
 609
 610 static void emit_xpd( struct brw_compile *p,
 611                       struct brw_reg dst,
 612                       struct brw_reg t,
 613                       struct brw_reg u)
 614 {
 615    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 616    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 617 }
 618
 619
 620 static void emit_lit_noalias( struct brw_vs_compile *c,
 621                               struct brw_reg dst,
 622                               struct brw_reg arg0 )
 623 {
 624    struct brw_compile *p = &c->func;
 625    struct brw_instruction *if_insn;
 626    struct brw_reg tmp = dst;
 627    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 628
 629    if (need_tmp)
 630       tmp = get_tmp(c);
 631
 632    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 633    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 634
 635    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 636     * to get all channels active inside the IF.  In the clipping code
 637     * we run with NoMask, so it's not an option and we can use
 638     * BRW_EXECUTE_1 for all comparisions.
 639     */
 640    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 641    if_insn = brw_IF(p, BRW_EXECUTE_8);
 642    {
 643       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 644
 645       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 646       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 647       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 648
 649       emit_math2(c,
 650                  BRW_MATH_FUNCTION_POW,
 651                  brw_writemask(dst, WRITEMASK_Z),
 652                  brw_swizzle1(tmp, 2),
 653                  brw_swizzle1(arg0, 3),
 654                  BRW_MATH_PRECISION_PARTIAL);
 655    }
 656
 657    brw_ENDIF(p, if_insn);
 658 }
 659
 660 static void emit_lrp_noalias(struct brw_vs_compile *c,
 661                              struct brw_reg dst,
 662                              struct brw_reg arg0,
 663                              struct brw_reg arg1,
 664                              struct brw_reg arg2)
 665 {
 666    struct brw_compile *p = &c->func;
 667
 668    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 669    brw_MUL(p, brw_null_reg(), dst, arg2);
 670    brw_MAC(p, dst, arg0, arg1);
 671 }
 672
 673 /** 3 or 4-component vector normalization */
 674 static void emit_nrm( struct brw_vs_compile *c,
 675                       struct brw_reg dst,
 676                       struct brw_reg arg0,
 677                       int num_comps)
 678 {
 679    struct brw_compile *p = &c->func;
 680    struct brw_reg tmp = get_tmp(c);
 681
 682    /* tmp = dot(arg0, arg0) */
 683    if (num_comps == 3)
 684       brw_DP3(p, tmp, arg0, arg0);
 685    else
 686       brw_DP4(p, tmp, arg0, arg0);
 687
 688    /* tmp = 1 / sqrt(tmp) */
 689    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 690
 691    /* dst = arg0 * tmp */
 692    brw_MUL(p, dst, arg0, tmp);
 693
 694    release_tmp(c, tmp);
 695 }
 696
 697
 698 static struct brw_reg
 699 get_constant(struct brw_vs_compile *c,
 700              const struct prog_instruction *inst,
 701              GLuint argIndex)
 702 {
 703    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 704    struct brw_compile *p = &c->func;
 705    struct brw_reg const_reg;
 706
 707    if (c->current_const[argIndex].index != src->Index) {
 708       struct brw_reg src_reg = get_tmp(c);
 709       struct brw_reg t = get_tmp(c);
 710
 711       c->current_const[argIndex].index = src->Index;
 712
 713       brw_MOV(p, t, brw_vec8_grf(0, 0));/*SAVE*/
 714
 715 #if 0
 716       printf("  fetch const[%d] for arg %d into reg %d\n",
 717              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 718 #endif
 719
 720       /* need to fetch the constant now */
 721       brw_dp_READ_4_vs(p,
 722                        c->current_const[argIndex].reg, /* writeback dest */
 723                        src_reg,                        /* src reg */
 724                        1,                              /* msg_reg */
 725                        src->RelAddr,                   /* relative indexing? */
 726                        16 * src->Index,                /* byte offset */
 727                        SURF_INDEX_VERT_CONST_BUFFER    /* binding table index */
 728                        );
 729
 730       brw_MOV(p, brw_vec8_grf(0, 0), t);/*RESTORE*/
 731       release_tmp(c, src_reg);
 732       release_tmp(c, t);
 733    }
 734
 735    /* replicate lower four floats into upper four floats (to get XYZWXYZW) */
 736    const_reg = c->current_const[argIndex].reg;
 737    const_reg = stride(const_reg, 0, 4, 0);
 738    const_reg.subnr = 0;
 739
 740    return const_reg;
 741 }
 742
 743
 744
 745 /* TODO: relative addressing!
 746  */
 747 static struct brw_reg get_reg( struct brw_vs_compile *c,
 748                                gl_register_file file,
 749                                GLuint index )
 750 {
 751    switch (file) {
 752    case PROGRAM_TEMPORARY:
 753    case PROGRAM_INPUT:
 754    case PROGRAM_OUTPUT:
 755       assert(c->regs[file][index].nr != 0);
 756       return c->regs[file][index];
 757    case PROGRAM_STATE_VAR:
 758    case PROGRAM_CONSTANT:
 759    case PROGRAM_UNIFORM:
 760       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 761       return c->regs[PROGRAM_STATE_VAR][index];
 762    case PROGRAM_ADDRESS:
 763       assert(index == 0);
 764       return c->regs[file][index];
 765
 766    case PROGRAM_UNDEFINED:                      /* undef values */
 767       return brw_null_reg();
 768
 769    case PROGRAM_LOCAL_PARAM:
 770    case PROGRAM_ENV_PARAM:
 771    case PROGRAM_WRITE_ONLY:
 772    default:
 773       assert(0);
 774       return brw_null_reg();
 775    }
 776 }
 777
 778
 779 /**
 780  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 781  * TODO: relative addressing!
 782  */
 783 static struct brw_reg
 784 get_src_reg( struct brw_vs_compile *c,
 785              const struct prog_instruction *inst,
 786              GLuint argIndex )
 787 {
 788    const GLuint file = inst->SrcReg[argIndex].File;
 789    const GLint index = inst->SrcReg[argIndex].Index;
 790
 791    switch (file) {
 792    case PROGRAM_TEMPORARY:
 793    case PROGRAM_INPUT:
 794    case PROGRAM_OUTPUT:
 795       assert(c->regs[file][index].nr != 0);
 796       return c->regs[file][index];
 797    case PROGRAM_STATE_VAR:
 798    case PROGRAM_CONSTANT:
 799    case PROGRAM_UNIFORM:
 800       if (c->use_const_buffer) {
 801          return get_constant(c, inst, argIndex);
 802       }
 803       else {
 804          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 805          return c->regs[PROGRAM_STATE_VAR][index];
 806       }
 807    case PROGRAM_ADDRESS:
 808       assert(index == 0);
 809       return c->regs[file][index];
 810
 811    case PROGRAM_UNDEFINED:
 812       /* this is a normal case since we loop over all three src args */
 813       return brw_null_reg();
 814
 815    case PROGRAM_LOCAL_PARAM:
 816    case PROGRAM_ENV_PARAM:
 817    case PROGRAM_WRITE_ONLY:
 818    default:
 819       assert(0);
 820       return brw_null_reg();
 821    }
 822 }
 823
 824
 825 /**
 826  * Indirect addressing:  get reg[[arg] + offset].
 827  */
 828 static struct brw_reg deref( struct brw_vs_compile *c,
 829                              struct brw_reg arg,
 830                              GLint offset)
 831 {
 832    struct brw_compile *p = &c->func;
 833    struct brw_reg tmp = vec4(get_tmp(c));
 834    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 835    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 836    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 837    struct brw_reg indirect = brw_vec4_indirect(0,0);
 838
 839    {
 840       brw_push_insn_state(p);
 841       brw_set_access_mode(p, BRW_ALIGN_1);
 842
 843       /* This is pretty clunky - load the address register twice and
 844        * fetch each 4-dword value in turn.  There must be a way to do
 845        * this in a single pass, but I couldn't get it to work.
 846        */
 847       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 848       brw_MOV(p, tmp, indirect);
 849
 850       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 851       brw_MOV(p, suboffset(tmp, 4), indirect);
 852
 853       brw_pop_insn_state(p);
 854    }
 855
 856    return vec8(tmp);
 857 }
 858
 859
 860 static void emit_arl( struct brw_vs_compile *c,
 861                       struct brw_reg dst,
 862                       struct brw_reg arg0 )
 863 {
 864    struct brw_compile *p = &c->func;
 865    struct brw_reg tmp = dst;
 866    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 867
 868    if (need_tmp)
 869       tmp = get_tmp(c);
 870
 871    brw_RNDD(p, tmp, arg0);
 872    brw_MUL(p, dst, tmp, brw_imm_d(16));
 873
 874    if (need_tmp)
 875       release_tmp(c, tmp);
 876 }
 877
 878
 879 /**
 880  * Return the brw reg for the given instruction's src argument.
 881  * Will return mangled results for SWZ op.  The emit_swz() function
 882  * ignores this result and recalculates taking extended swizzles into
 883  * account.
 884  */
 885 static struct brw_reg get_arg( struct brw_vs_compile *c,
 886                                const struct prog_instruction *inst,
 887                                GLuint argIndex )
 888 {
 889    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 890    struct brw_reg reg;
 891
 892    if (src->File == PROGRAM_UNDEFINED)
 893       return brw_null_reg();
 894
 895    if (src->RelAddr) {
 896       /* XXX fix */
 897       reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
 898    }
 899    else {
 900       reg = get_src_reg(c, inst, argIndex);
 901    }
 902
 903    /* Convert 3-bit swizzle to 2-bit.
 904     */
 905    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
 906                                        GET_SWZ(src->Swizzle, 1),
 907                                        GET_SWZ(src->Swizzle, 2),
 908                                        GET_SWZ(src->Swizzle, 3));
 909
 910    /* Note this is ok for non-swizzle instructions:
 911     */
 912    reg.negate = src->NegateBase ? 1 : 0;
 913
 914    return reg;
 915 }
 916
 917
 918 /**
 919  * Get brw register for the given program dest register.
 920  */
 921 static struct brw_reg get_dst( struct brw_vs_compile *c,
 922                                struct prog_dst_register dst )
 923 {
 924    struct brw_reg reg;
 925
 926    switch (dst.File) {
 927    case PROGRAM_TEMPORARY:
 928    case PROGRAM_OUTPUT:
 929       assert(c->regs[dst.File][dst.Index].nr != 0);
 930       reg = c->regs[dst.File][dst.Index];
 931       break;
 932    case PROGRAM_UNDEFINED:
 933       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
 934       reg = brw_null_reg();
 935       break;
 936    default:
 937       assert(0);
 938       reg = brw_null_reg();
 939    }
 940
 941    reg.dw1.bits.writemask = dst.WriteMask;
 942
 943    return reg;
 944 }
 945
 946
 947 static void emit_swz( struct brw_vs_compile *c,
 948                       struct brw_reg dst,
 949                       const struct prog_instruction *inst)
 950 {
 951    const GLuint argIndex = 0;
 952    const struct prog_src_register src = inst->SrcReg[argIndex];
 953    struct brw_compile *p = &c->func;
 954    GLuint zeros_mask = 0;
 955    GLuint ones_mask = 0;
 956    GLuint src_mask = 0;
 957    GLubyte src_swz[4];
 958    GLboolean need_tmp = (src.NegateBase &&
 959                          dst.file != BRW_GENERAL_REGISTER_FILE);
 960    struct brw_reg tmp = dst;
 961    GLuint i;
 962
 963    if (need_tmp)
 964       tmp = get_tmp(c);
 965
 966    for (i = 0; i < 4; i++) {
 967       if (dst.dw1.bits.writemask & (1<<i)) {
 968          GLubyte s = GET_SWZ(src.Swizzle, i);
 969          switch (s) {
 970          case SWIZZLE_X:
 971          case SWIZZLE_Y:
 972          case SWIZZLE_Z:
 973          case SWIZZLE_W:
 974             src_mask |= 1<<i;
 975             src_swz[i] = s;
 976             break;
 977          case SWIZZLE_ZERO:
 978             zeros_mask |= 1<<i;
 979             break;
 980          case SWIZZLE_ONE:
 981             ones_mask |= 1<<i;
 982             break;
 983          }
 984       }
 985    }
 986
 987    /* Do src first, in case dst aliases src:
 988     */
 989    if (src_mask) {
 990       struct brw_reg arg0;
 991
 992       if (src.RelAddr)
 993          arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 994       else
 995          arg0 = get_src_reg(c, inst, argIndex);
 996
 997       arg0 = brw_swizzle(arg0,
 998                          src_swz[0], src_swz[1],
 999                          src_swz[2], src_swz[3]);
1000
1001       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1002    }
1003
1004    if (zeros_mask)
1005       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1006
1007    if (ones_mask)
1008       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1009
1010    if (src.NegateBase)
1011       brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
1012
1013    if (need_tmp) {
1014       brw_MOV(p, dst, tmp);
1015       release_tmp(c, tmp);
1016    }
1017 }
1018
1019
1020 /**
1021  * Post-vertex-program processing.  Send the results to the URB.
1022  */
1023 static void emit_vertex_write( struct brw_vs_compile *c)
1024 {
1025    struct brw_compile *p = &c->func;
1026    struct brw_reg m0 = brw_message_reg(0);
1027    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1028    struct brw_reg ndc;
1029
1030    if (c->key.copy_edgeflag) {
1031       brw_MOV(p,
1032               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1033               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1034    }
1035
1036    /* Build ndc coords */
1037    ndc = get_tmp(c);
1038    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1039    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1040
1041    /* Update the header for point size, user clipping flags, and -ve rhw
1042     * workaround.
1043     */
1044    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1045        c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1046    {
1047       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1048       GLuint i;
1049
1050       brw_MOV(p, header1, brw_imm_ud(0));
1051
1052       brw_set_access_mode(p, BRW_ALIGN_16);
1053
1054       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1055          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1056          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1057          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1058       }
1059
1060       for (i = 0; i < c->key.nr_userclip; i++) {
1061          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1062          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1063          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1064          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1065       }
1066
1067       /* i965 clipping workaround:
1068        * 1) Test for -ve rhw
1069        * 2) If set,
1070        *      set ndc = (0,0,0,0)
1071        *      set ucp[6] = 1
1072        *
1073        * Later, clipping will detect ucp[6] and ensure the primitive is
1074        * clipped against all fixed planes.
1075        */
1076       if (!BRW_IS_G4X(p->brw)) {
1077          brw_CMP(p,
1078                  vec8(brw_null_reg()),
1079                  BRW_CONDITIONAL_L,
1080                  brw_swizzle1(ndc, 3),
1081                  brw_imm_f(0));
1082
1083          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1084          brw_MOV(p, ndc, brw_imm_f(0));
1085          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1086       }
1087
1088       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1089       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1090       brw_set_access_mode(p, BRW_ALIGN_16);
1091
1092       release_tmp(c, header1);
1093    }
1094    else {
1095       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1096    }
1097
1098    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1099     * of zeros followed by two sets of NDC coordinates:
1100     */
1101    brw_set_access_mode(p, BRW_ALIGN_1);
1102    brw_MOV(p, offset(m0, 2), ndc);
1103    brw_MOV(p, offset(m0, 3), pos);
1104
1105    brw_urb_WRITE(p,
1106                  brw_null_reg(), /* dest */
1107                  0,             /* starting mrf reg nr */
1108                  c->r0,         /* src */
1109                  0,             /* allocate */
1110                  1,             /* used */
1111                  c->nr_outputs + 3, /* msg len */
1112                  0,             /* response len */
1113                  1,             /* eot */
1114                  1,             /* writes complete */
1115                  0,             /* urb destination offset */
1116                  BRW_URB_SWIZZLE_INTERLEAVE);
1117 }
1118
1119
1120 /**
1121  * Called after code generation to resolve subroutine calls and the
1122  * END instruction.
1123  * \param end_inst  points to brw code for END instruction
1124  * \param last_inst  points to last instruction emitted before vertex write
1125  */
1126 static void
1127 post_vs_emit( struct brw_vs_compile *c,
1128               struct brw_instruction *end_inst,
1129               struct brw_instruction *last_inst )
1130 {
1131    GLint offset;
1132
1133    brw_resolve_cals(&c->func);
1134
1135    /* patch up the END code to jump past subroutines, etc */
1136    offset = last_inst - end_inst;
1137    brw_set_src1(end_inst, brw_imm_d(offset * 16));
1138 }
1139
1140
1141 /* Emit the vertex program instructions here.
1142  */
1143 void brw_vs_emit(struct brw_vs_compile *c )
1144 {
1145 #define MAX_IFSN 32
1146    struct brw_compile *p = &c->func;
1147    GLuint nr_insns = c->vp->program.Base.NumInstructions;
1148    GLuint insn, if_insn = 0;
1149    GLuint end_offset = 0;
1150    struct brw_instruction *end_inst, *last_inst;
1151    struct brw_instruction *if_inst[MAX_IFSN];
1152    struct brw_indirect stack_index = brw_indirect(0, 0);
1153
1154    GLuint index;
1155    GLuint file;
1156
1157    if (INTEL_DEBUG & DEBUG_VS) {
1158       _mesa_printf("vs-emit:\n");
1159       _mesa_print_program(&c->vp->program.Base);
1160       _mesa_printf("\n");
1161    }
1162
1163    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1164    brw_set_access_mode(p, BRW_ALIGN_16);
1165
1166    /* Message registers can't be read, so copy the output into GRF register
1167       if they are used in source registers */
1168    for (insn = 0; insn < nr_insns; insn++) {
1169        GLuint i;
1170        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1171        for (i = 0; i < 3; i++) {
1172            struct prog_src_register *src = &inst->SrcReg[i];
1173            GLuint index = src->Index;
1174            GLuint file = src->File;
1175            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1176                c->output_regs[index].used_in_src = GL_TRUE;
1177        }
1178    }
1179
1180    /* Static register allocation
1181     */
1182    brw_vs_alloc_regs(c);
1183    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1184
1185    for (insn = 0; insn < nr_insns; insn++) {
1186
1187       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1188       struct brw_reg args[3], dst;
1189       GLuint i;
1190
1191       /* Get argument regs.  SWZ is special and does this itself.
1192        */
1193       if (inst->Opcode != OPCODE_SWZ)
1194           for (i = 0; i < 3; i++) {
1195               struct prog_src_register *src = &inst->SrcReg[i];
1196               index = src->Index;
1197               file = src->File;
1198               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1199                   args[i] = c->output_regs[index].reg;
1200               else
1201                   args[i] = get_arg(c, inst, i);
1202           }
1203
1204       /* Get dest regs.  Note that it is possible for a reg to be both
1205        * dst and arg, given the static allocation of registers.  So
1206        * care needs to be taken emitting multi-operation instructions.
1207        */
1208       index = inst->DstReg.Index;
1209       file = inst->DstReg.File;
1210       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1211           dst = c->output_regs[index].reg;
1212       else
1213           dst = get_dst(c, inst->DstReg);
1214
1215       if (inst->SaturateMode != SATURATE_OFF) {
1216          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1217                        inst->SaturateMode);
1218       }
1219
1220       switch (inst->Opcode) {
1221       case OPCODE_ABS:
1222          brw_MOV(p, dst, brw_abs(args[0]));
1223          break;
1224       case OPCODE_ADD:
1225          brw_ADD(p, dst, args[0], args[1]);
1226          break;
1227       case OPCODE_COS:
1228          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1229          break;
1230       case OPCODE_DP3:
1231          brw_DP3(p, dst, args[0], args[1]);
1232          break;
1233       case OPCODE_DP4:
1234          brw_DP4(p, dst, args[0], args[1]);
1235          break;
1236       case OPCODE_DPH:
1237          brw_DPH(p, dst, args[0], args[1]);
1238          break;
1239       case OPCODE_NRM3:
1240          emit_nrm(c, dst, args[0], 3);
1241          break;
1242       case OPCODE_NRM4:
1243          emit_nrm(c, dst, args[0], 4);
1244          break;
1245       case OPCODE_DST:
1246          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1247          break;
1248       case OPCODE_EXP:
1249          unalias1(c, dst, args[0], emit_exp_noalias);
1250          break;
1251       case OPCODE_EX2:
1252          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1253          break;
1254       case OPCODE_ARL:
1255          emit_arl(c, dst, args[0]);
1256          break;
1257       case OPCODE_FLR:
1258          brw_RNDD(p, dst, args[0]);
1259          break;
1260       case OPCODE_FRC:
1261          brw_FRC(p, dst, args[0]);
1262          break;
1263       case OPCODE_LOG:
1264          unalias1(c, dst, args[0], emit_log_noalias);
1265          break;
1266       case OPCODE_LG2:
1267          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1268          break;
1269       case OPCODE_LIT:
1270          unalias1(c, dst, args[0], emit_lit_noalias);
1271          break;
1272       case OPCODE_LRP:
1273          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1274          break;
1275       case OPCODE_MAD:
1276          brw_MOV(p, brw_acc_reg(), args[2]);
1277          brw_MAC(p, dst, args[0], args[1]);
1278          break;
1279       case OPCODE_MAX:
1280          emit_max(p, dst, args[0], args[1]);
1281          break;
1282       case OPCODE_MIN:
1283          emit_min(p, dst, args[0], args[1]);
1284          break;
1285       case OPCODE_MOV:
1286          brw_MOV(p, dst, args[0]);
1287          break;
1288       case OPCODE_MUL:
1289          brw_MUL(p, dst, args[0], args[1]);
1290          break;
1291       case OPCODE_POW:
1292          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1293          break;
1294       case OPCODE_RCP:
1295          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1296          break;
1297       case OPCODE_RSQ:
1298          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1299          break;
1300
1301       case OPCODE_SEQ:
1302          emit_seq(p, dst, args[0], args[1]);
1303          break;
1304       case OPCODE_SIN:
1305          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1306          break;
1307       case OPCODE_SNE:
1308          emit_sne(p, dst, args[0], args[1]);
1309          break;
1310       case OPCODE_SGE:
1311          emit_sge(p, dst, args[0], args[1]);
1312          break;
1313       case OPCODE_SGT:
1314          emit_sgt(p, dst, args[0], args[1]);
1315          break;
1316       case OPCODE_SLT:
1317          emit_slt(p, dst, args[0], args[1]);
1318          break;
1319       case OPCODE_SLE:
1320          emit_sle(p, dst, args[0], args[1]);
1321          break;
1322       case OPCODE_SUB:
1323          brw_ADD(p, dst, args[0], negate(args[1]));
1324          break;
1325       case OPCODE_SWZ:
1326          /* The args[0] value can't be used here as it won't have
1327           * correctly encoded the full swizzle:
1328           */
1329          emit_swz(c, dst, inst);
1330          break;
1331       case OPCODE_TRUNC:
1332          /* round toward zero */
1333          brw_RNDZ(p, dst, args[0]);
1334          break;
1335       case OPCODE_XPD:
1336          emit_xpd(p, dst, args[0], args[1]);
1337          break;
1338       case OPCODE_IF:
1339          assert(if_insn < MAX_IFSN);
1340          if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1341          break;
1342       case OPCODE_ELSE:
1343          if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1344          break;
1345       case OPCODE_ENDIF:
1346          assert(if_insn > 0);
1347          brw_ENDIF(p, if_inst[--if_insn]);
1348          break;
1349       case OPCODE_BRA:
1350          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1351          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1352          brw_set_predicate_control_flag_value(p, 0xff);
1353          break;
1354       case OPCODE_CAL:
1355          brw_set_access_mode(p, BRW_ALIGN_1);
1356          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1357          brw_set_access_mode(p, BRW_ALIGN_16);
1358          brw_ADD(p, get_addr_reg(stack_index),
1359                          get_addr_reg(stack_index), brw_imm_d(4));
1360          brw_save_call(p, inst->Comment, p->nr_insn);
1361          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1362          break;
1363       case OPCODE_RET:
1364          brw_ADD(p, get_addr_reg(stack_index),
1365                          get_addr_reg(stack_index), brw_imm_d(-4));
1366          brw_set_access_mode(p, BRW_ALIGN_1);
1367          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1368          brw_set_access_mode(p, BRW_ALIGN_16);
1369          break;
1370       case OPCODE_END:
1371          end_offset = p->nr_insn;
1372          /* this instruction will get patched later to jump past subroutine
1373           * code, etc.
1374           */
1375          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1376          break;
1377       case OPCODE_PRINT:
1378          /* no-op */
1379          break;
1380       case OPCODE_BGNSUB:
1381          brw_save_label(p, inst->Comment, p->nr_insn);
1382          break;
1383       case OPCODE_ENDSUB:
1384          /* no-op */
1385          break;
1386       default:
1387          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1388                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1389                                     _mesa_opcode_string(inst->Opcode) :
1390                                     "unknown");
1391       }
1392
1393       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1394           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1395           && c->output_regs[inst->DstReg.Index].used_in_src) {
1396          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1397       }
1398
1399       /* Result color clamping.
1400        *
1401        * When destination register is an output register and
1402        * it's primary/secondary front/back color, we have to clamp
1403        * the result to [0,1]. This is done by enabling the
1404        * saturation bit for the last instruction.
1405        *
1406        * We don't use brw_set_saturate() as it modifies
1407        * p->current->header.saturate, which affects all the subsequent
1408        * instructions. Instead, we directly modify the header
1409        * of the last (already stored) instruction.
1410        */
1411       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1412          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1413              || (inst->DstReg.Index == VERT_RESULT_COL1)
1414              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1415              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1416             p->store[p->nr_insn-1].header.saturate = 1;
1417          }
1418       }
1419
1420       release_tmps(c);
1421    }
1422
1423    end_inst = &p->store[end_offset];
1424    last_inst = &p->store[p->nr_insn];
1425
1426    /* The END instruction will be patched to jump to this code */
1427    emit_vertex_write(c);
1428
1429    post_vs_emit(c, end_inst, last_inst);
1430 }