src/gallium/drivers/i965simple/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32 #include "brw_context.h"
  33 #include "brw_vs.h"
  34
  35 #include "pipe/p_shader_tokens.h"
  36 #include "tgsi/util/tgsi_parse.h"
  37
  38 struct brw_prog_info {
  39    unsigned num_temps;
  40    unsigned num_addrs;
  41    unsigned num_consts;
  42
  43    unsigned writes_psize;
  44
  45    unsigned pos_idx;
  46    unsigned result_edge_idx;
  47    unsigned edge_flag_idx;
  48    unsigned psize_idx;
  49 };
  50
  51 /* Do things as simply as possible.  Allocate and populate all regs
  52  * ahead of time.
  53  */
  54 static void brw_vs_alloc_regs( struct brw_vs_compile *c,
  55                                struct brw_prog_info *info )
  56 {
  57    unsigned i, reg = 0, mrf;
  58    unsigned nr_params;
  59
  60    /* r0 -- reserved as usual
  61     */
  62    c->r0 = brw_vec8_grf(reg, 0); reg++;
  63
  64    /* User clip planes from curbe:
  65     */
  66    if (c->key.nr_userclip) {
  67       for (i = 0; i < c->key.nr_userclip; i++) {
  68          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  69       }
  70
  71       /* Deal with curbe alignment:
  72        */
  73       reg += ((6+c->key.nr_userclip+3)/4)*2;
  74    }
  75
  76    /* Vertex program parameters from curbe:
  77     */
  78    nr_params = c->prog_data.max_const;
  79    for (i = 0; i < nr_params; i++) {
  80       c->regs[TGSI_FILE_CONSTANT][i] = stride(brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
  81    }
  82    reg += (nr_params+1)/2;
  83    c->prog_data.curb_read_length = reg - 1;
  84
  85
  86
  87    /* Allocate input regs:
  88     */
  89    c->nr_inputs = c->vp->info.num_inputs;
  90    for (i = 0; i < c->nr_inputs; i++) {
  91          c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
  92          reg++;
  93    }
  94
  95
  96    /* Allocate outputs: TODO: could organize the non-position outputs
  97     * to go straight into message regs.
  98     */
  99    c->nr_outputs = 0;
 100    c->first_output = reg;
 101    mrf = 4;
 102    for (i = 0; i < c->vp->info.num_outputs; i++) {
 103       c->nr_outputs++;
 104 #if 0
 105       if (i == VERT_RESULT_HPOS) {
 106          c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 107          reg++;
 108       }
 109       else if (i == VERT_RESULT_PSIZ) {
 110          c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 111          reg++;
 112          mrf++;         /* just a placeholder?  XXX fix later stages & remove this */
 113       }
 114       else {
 115          c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
 116          mrf++;
 117       }
 118 #else
 119       /*treat pos differently for now */
 120       if (i == info->pos_idx) {
 121          c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 122          reg++;
 123       } else {
 124          c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
 125          mrf++;
 126       }
 127 #endif
 128    }
 129
 130    /* Allocate program temporaries:
 131     */
 132    for (i = 0; i < info->num_temps; i++) {
 133       c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 134       reg++;
 135    }
 136
 137    /* Address reg(s).  Don't try to use the internal address reg until
 138     * deref time.
 139     */
 140    for (i = 0; i < info->num_addrs; i++) {
 141       c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 142                                                reg,
 143                                                0,
 144                                                BRW_REGISTER_TYPE_D,
 145                                                BRW_VERTICAL_STRIDE_8,
 146                                                BRW_WIDTH_8,
 147                                                BRW_HORIZONTAL_STRIDE_1,
 148                                                BRW_SWIZZLE_XXXX,
 149                                                TGSI_WRITEMASK_X);
 150       reg++;
 151    }
 152
 153    for (i = 0; i < 128; i++) {
 154       if (c->output_regs[i].used_in_src) {
 155          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 156          reg++;
 157       }
 158    }
 159
 160    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 161    reg += 2;
 162
 163
 164    /* Some opcodes need an internal temporary:
 165     */
 166    c->first_tmp = reg;
 167    c->last_tmp = reg;           /* for allocation purposes */
 168
 169    /* Each input reg holds data from two vertices.  The
 170     * urb_read_length is the number of registers read from *each*
 171     * vertex urb, so is half the amount:
 172     */
 173    c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
 174
 175    c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
 176    c->prog_data.total_grf = reg;
 177 }
 178
 179
 180 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 181 {
 182    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
 183
 184    if (++c->last_tmp > c->prog_data.total_grf)
 185       c->prog_data.total_grf = c->last_tmp;
 186
 187    return tmp;
 188 }
 189
 190 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
 191 {
 192    if (tmp.nr == c->last_tmp-1)
 193       c->last_tmp--;
 194 }
 195
 196 static void release_tmps( struct brw_vs_compile *c )
 197 {
 198    c->last_tmp = c->first_tmp;
 199 }
 200
 201
 202 static void unalias1( struct brw_vs_compile *c,
 203                       struct brw_reg dst,
 204                       struct brw_reg arg0,
 205                       void (*func)( struct brw_vs_compile *,
 206                                     struct brw_reg,
 207                                     struct brw_reg ))
 208 {
 209    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 210       struct brw_compile *p = &c->func;
 211       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 212       func(c, tmp, arg0);
 213       brw_MOV(p, dst, tmp);
 214    }
 215    else {
 216       func(c, dst, arg0);
 217    }
 218 }
 219
 220 static void unalias2( struct brw_vs_compile *c,
 221                       struct brw_reg dst,
 222                       struct brw_reg arg0,
 223                       struct brw_reg arg1,
 224                       void (*func)( struct brw_vs_compile *,
 225                                     struct brw_reg,
 226                                     struct brw_reg,
 227                                     struct brw_reg ))
 228 {
 229    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 230        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 231       struct brw_compile *p = &c->func;
 232       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 233       func(c, tmp, arg0, arg1);
 234       brw_MOV(p, dst, tmp);
 235    }
 236    else {
 237       func(c, dst, arg0, arg1);
 238    }
 239 }
 240
 241 static void emit_sop( struct brw_compile *p,
 242                       struct brw_reg dst,
 243                       struct brw_reg arg0,
 244                       struct brw_reg arg1,
 245                       unsigned cond)
 246 {
 247    brw_push_insn_state(p);
 248    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 249    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 250    brw_MOV(p, dst, brw_imm_f(1.0f));
 251    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 252    brw_MOV(p, dst, brw_imm_f(0.0f));
 253    brw_pop_insn_state(p);
 254 }
 255
 256 static void emit_seq( struct brw_compile *p,
 257                       struct brw_reg dst,
 258                       struct brw_reg arg0,
 259                       struct brw_reg arg1 )
 260 {
 261    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 262 }
 263
 264 static void emit_sne( struct brw_compile *p,
 265                       struct brw_reg dst,
 266                       struct brw_reg arg0,
 267                       struct brw_reg arg1 )
 268 {
 269    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 270 }
 271 static void emit_slt( struct brw_compile *p,
 272                       struct brw_reg dst,
 273                       struct brw_reg arg0,
 274                       struct brw_reg arg1 )
 275 {
 276    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
 277 }
 278
 279 static void emit_sle( struct brw_compile *p,
 280                       struct brw_reg dst,
 281                       struct brw_reg arg0,
 282                       struct brw_reg arg1 )
 283 {
 284    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 285 }
 286
 287 static void emit_sgt( struct brw_compile *p,
 288                       struct brw_reg dst,
 289                       struct brw_reg arg0,
 290                       struct brw_reg arg1 )
 291 {
 292    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
 293 }
 294
 295 static void emit_sge( struct brw_compile *p,
 296                       struct brw_reg dst,
 297                       struct brw_reg arg0,
 298                       struct brw_reg arg1 )
 299 {
 300   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 301 }
 302
 303 static void emit_max( struct brw_compile *p,
 304                       struct brw_reg dst,
 305                       struct brw_reg arg0,
 306                       struct brw_reg arg1 )
 307 {
 308    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 309    brw_SEL(p, dst, arg1, arg0);
 310    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 311 }
 312
 313 static void emit_min( struct brw_compile *p,
 314                       struct brw_reg dst,
 315                       struct brw_reg arg0,
 316                       struct brw_reg arg1 )
 317 {
 318    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 319    brw_SEL(p, dst, arg0, arg1);
 320    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 321 }
 322
 323
 324 static void emit_math1( struct brw_vs_compile *c,
 325                         unsigned function,
 326                         struct brw_reg dst,
 327                         struct brw_reg arg0,
 328                         unsigned precision)
 329 {
 330    /* There are various odd behaviours with SEND on the simulator.  In
 331     * addition there are documented issues with the fact that the GEN4
 332     * processor doesn't do dependency control properly on SEND
 333     * results.  So, on balance, this kludge to get around failures
 334     * with writemasked math results looks like it might be necessary
 335     * whether that turns out to be a simulator bug or not:
 336     */
 337    struct brw_compile *p = &c->func;
 338    struct brw_reg tmp = dst;
 339    boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 340                          dst.file != BRW_GENERAL_REGISTER_FILE);
 341
 342    if (need_tmp)
 343       tmp = get_tmp(c);
 344
 345    brw_math(p,
 346             tmp,
 347             function,
 348             BRW_MATH_SATURATE_NONE,
 349             2,
 350             arg0,
 351             BRW_MATH_DATA_SCALAR,
 352             precision);
 353
 354    if (need_tmp) {
 355       brw_MOV(p, dst, tmp);
 356       release_tmp(c, tmp);
 357    }
 358 }
 359
 360 static void emit_math2( struct brw_vs_compile *c,
 361                         unsigned function,
 362                         struct brw_reg dst,
 363                         struct brw_reg arg0,
 364                         struct brw_reg arg1,
 365                         unsigned precision)
 366 {
 367    struct brw_compile *p = &c->func;
 368    struct brw_reg tmp = dst;
 369    boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 370                          dst.file != BRW_GENERAL_REGISTER_FILE);
 371
 372    if (need_tmp)
 373       tmp = get_tmp(c);
 374
 375    brw_MOV(p, brw_message_reg(3), arg1);
 376
 377    brw_math(p,
 378             tmp,
 379             function,
 380             BRW_MATH_SATURATE_NONE,
 381             2,
 382             arg0,
 383             BRW_MATH_DATA_SCALAR,
 384             precision);
 385
 386    if (need_tmp) {
 387       brw_MOV(p, dst, tmp);
 388       release_tmp(c, tmp);
 389    }
 390 }
 391
 392
 393
 394 static void emit_exp_noalias( struct brw_vs_compile *c,
 395                               struct brw_reg dst,
 396                               struct brw_reg arg0 )
 397 {
 398    struct brw_compile *p = &c->func;
 399
 400
 401    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) {
 402       struct brw_reg tmp = get_tmp(c);
 403       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 404
 405       /* tmp_d = floor(arg0.x) */
 406       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 407
 408       /* result[0] = 2.0 ^ tmp */
 409
 410       /* Adjust exponent for floating point:
 411        * exp += 127
 412        */
 413       brw_ADD(p, brw_writemask(tmp_d, TGSI_WRITEMASK_X), tmp_d, brw_imm_d(127));
 414
 415       /* Install exponent and sign.
 416        * Excess drops off the edge:
 417        */
 418       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), TGSI_WRITEMASK_X),
 419               tmp_d, brw_imm_d(23));
 420
 421       release_tmp(c, tmp);
 422    }
 423
 424    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) {
 425       /* result[1] = arg0.x - floor(arg0.x) */
 426       brw_FRC(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0, 0));
 427    }
 428
 429    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
 430       /* As with the LOG instruction, we might be better off just
 431        * doing a taylor expansion here, seeing as we have to do all
 432        * the prep work.
 433        *
 434        * If mathbox partial precision is too low, consider also:
 435        * result[3] = result[0] * EXP(result[1])
 436        */
 437       emit_math1(c,
 438                  BRW_MATH_FUNCTION_EXP,
 439                  brw_writemask(dst, TGSI_WRITEMASK_Z),
 440                  brw_swizzle1(arg0, 0),
 441                  BRW_MATH_PRECISION_PARTIAL);
 442    }
 443
 444    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
 445       /* result[3] = 1.0; */
 446       brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), brw_imm_f(1));
 447    }
 448 }
 449
 450
 451 static void emit_log_noalias( struct brw_vs_compile *c,
 452                               struct brw_reg dst,
 453                               struct brw_reg arg0 )
 454 {
 455    struct brw_compile *p = &c->func;
 456    struct brw_reg tmp = dst;
 457    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 458    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 459    boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 460                          dst.file != BRW_GENERAL_REGISTER_FILE);
 461
 462    if (need_tmp) {
 463       tmp = get_tmp(c);
 464       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 465    }
 466
 467    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 468     * according to spec:
 469     *
 470     * These almost look likey they could be joined up, but not really
 471     * practical:
 472     *
 473     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 474     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 475     */
 476    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_XZ) {
 477       brw_AND(p,
 478               brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
 479               brw_swizzle1(arg0_ud, 0),
 480               brw_imm_ud((1U<<31)-1));
 481
 482       brw_SHR(p,
 483               brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
 484               tmp_ud,
 485               brw_imm_ud(23));
 486
 487       brw_ADD(p,
 488               brw_writemask(tmp, TGSI_WRITEMASK_X),
 489               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 490               brw_imm_d(-127));
 491    }
 492
 493    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_YZ) {
 494       brw_AND(p,
 495               brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
 496               brw_swizzle1(arg0_ud, 0),
 497               brw_imm_ud((1<<23)-1));
 498
 499       brw_OR(p,
 500              brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
 501              tmp_ud,
 502              brw_imm_ud(127<<23));
 503    }
 504
 505    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
 506       /* result[2] = result[0] + LOG2(result[1]); */
 507
 508       /* Why bother?  The above is just a hint how to do this with a
 509        * taylor series.  Maybe we *should* use a taylor series as by
 510        * the time all the above has been done it's almost certainly
 511        * quicker than calling the mathbox, even with low precision.
 512        *
 513        * Options are:
 514        *    - result[0] + mathbox.LOG2(result[1])
 515        *    - mathbox.LOG2(arg0.x)
 516        *    - result[0] + inline_taylor_approx(result[1])
 517        */
 518       emit_math1(c,
 519                  BRW_MATH_FUNCTION_LOG,
 520                  brw_writemask(tmp, TGSI_WRITEMASK_Z),
 521                  brw_swizzle1(tmp, 1),
 522                  BRW_MATH_PRECISION_FULL);
 523
 524       brw_ADD(p,
 525               brw_writemask(tmp, TGSI_WRITEMASK_Z),
 526               brw_swizzle1(tmp, 2),
 527               brw_swizzle1(tmp, 0));
 528    }
 529
 530    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
 531       /* result[3] = 1.0; */
 532       brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_W), brw_imm_f(1));
 533    }
 534
 535    if (need_tmp) {
 536       brw_MOV(p, dst, tmp);
 537       release_tmp(c, tmp);
 538    }
 539 }
 540
 541
 542
 543
 544 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 545  */
 546 static void emit_dst_noalias( struct brw_vs_compile *c,
 547                               struct brw_reg dst,
 548                               struct brw_reg arg0,
 549                               struct brw_reg arg1)
 550 {
 551    struct brw_compile *p = &c->func;
 552
 553    /* There must be a better way to do this:
 554     */
 555    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X)
 556       brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_X), brw_imm_f(1.0));
 557    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y)
 558       brw_MUL(p, brw_writemask(dst, TGSI_WRITEMASK_Y), arg0, arg1);
 559    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z)
 560       brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Z), arg0);
 561    if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W)
 562       brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), arg1);
 563 }
 564
 565 static void emit_xpd( struct brw_compile *p,
 566                       struct brw_reg dst,
 567                       struct brw_reg t,
 568                       struct brw_reg u)
 569 {
 570    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 571    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 572 }
 573
 574
 575
 576 static void emit_lit_noalias( struct brw_vs_compile *c,
 577                               struct brw_reg dst,
 578                               struct brw_reg arg0 )
 579 {
 580    struct brw_compile *p = &c->func;
 581    struct brw_instruction *if_insn;
 582    struct brw_reg tmp = dst;
 583    boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 584
 585    if (need_tmp)
 586       tmp = get_tmp(c);
 587
 588    brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_YZ), brw_imm_f(0));
 589    brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_XW), brw_imm_f(1));
 590
 591    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 592     * to get all channels active inside the IF.  In the clipping code
 593     * we run with NoMask, so it's not an option and we can use
 594     * BRW_EXECUTE_1 for all comparisions.
 595     */
 596    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 597    if_insn = brw_IF(p, BRW_EXECUTE_8);
 598    {
 599       brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0,0));
 600
 601       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 602       brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_Z),  brw_swizzle1(arg0,1));
 603       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 604
 605       emit_math2(c,
 606                  BRW_MATH_FUNCTION_POW,
 607                  brw_writemask(dst, TGSI_WRITEMASK_Z),
 608                  brw_swizzle1(tmp, 2),
 609                  brw_swizzle1(arg0, 3),
 610                  BRW_MATH_PRECISION_PARTIAL);
 611    }
 612
 613    brw_ENDIF(p, if_insn);
 614 }
 615
 616
 617
 618
 619
 620 /* TODO: relative addressing!
 621  */
 622 static struct brw_reg get_reg( struct brw_vs_compile *c,
 623                                unsigned file,
 624                                unsigned index )
 625 {
 626    switch (file) {
 627    case TGSI_FILE_TEMPORARY:
 628    case TGSI_FILE_INPUT:
 629    case TGSI_FILE_OUTPUT:
 630       assert(c->regs[file][index].nr != 0);
 631       return c->regs[file][index];
 632    case TGSI_FILE_CONSTANT:
 633       assert(c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm].nr != 0);
 634       return c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm];
 635    case TGSI_FILE_IMMEDIATE:
 636       assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
 637       return c->regs[TGSI_FILE_CONSTANT][index];
 638    case TGSI_FILE_ADDRESS:
 639       assert(index == 0);
 640       return c->regs[file][index];
 641
 642    case TGSI_FILE_NULL:                 /* undef values */
 643       return brw_null_reg();
 644
 645    default:
 646       assert(0);
 647       return brw_null_reg();
 648    }
 649 }
 650
 651
 652
 653 static struct brw_reg deref( struct brw_vs_compile *c,
 654                              struct brw_reg arg,
 655                              int offset)
 656 {
 657    struct brw_compile *p = &c->func;
 658    struct brw_reg tmp = vec4(get_tmp(c));
 659    struct brw_reg vp_address = retype(vec1(get_reg(c, TGSI_FILE_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
 660    unsigned byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 661    struct brw_reg indirect = brw_vec4_indirect(0,0);
 662
 663    {
 664       brw_push_insn_state(p);
 665       brw_set_access_mode(p, BRW_ALIGN_1);
 666
 667       /* This is pretty clunky - load the address register twice and
 668        * fetch each 4-dword value in turn.  There must be a way to do
 669        * this in a single pass, but I couldn't get it to work.
 670        */
 671       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 672       brw_MOV(p, tmp, indirect);
 673
 674       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 675       brw_MOV(p, suboffset(tmp, 4), indirect);
 676
 677       brw_pop_insn_state(p);
 678    }
 679
 680    return vec8(tmp);
 681 }
 682
 683
 684 static void emit_arl( struct brw_vs_compile *c,
 685                       struct brw_reg dst,
 686                       struct brw_reg arg0 )
 687 {
 688    struct brw_compile *p = &c->func;
 689    struct brw_reg tmp = dst;
 690    boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 691
 692    if (need_tmp)
 693       tmp = get_tmp(c);
 694
 695    brw_RNDD(p, tmp, arg0);
 696    brw_MUL(p, dst, tmp, brw_imm_d(16));
 697
 698    if (need_tmp)
 699       release_tmp(c, tmp);
 700 }
 701
 702
 703 /* Will return mangled results for SWZ op.  The emit_swz() function
 704  * ignores this result and recalculates taking extended swizzles into
 705  * account.
 706  */
 707 static struct brw_reg get_arg( struct brw_vs_compile *c,
 708                                struct tgsi_src_register *src )
 709 {
 710    struct brw_reg reg;
 711
 712    if (src->File == TGSI_FILE_NULL)
 713       return brw_null_reg();
 714
 715 #if 0
 716    if (src->RelAddr)
 717       reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
 718    else
 719 #endif
 720       reg = get_reg(c, src->File, src->Index);
 721
 722    /* Convert 3-bit swizzle to 2-bit.
 723     */
 724    reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SwizzleX,
 725                                        src->SwizzleY,
 726                                        src->SwizzleZ,
 727                                        src->SwizzleW);
 728
 729    /* Note this is ok for non-swizzle instructions:
 730     */
 731    reg.negate = src->Negate ? 1 : 0;
 732
 733    return reg;
 734 }
 735
 736
 737 static struct brw_reg get_dst( struct brw_vs_compile *c,
 738                                const struct tgsi_dst_register *dst )
 739 {
 740    struct brw_reg reg = get_reg(c, dst->File, dst->Index);
 741
 742    reg.dw1.bits.writemask = dst->WriteMask;
 743
 744    return reg;
 745 }
 746
 747
 748
 749
 750 static void emit_swz( struct brw_vs_compile *c,
 751                       struct brw_reg dst,
 752                       struct tgsi_src_register src )
 753 {
 754    struct brw_compile *p = &c->func;
 755    unsigned zeros_mask = 0;
 756    unsigned ones_mask = 0;
 757    unsigned src_mask = 0;
 758    ubyte src_swz[4];
 759    boolean need_tmp = (src.Negate &&
 760                          dst.file != BRW_GENERAL_REGISTER_FILE);
 761    struct brw_reg tmp = dst;
 762    unsigned i;
 763
 764    if (need_tmp)
 765       tmp = get_tmp(c);
 766
 767    for (i = 0; i < 4; i++) {
 768       if (dst.dw1.bits.writemask & (1<<i)) {
 769          ubyte s = 0;
 770          switch(i) {
 771          case 0:
 772             s = src.SwizzleX;
 773             break;
 774             s = src.SwizzleY;
 775          case 1:
 776             break;
 777             s = src.SwizzleZ;
 778          case 2:
 779             break;
 780             s = src.SwizzleW;
 781          case 3:
 782             break;
 783          }
 784          switch (s) {
 785          case TGSI_SWIZZLE_X:
 786          case TGSI_SWIZZLE_Y:
 787          case TGSI_SWIZZLE_Z:
 788          case TGSI_SWIZZLE_W:
 789             src_mask |= 1<<i;
 790             src_swz[i] = s;
 791             break;
 792          case TGSI_EXTSWIZZLE_ZERO:
 793             zeros_mask |= 1<<i;
 794             break;
 795          case TGSI_EXTSWIZZLE_ONE:
 796             ones_mask |= 1<<i;
 797             break;
 798          }
 799       }
 800    }
 801
 802    /* Do src first, in case dst aliases src:
 803     */
 804    if (src_mask) {
 805       struct brw_reg arg0;
 806
 807 #if 0
 808       if (src.RelAddr)
 809          arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 810       else
 811 #endif
 812          arg0 = get_reg(c, src.File, src.Index);
 813
 814       arg0 = brw_swizzle(arg0,
 815                          src_swz[0], src_swz[1],
 816                          src_swz[2], src_swz[3]);
 817
 818       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
 819    }
 820
 821    if (zeros_mask)
 822       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
 823
 824    if (ones_mask)
 825       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 826
 827    if (src.Negate)
 828       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
 829
 830    if (need_tmp) {
 831       brw_MOV(p, dst, tmp);
 832       release_tmp(c, tmp);
 833    }
 834 }
 835
 836
 837
 838 /* Post-vertex-program processing.  Send the results to the URB.
 839  */
 840 static void emit_vertex_write( struct brw_vs_compile *c, struct brw_prog_info *info)
 841 {
 842    struct brw_compile *p = &c->func;
 843    struct brw_reg m0 = brw_message_reg(0);
 844    struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][info->pos_idx];
 845    struct brw_reg ndc;
 846
 847    if (c->key.copy_edgeflag) {
 848       brw_MOV(p,
 849               get_reg(c, TGSI_FILE_OUTPUT, info->result_edge_idx),
 850               get_reg(c, TGSI_FILE_INPUT, info->edge_flag_idx));
 851    }
 852
 853
 854    /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
 855     */
 856    if (!c->key.know_w_is_one) {
 857       ndc = get_tmp(c);
 858       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
 859       brw_MUL(p, brw_writemask(ndc, TGSI_WRITEMASK_XYZ), pos, ndc);
 860    }
 861    else {
 862       ndc = pos;
 863    }
 864
 865    /* This includes the workaround for -ve rhw, so is no longer an
 866     * optional step:
 867     */
 868    if (info->writes_psize ||
 869        c->key.nr_userclip ||
 870        !c->key.know_w_is_one)
 871    {
 872       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
 873       unsigned i;
 874
 875       brw_MOV(p, header1, brw_imm_ud(0));
 876
 877       brw_set_access_mode(p, BRW_ALIGN_16);
 878
 879       if (info->writes_psize) {
 880          struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][info->psize_idx];
 881          brw_MUL(p, brw_writemask(header1, TGSI_WRITEMASK_W),
 882                  brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 883          brw_AND(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1,
 884                  brw_imm_ud(0x7ff<<8));
 885       }
 886
 887
 888       for (i = 0; i < c->key.nr_userclip; i++) {
 889          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 890          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
 891          brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<i));
 892          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 893       }
 894
 895
 896       /* i965 clipping workaround:
 897        * 1) Test for -ve rhw
 898        * 2) If set,
 899        *      set ndc = (0,0,0,0)
 900        *      set ucp[6] = 1
 901        *
 902        * Later, clipping will detect ucp[6] and ensure the primitive is
 903        * clipped against all fixed planes.
 904        */
 905       if (!c->key.know_w_is_one) {
 906          brw_CMP(p,
 907                  vec8(brw_null_reg()),
 908                  BRW_CONDITIONAL_L,
 909                  brw_swizzle1(ndc, 3),
 910                  brw_imm_f(0));
 911
 912          brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<6));
 913          brw_MOV(p, ndc, brw_imm_f(0));
 914          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 915       }
 916
 917       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
 918       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
 919       brw_set_access_mode(p, BRW_ALIGN_16);
 920
 921       release_tmp(c, header1);
 922    }
 923    else {
 924       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
 925    }
 926
 927
 928    /* Emit the (interleaved) headers for the two vertices - an 8-reg
 929     * of zeros followed by two sets of NDC coordinates:
 930     */
 931    brw_set_access_mode(p, BRW_ALIGN_1);
 932    brw_MOV(p, offset(m0, 2), ndc);
 933    brw_MOV(p, offset(m0, 3), pos);
 934
 935
 936    brw_urb_WRITE(p,
 937                  brw_null_reg(), /* dest */
 938                  0,             /* starting mrf reg nr */
 939                  c->r0,         /* src */
 940                  0,             /* allocate */
 941                  1,             /* used */
 942                  c->nr_outputs + 3, /* msg len */
 943                  0,             /* response len */
 944                  1,             /* eot */
 945                  1,             /* writes complete */
 946                  0,             /* urb destination offset */
 947                  BRW_URB_SWIZZLE_INTERLEAVE);
 948
 949 }
 950
 951 static void
 952 post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
 953 {
 954    struct tgsi_parse_context parse;
 955    const struct tgsi_token *tokens = c->vp->program.tokens;
 956    tgsi_parse_init(&parse, tokens);
 957    while (!tgsi_parse_end_of_tokens(&parse)) {
 958       tgsi_parse_token(&parse);
 959       if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
 960 #if 0
 961          struct brw_instruction *brw_inst1, *brw_inst2;
 962          const struct tgsi_full_instruction *inst1, *inst2;
 963          int offset;
 964          inst1 = &parse.FullToken.FullInstruction;
 965          brw_inst1 = inst1->Data;
 966          switch (inst1->Opcode) {
 967          case TGSI_OPCODE_CAL:
 968          case TGSI_OPCODE_BRA:
 969             target_insn = inst1->BranchTarget;
 970             inst2 = &c->vp->program.Base.Instructions[target_insn];
 971             brw_inst2 = inst2->Data;
 972             offset = brw_inst2 - brw_inst1;
 973             brw_set_src1(brw_inst1, brw_imm_d(offset*16));
 974             break;
 975          case TGSI_OPCODE_END:
 976             offset = end_inst - brw_inst1;
 977             brw_set_src1(brw_inst1, brw_imm_d(offset*16));
 978             break;
 979          default:
 980             break;
 981          }
 982 #endif
 983       }
 984    }
 985    tgsi_parse_free(&parse);
 986 }
 987
 988 static void process_declaration(const struct tgsi_full_declaration *decl,
 989                                 struct brw_prog_info *info)
 990 {
 991    int first = decl->u.DeclarationRange.First;
 992    int last = decl->u.DeclarationRange.Last;
 993
 994    assert (decl->Declaration.Declare != TGSI_DECLARE_MASK);
 995
 996    switch(decl->Declaration.File) {
 997    case TGSI_FILE_CONSTANT:
 998       info->num_consts += last - first + 1;
 999       break;
1000    case TGSI_FILE_INPUT: {
1001    }
1002       break;
1003    case TGSI_FILE_OUTPUT: {
1004       assert(last == first);    /* for now */
1005       if (decl->Declaration.Semantic) {
1006          switch (decl->Semantic.SemanticName) {
1007          case TGSI_SEMANTIC_POSITION: {
1008             info->pos_idx = first;
1009          }
1010             break;
1011          case TGSI_SEMANTIC_COLOR:
1012             break;
1013          case TGSI_SEMANTIC_BCOLOR:
1014             break;
1015          case TGSI_SEMANTIC_FOG:
1016             break;
1017          case TGSI_SEMANTIC_PSIZE: {
1018             info->writes_psize = TRUE;
1019             info->psize_idx = first;
1020          }
1021             break;
1022          case TGSI_SEMANTIC_GENERIC:
1023             break;
1024          }
1025       }
1026    }
1027       break;
1028    case TGSI_FILE_TEMPORARY: {
1029       info->num_temps += (last - first) + 1;
1030    }
1031       break;
1032    case TGSI_FILE_SAMPLER: {
1033    }
1034       break;
1035    case TGSI_FILE_ADDRESS: {
1036       info->num_addrs += (last - first) + 1;
1037    }
1038       break;
1039    case TGSI_FILE_IMMEDIATE: {
1040    }
1041       break;
1042    case TGSI_FILE_NULL: {
1043    }
1044       break;
1045    }
1046 }
1047
1048 static void process_instruction(struct brw_vs_compile *c,
1049                                 struct tgsi_full_instruction *inst,
1050                                 struct brw_prog_info *info)
1051 {
1052    struct brw_reg args[3], dst;
1053    struct brw_compile *p = &c->func;
1054    /*struct brw_indirect stack_index = brw_indirect(0, 0);*/
1055    unsigned i;
1056    unsigned index;
1057    unsigned file;
1058    /*FIXME: might not be the only one*/
1059    const struct tgsi_dst_register *dst_reg = &inst->FullDstRegisters[0].DstRegister;
1060    /*
1061    struct brw_instruction *if_inst[MAX_IFSN];
1062    unsigned insn, if_insn = 0;
1063    */
1064
1065    for (i = 0; i < 3; i++) {
1066       struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
1067       index = src->SrcRegister.Index;
1068       file = src->SrcRegister.File;
1069       if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
1070          args[i] = c->output_regs[index].reg;
1071       else
1072          args[i] = get_arg(c, &src->SrcRegister);
1073    }
1074
1075    /* Get dest regs.  Note that it is possible for a reg to be both
1076     * dst and arg, given the static allocation of registers.  So
1077     * care needs to be taken emitting multi-operation instructions.
1078     */
1079    index = dst_reg->Index;
1080    file = dst_reg->File;
1081    if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
1082       dst = c->output_regs[index].reg;
1083    else
1084       dst = get_dst(c, dst_reg);
1085
1086    switch (inst->Instruction.Opcode) {
1087    case TGSI_OPCODE_ABS:
1088       brw_MOV(p, dst, brw_abs(args[0]));
1089       break;
1090    case TGSI_OPCODE_ADD:
1091       brw_ADD(p, dst, args[0], args[1]);
1092       break;
1093    case TGSI_OPCODE_DP3:
1094       brw_DP3(p, dst, args[0], args[1]);
1095       break;
1096    case TGSI_OPCODE_DP4:
1097       brw_DP4(p, dst, args[0], args[1]);
1098       break;
1099    case TGSI_OPCODE_DPH:
1100       brw_DPH(p, dst, args[0], args[1]);
1101       break;
1102    case TGSI_OPCODE_DST:
1103       unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1104       break;
1105    case TGSI_OPCODE_EXP:
1106       unalias1(c, dst, args[0], emit_exp_noalias);
1107       break;
1108    case TGSI_OPCODE_EX2:
1109       emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1110       break;
1111    case TGSI_OPCODE_ARL:
1112       emit_arl(c, dst, args[0]);
1113       break;
1114    case TGSI_OPCODE_FLR:
1115       brw_RNDD(p, dst, args[0]);
1116       break;
1117    case TGSI_OPCODE_FRC:
1118       brw_FRC(p, dst, args[0]);
1119       break;
1120    case TGSI_OPCODE_LOG:
1121       unalias1(c, dst, args[0], emit_log_noalias);
1122       break;
1123    case TGSI_OPCODE_LG2:
1124       emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1125       break;
1126    case TGSI_OPCODE_LIT:
1127       unalias1(c, dst, args[0], emit_lit_noalias);
1128       break;
1129    case TGSI_OPCODE_MAD:
1130       brw_MOV(p, brw_acc_reg(), args[2]);
1131       brw_MAC(p, dst, args[0], args[1]);
1132       break;
1133    case TGSI_OPCODE_MAX:
1134       emit_max(p, dst, args[0], args[1]);
1135       break;
1136    case TGSI_OPCODE_MIN:
1137       emit_min(p, dst, args[0], args[1]);
1138       break;
1139    case TGSI_OPCODE_MOV:
1140 #if 0
1141    case TGSI_OPCODE_SWZ:
1142       /* The args[0] value can't be used here as it won't have
1143        * correctly encoded the full swizzle:
1144        */
1145       emit_swz(c, dst, inst->SrcReg[0] );
1146 #endif
1147       brw_MOV(p, dst, args[0]);
1148       break;
1149    case TGSI_OPCODE_MUL:
1150       brw_MUL(p, dst, args[0], args[1]);
1151       break;
1152    case TGSI_OPCODE_POW:
1153       emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1154       break;
1155    case TGSI_OPCODE_RCP:
1156       emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1157       break;
1158    case TGSI_OPCODE_RSQ:
1159       emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1160       break;
1161
1162    case TGSI_OPCODE_SEQ:
1163       emit_seq(p, dst, args[0], args[1]);
1164       break;
1165    case TGSI_OPCODE_SNE:
1166       emit_sne(p, dst, args[0], args[1]);
1167       break;
1168    case TGSI_OPCODE_SGE:
1169       emit_sge(p, dst, args[0], args[1]);
1170       break;
1171    case TGSI_OPCODE_SGT:
1172       emit_sgt(p, dst, args[0], args[1]);
1173       break;
1174    case TGSI_OPCODE_SLT:
1175       emit_slt(p, dst, args[0], args[1]);
1176       break;
1177    case TGSI_OPCODE_SLE:
1178       emit_sle(p, dst, args[0], args[1]);
1179       break;
1180    case TGSI_OPCODE_SUB:
1181       brw_ADD(p, dst, args[0], negate(args[1]));
1182       break;
1183    case TGSI_OPCODE_XPD:
1184       emit_xpd(p, dst, args[0], args[1]);
1185       break;
1186 #if 0
1187    case TGSI_OPCODE_IF:
1188       assert(if_insn < MAX_IFSN);
1189       if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1190       break;
1191    case TGSI_OPCODE_ELSE:
1192       if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1193       break;
1194    case TGSI_OPCODE_ENDIF:
1195       assert(if_insn > 0);
1196       brw_ENDIF(p, if_inst[--if_insn]);
1197       break;
1198    case TGSI_OPCODE_BRA:
1199       brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1200       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1201       brw_set_predicate_control_flag_value(p, 0xff);
1202       break;
1203    case TGSI_OPCODE_CAL:
1204       brw_set_access_mode(p, BRW_ALIGN_1);
1205       brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1206       brw_set_access_mode(p, BRW_ALIGN_16);
1207       brw_ADD(p, get_addr_reg(stack_index),
1208               get_addr_reg(stack_index), brw_imm_d(4));
1209       inst->Data = &p->store[p->nr_insn];
1210       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1211       break;
1212 #endif
1213    case TGSI_OPCODE_RET:
1214 #if 0
1215       brw_ADD(p, get_addr_reg(stack_index),
1216               get_addr_reg(stack_index), brw_imm_d(-4));
1217       brw_set_access_mode(p, BRW_ALIGN_1);
1218       brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0));
1219       brw_set_access_mode(p, BRW_ALIGN_16);
1220 #else
1221       /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
1222 #endif
1223       break;
1224    case TGSI_OPCODE_END:
1225       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1226       break;
1227    case TGSI_OPCODE_BGNSUB:
1228    case TGSI_OPCODE_ENDSUB:
1229       break;
1230    default:
1231       debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
1232       break;
1233    }
1234
1235    if (dst_reg->File == TGSI_FILE_OUTPUT
1236        && dst_reg->Index != info->pos_idx
1237        && c->output_regs[dst_reg->Index].used_in_src)
1238       brw_MOV(p, get_dst(c, dst_reg), dst);
1239
1240    release_tmps(c);
1241 }
1242
1243 /* Emit the fragment program instructions here.
1244  */
1245 void brw_vs_emit(struct brw_vs_compile *c)
1246 {
1247 #define MAX_IFSN 32
1248    struct brw_compile *p = &c->func;
1249    struct brw_instruction *end_inst;
1250    struct tgsi_parse_context parse;
1251    struct brw_indirect stack_index = brw_indirect(0, 0);
1252    const struct tgsi_token *tokens = c->vp->program.tokens;
1253    struct brw_prog_info prog_info;
1254    unsigned allocated_registers = 0;
1255    memset(&prog_info, 0, sizeof(struct brw_prog_info));
1256
1257    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1258    brw_set_access_mode(p, BRW_ALIGN_16);
1259
1260    tgsi_parse_init(&parse, tokens);
1261    /* Message registers can't be read, so copy the output into GRF register
1262       if they are used in source registers */
1263    while (!tgsi_parse_end_of_tokens(&parse)) {
1264       tgsi_parse_token(&parse);
1265       unsigned i;
1266       switch (parse.FullToken.Token.Type) {
1267       case TGSI_TOKEN_TYPE_INSTRUCTION: {
1268          const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1269          for (i = 0; i < 3; ++i) {
1270             const struct tgsi_src_register *src = &inst->FullSrcRegisters[i].SrcRegister;
1271             unsigned index = src->Index;
1272             unsigned file = src->File;
1273             if (file == TGSI_FILE_OUTPUT)
1274                c->output_regs[index].used_in_src = TRUE;
1275          }
1276       }
1277          break;
1278       default:
1279          /* nothing */
1280          break;
1281       }
1282    }
1283    tgsi_parse_free(&parse);
1284
1285    tgsi_parse_init(&parse, tokens);
1286
1287    while (!tgsi_parse_end_of_tokens(&parse)) {
1288       tgsi_parse_token(&parse);
1289
1290       switch (parse.FullToken.Token.Type) {
1291       case TGSI_TOKEN_TYPE_DECLARATION: {
1292          struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1293          process_declaration(decl, &prog_info);
1294       }
1295          break;
1296       case TGSI_TOKEN_TYPE_IMMEDIATE: {
1297          struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate;
1298          /*assert(imm->Immediate.Size == 4);*/
1299          c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u.ImmediateFloat32[0].Float;
1300          c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u.ImmediateFloat32[1].Float;
1301          c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u.ImmediateFloat32[2].Float;
1302          c->prog_data.imm_buf[c->prog_data.num_imm][3] = imm->u.ImmediateFloat32[3].Float;
1303          c->prog_data.num_imm++;
1304       }
1305          break;
1306       case TGSI_TOKEN_TYPE_INSTRUCTION: {
1307          struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1308          if (!allocated_registers) {
1309             /* first instruction (declerations finished).
1310              * now that we know what vars are being used allocate
1311              * registers for them.*/
1312             c->prog_data.num_consts = prog_info.num_consts;
1313             c->prog_data.max_const = prog_info.num_consts + c->prog_data.num_imm;
1314             brw_vs_alloc_regs(c, &prog_info);
1315
1316             brw_set_access_mode(p, BRW_ALIGN_1);
1317             brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1318             brw_set_access_mode(p, BRW_ALIGN_16);
1319             allocated_registers = 1;
1320          }
1321          process_instruction(c, inst, &prog_info);
1322       }
1323          break;
1324       }
1325    }
1326
1327    end_inst = &p->store[p->nr_insn];
1328    emit_vertex_write(c, &prog_info);
1329    post_vs_emit(c, end_inst);
1330    tgsi_parse_free(&parse);
1331
1332 }