i965: comments and a new assertion
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
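/* Trivial bump allocator for temporary GRF registers: get_tmp() hands out
 * the next free GRF and grows total_grf as needed; release_tmp() only
 * reclaims a register if it was the most recently allocated one;
 * release_tmps() frees everything back to first_tmp (done after each
 * instruction has been emitted).
 */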
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
63 /**
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The latter is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
76 */
77 if (c->vp->program.Base.Parameters->NumParameters +
78 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
79 c->vp->use_const_buffer = GL_TRUE;
80 else
81 c->vp->use_const_buffer = GL_FALSE;
82
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
84
85 /* r0 -- reserved as usual
86 */
87 c->r0 = brw_vec8_grf(reg, 0);
88 reg++;
89
90 /* User clip planes from curbe:
91 */
92 if (c->key.nr_userclip) {
93 for (i = 0; i < c->key.nr_userclip; i++) {
94 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
95 }
96
97 /* Deal with curbe alignment:
98 */
99 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
100 }
101
102 /* Vertex program parameters from curbe:
103 */
104 if (c->vp->use_const_buffer) {
105 /* get constants from a real constant buffer */
106 c->prog_data.curb_read_length = 0;
107 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
108 }
109 else {
110 /* use a section of the GRF for constants */
111 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
112 for (i = 0; i < nr_params; i++) {
113 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
114 }
115 reg += (nr_params + 1) / 2;
116 c->prog_data.curb_read_length = reg - 1;
117
118 c->prog_data.nr_params = nr_params * 4;
119 }
120
121 /* Allocate input regs:
122 */
123 c->nr_inputs = 0;
124 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
125 if (c->prog_data.inputs_read & (1 << i)) {
126 c->nr_inputs++;
127 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
128 reg++;
129 }
130 }
131
132 /* Allocate outputs. The non-position outputs go straight into message regs.
133 */
134 c->nr_outputs = 0;
135 c->first_output = reg;
136 mrf = 4;
137 for (i = 0; i < VERT_RESULT_MAX; i++) {
138 if (c->prog_data.outputs_written & (1 << i)) {
139 c->nr_outputs++;
140 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
141 if (i == VERT_RESULT_HPOS) {
142 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
143 reg++;
144 }
145 else if (i == VERT_RESULT_PSIZ) {
146 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
147 reg++;
148 mrf++; /* just a placeholder? XXX fix later stages & remove this */
149 }
150 else {
151 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
152 mrf++;
153 }
154 }
155 }
156
157 /* Allocate program temporaries:
158 */
159 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
160 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
161 reg++;
162 }
163
164 /* Address reg(s). Don't try to use the internal address reg until
165 * deref time.
166 */
167 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
168 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
169 reg,
170 0,
171 BRW_REGISTER_TYPE_D,
172 BRW_VERTICAL_STRIDE_8,
173 BRW_WIDTH_8,
174 BRW_HORIZONTAL_STRIDE_1,
175 BRW_SWIZZLE_XXXX,
176 WRITEMASK_X);
177 reg++;
178 }
179
180 if (c->vp->use_const_buffer) {
181 for (i = 0; i < 3; i++) {
182 c->current_const[i].index = -1;
183 c->current_const[i].reg = brw_vec8_grf(reg, 0);
184 reg++;
185 }
186 }
187
188 for (i = 0; i < 128; i++) {
189 if (c->output_regs[i].used_in_src) {
190 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
191 reg++;
192 }
193 }
194
195 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
196 reg += 2;
197
198 /* Some opcodes need an internal temporary:
199 */
200 c->first_tmp = reg;
201 c->last_tmp = reg; /* for allocation purposes */
202
203 /* Each input reg holds data from two vertices. The
204 * urb_read_length is the number of registers read from *each*
205 * vertex urb, so it is half that amount:
206 */
207 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
208
209 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
210 c->prog_data.total_grf = reg;
211
212 if (INTEL_DEBUG & DEBUG_VS) {
213 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
214 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
215 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
216 }
217 }
218
219
220 /**
221 * If an instruction uses a temp reg both as a src and the dest, we
222 * sometimes need to allocate an intermediate temporary.
223 */
224 static void unalias1( struct brw_vs_compile *c,
225 struct brw_reg dst,
226 struct brw_reg arg0,
227 void (*func)( struct brw_vs_compile *,
228 struct brw_reg,
229 struct brw_reg ))
230 {
231 if (dst.file == arg0.file && dst.nr == arg0.nr) {
232 struct brw_compile *p = &c->func;
233 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
234 func(c, tmp, arg0);
235 brw_MOV(p, dst, tmp);
236 release_tmp(c, tmp);
237 }
238 else {
239 func(c, dst, arg0);
240 }
241 }
242
243 /**
244 * \sa unalias1
245 * Checks whether a 2-operand instruction needs an intermediate temporary.
246 */
247 static void unalias2( struct brw_vs_compile *c,
248 struct brw_reg dst,
249 struct brw_reg arg0,
250 struct brw_reg arg1,
251 void (*func)( struct brw_vs_compile *,
252 struct brw_reg,
253 struct brw_reg,
254 struct brw_reg ))
255 {
256 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
257 (dst.file == arg1.file && dst.nr == arg1.nr)) {
258 struct brw_compile *p = &c->func;
259 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
260 func(c, tmp, arg0, arg1);
261 brw_MOV(p, dst, tmp);
262 release_tmp(c, tmp);
263 }
264 else {
265 func(c, dst, arg0, arg1);
266 }
267 }
268
269 /**
270 * \sa unalias2
271 * Checks whether a 3-operand instruction needs an intermediate temporary.
272 */
273 static void unalias3( struct brw_vs_compile *c,
274 struct brw_reg dst,
275 struct brw_reg arg0,
276 struct brw_reg arg1,
277 struct brw_reg arg2,
278 void (*func)( struct brw_vs_compile *,
279 struct brw_reg,
280 struct brw_reg,
281 struct brw_reg,
282 struct brw_reg ))
283 {
284 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
285 (dst.file == arg1.file && dst.nr == arg1.nr) ||
286 (dst.file == arg2.file && dst.nr == arg2.nr)) {
287 struct brw_compile *p = &c->func;
288 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
289 func(c, tmp, arg0, arg1, arg2);
290 brw_MOV(p, dst, tmp);
291 release_tmp(c, tmp);
292 }
293 else {
294 func(c, dst, arg0, arg1, arg2);
295 }
296 }
297
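/* Emit code for the SLT/SGE-style "set on condition" opcodes:
 * dst = (arg0 <cond> arg1) ? 1.0 : 0.0 per channel.  The CMP with a null
 * destination sets the flag register and (in this driver) also enables
 * predication for the following MOV, so only the channels that pass the
 * comparison are overwritten with 1.0; the final call switches predication
 * back off for subsequent instructions.
 */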
298 static void emit_sop( struct brw_compile *p,
299 struct brw_reg dst,
300 struct brw_reg arg0,
301 struct brw_reg arg1,
302 GLuint cond)
303 {
304 brw_MOV(p, dst, brw_imm_f(0.0f));
305 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
306 brw_MOV(p, dst, brw_imm_f(1.0f));
307 brw_set_predicate_control_flag_value(p, 0xff);
308 }
309
310 static void emit_seq( struct brw_compile *p,
311 struct brw_reg dst,
312 struct brw_reg arg0,
313 struct brw_reg arg1 )
314 {
315 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
316 }
317
318 static void emit_sne( struct brw_compile *p,
319 struct brw_reg dst,
320 struct brw_reg arg0,
321 struct brw_reg arg1 )
322 {
323 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
324 }
325 static void emit_slt( struct brw_compile *p,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1 )
329 {
330 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
331 }
332
333 static void emit_sle( struct brw_compile *p,
334 struct brw_reg dst,
335 struct brw_reg arg0,
336 struct brw_reg arg1 )
337 {
338 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
339 }
340
341 static void emit_sgt( struct brw_compile *p,
342 struct brw_reg dst,
343 struct brw_reg arg0,
344 struct brw_reg arg1 )
345 {
346 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
347 }
348
349 static void emit_sge( struct brw_compile *p,
350 struct brw_reg dst,
351 struct brw_reg arg0,
352 struct brw_reg arg1 )
353 {
354 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
355 }
356
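/* MAX: the CMP (null destination, so it also turns on predication) tests
 * arg0 < arg1; the predicated SEL then picks arg1 where the test passed and
 * arg0 elsewhere.  emit_min() below is identical with the SEL operands
 * swapped.
 */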
357 static void emit_max( struct brw_compile *p,
358 struct brw_reg dst,
359 struct brw_reg arg0,
360 struct brw_reg arg1 )
361 {
362 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
363 brw_SEL(p, dst, arg1, arg0);
364 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
365 }
366
367 static void emit_min( struct brw_compile *p,
368 struct brw_reg dst,
369 struct brw_reg arg0,
370 struct brw_reg arg1 )
371 {
372 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
373 brw_SEL(p, dst, arg0, arg1);
374 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
375 }
376
377
378 static void emit_math1( struct brw_vs_compile *c,
379 GLuint function,
380 struct brw_reg dst,
381 struct brw_reg arg0,
382 GLuint precision)
383 {
384 /* There are various odd behaviours with SEND on the simulator. In
385 * addition there are documented issues with the fact that the GEN4
386 * processor doesn't do dependency control properly on SEND
387 * results. So, on balance, this kludge to get around failures
388 * with writemasked math results looks like it might be necessary
389 * whether that turns out to be a simulator bug or not:
390 */
391 struct brw_compile *p = &c->func;
392 struct brw_reg tmp = dst;
393 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
394 dst.file != BRW_GENERAL_REGISTER_FILE);
395
396 if (need_tmp)
397 tmp = get_tmp(c);
398
399 brw_math(p,
400 tmp,
401 function,
402 BRW_MATH_SATURATE_NONE,
403 2,
404 arg0,
405 BRW_MATH_DATA_SCALAR,
406 precision);
407
408 if (need_tmp) {
409 brw_MOV(p, dst, tmp);
410 release_tmp(c, tmp);
411 }
412 }
413
414
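/* Two-operand math function (currently only used for POW).  The second
 * operand has to be placed in message reg m3 by hand; the first goes out
 * with the math SEND itself, whose payload starts at m2.
 */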
415 static void emit_math2( struct brw_vs_compile *c,
416 GLuint function,
417 struct brw_reg dst,
418 struct brw_reg arg0,
419 struct brw_reg arg1,
420 GLuint precision)
421 {
422 struct brw_compile *p = &c->func;
423 struct brw_reg tmp = dst;
424 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
425 dst.file != BRW_GENERAL_REGISTER_FILE);
426
427 if (need_tmp)
428 tmp = get_tmp(c);
429
430 brw_MOV(p, brw_message_reg(3), arg1);
431
432 brw_math(p,
433 tmp,
434 function,
435 BRW_MATH_SATURATE_NONE,
436 2,
437 arg0,
438 BRW_MATH_DATA_SCALAR,
439 precision);
440
441 if (need_tmp) {
442 brw_MOV(p, dst, tmp);
443 release_tmp(c, tmp);
444 }
445 }
446
447
448 static void emit_exp_noalias( struct brw_vs_compile *c,
449 struct brw_reg dst,
450 struct brw_reg arg0 )
451 {
452 struct brw_compile *p = &c->func;
453
454
455 if (dst.dw1.bits.writemask & WRITEMASK_X) {
456 struct brw_reg tmp = get_tmp(c);
457 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
458
459 /* tmp_d = floor(arg0.x) */
460 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
461
462 /* result[0] = 2.0 ^ tmp */
463
464 /* Adjust exponent for floating point:
465 * exp += 127
466 */
467 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
468
469 /* Install exponent and sign.
470 * Excess drops off the edge:
471 */
472 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
473 tmp_d, brw_imm_d(23));
474
475 release_tmp(c, tmp);
476 }
477
478 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
479 /* result[1] = arg0.x - floor(arg0.x) */
480 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
481 }
482
483 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
484 /* As with the LOG instruction, we might be better off just
485 * doing a Taylor expansion here, seeing as we have to do all
486 * the prep work.
487 *
488 * If mathbox partial precision is too low, consider also:
489 * result[2] = result[0] * EXP(result[1])
490 */
491 emit_math1(c,
492 BRW_MATH_FUNCTION_EXP,
493 brw_writemask(dst, WRITEMASK_Z),
494 brw_swizzle1(arg0, 0),
495 BRW_MATH_PRECISION_FULL);
496 }
497
498 if (dst.dw1.bits.writemask & WRITEMASK_W) {
499 /* result[3] = 1.0; */
500 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
501 }
502 }
503
504
505 static void emit_log_noalias( struct brw_vs_compile *c,
506 struct brw_reg dst,
507 struct brw_reg arg0 )
508 {
509 struct brw_compile *p = &c->func;
510 struct brw_reg tmp = dst;
511 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
512 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
513 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
514 dst.file != BRW_GENERAL_REGISTER_FILE);
515
516 if (need_tmp) {
517 tmp = get_tmp(c);
518 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
519 }
520
521 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
522 * according to the spec:
523 *
524 * These almost look like they could be joined up, but not really
525 * practical:
526 *
527 * result[0].f = ((x.i & ((1U<<31)-1)) >> 23) - 127
528 * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
529 */
530 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
531 brw_AND(p,
532 brw_writemask(tmp_ud, WRITEMASK_X),
533 brw_swizzle1(arg0_ud, 0),
534 brw_imm_ud((1U<<31)-1));
535
536 brw_SHR(p,
537 brw_writemask(tmp_ud, WRITEMASK_X),
538 tmp_ud,
539 brw_imm_ud(23));
540
541 brw_ADD(p,
542 brw_writemask(tmp, WRITEMASK_X),
543 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
544 brw_imm_d(-127));
545 }
546
547 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
548 brw_AND(p,
549 brw_writemask(tmp_ud, WRITEMASK_Y),
550 brw_swizzle1(arg0_ud, 0),
551 brw_imm_ud((1<<23)-1));
552
553 brw_OR(p,
554 brw_writemask(tmp_ud, WRITEMASK_Y),
555 tmp_ud,
556 brw_imm_ud(127<<23));
557 }
558
559 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
560 /* result[2] = result[0] + LOG2(result[1]); */
561
562 /* Why bother? The above is just a hint at how to do this with a
563 * Taylor series. Maybe we *should* use a Taylor series, as by
564 * the time all the above has been done it's almost certainly
565 * quicker than calling the mathbox, even with low precision.
566 *
567 * Options are:
568 * - result[0] + mathbox.LOG2(result[1])
569 * - mathbox.LOG2(arg0.x)
570 * - result[0] + inline_taylor_approx(result[1])
571 */
572 emit_math1(c,
573 BRW_MATH_FUNCTION_LOG,
574 brw_writemask(tmp, WRITEMASK_Z),
575 brw_swizzle1(tmp, 1),
576 BRW_MATH_PRECISION_FULL);
577
578 brw_ADD(p,
579 brw_writemask(tmp, WRITEMASK_Z),
580 brw_swizzle1(tmp, 2),
581 brw_swizzle1(tmp, 0));
582 }
583
584 if (dst.dw1.bits.writemask & WRITEMASK_W) {
585 /* result[3] = 1.0; */
586 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
587 }
588
589 if (need_tmp) {
590 brw_MOV(p, dst, tmp);
591 release_tmp(c, tmp);
592 }
593 }
594
595
596 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
597 */
598 static void emit_dst_noalias( struct brw_vs_compile *c,
599 struct brw_reg dst,
600 struct brw_reg arg0,
601 struct brw_reg arg1)
602 {
603 struct brw_compile *p = &c->func;
604
605 /* There must be a better way to do this:
606 */
607 if (dst.dw1.bits.writemask & WRITEMASK_X)
608 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
609 if (dst.dw1.bits.writemask & WRITEMASK_Y)
610 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
611 if (dst.dw1.bits.writemask & WRITEMASK_Z)
612 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
613 if (dst.dw1.bits.writemask & WRITEMASK_W)
614 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
615 }
616
617
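/* Cross product: dst = t cross u.  The MUL's destination is null, so its
 * result only lands in the (implicitly updated) accumulator; the MAC then
 * adds the negated second partial product:
 *   dst = t.yzx * u.zxy - t.zxy * u.yzx
 */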
618 static void emit_xpd( struct brw_compile *p,
619 struct brw_reg dst,
620 struct brw_reg t,
621 struct brw_reg u)
622 {
623 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
624 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
625 }
626
627
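/* LIT: dst = (1, max(src.x,0), (src.x > 0) ? pow(max(src.y,0), src.w) : 0, 1).
 * Note that the exponent clamp from the spec is not applied here.
 */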
628 static void emit_lit_noalias( struct brw_vs_compile *c,
629 struct brw_reg dst,
630 struct brw_reg arg0 )
631 {
632 struct brw_compile *p = &c->func;
633 struct brw_instruction *if_insn;
634 struct brw_reg tmp = dst;
635 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
636
637 if (need_tmp)
638 tmp = get_tmp(c);
639
640 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
641 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
642
643 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
644 * to get all channels active inside the IF. In the clipping code
645 * we run with NoMask, so it's not an option and we can use
646 * BRW_EXECUTE_1 for all comparisons.
647 */
648 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
649 if_insn = brw_IF(p, BRW_EXECUTE_8);
650 {
651 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
652
653 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
654 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
655 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
656
657 emit_math2(c,
658 BRW_MATH_FUNCTION_POW,
659 brw_writemask(dst, WRITEMASK_Z),
660 brw_swizzle1(tmp, 2),
661 brw_swizzle1(arg0, 3),
662 BRW_MATH_PRECISION_PARTIAL);
663 }
664
665 brw_ENDIF(p, if_insn);
666
667 release_tmp(c, tmp);
668 }
669
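/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.  (1 - arg0) is built in dst,
 * the MUL puts (1 - arg0) * arg2 into the accumulator (null destination),
 * and the MAC adds arg0 * arg1 on top.
 */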
670 static void emit_lrp_noalias(struct brw_vs_compile *c,
671 struct brw_reg dst,
672 struct brw_reg arg0,
673 struct brw_reg arg1,
674 struct brw_reg arg2)
675 {
676 struct brw_compile *p = &c->func;
677
678 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
679 brw_MUL(p, brw_null_reg(), dst, arg2);
680 brw_MAC(p, dst, arg0, arg1);
681 }
682
683 /** 3 or 4-component vector normalization */
684 static void emit_nrm( struct brw_vs_compile *c,
685 struct brw_reg dst,
686 struct brw_reg arg0,
687 int num_comps)
688 {
689 struct brw_compile *p = &c->func;
690 struct brw_reg tmp = get_tmp(c);
691
692 /* tmp = dot(arg0, arg0) */
693 if (num_comps == 3)
694 brw_DP3(p, tmp, arg0, arg0);
695 else
696 brw_DP4(p, tmp, arg0, arg0);
697
698 /* tmp = 1 / sqrt(tmp) */
699 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
700
701 /* dst = arg0 * tmp */
702 brw_MUL(p, dst, arg0, tmp);
703
704 release_tmp(c, tmp);
705 }
706
707
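/* Fetch a constant for source argument 'argIndex' from the constant buffer
 * into the GRF register reserved for it in brw_vs_alloc_regs().  The last
 * index fetched for each argument slot is cached, so a new read is only
 * issued when the index changes.  With relative addressing two owords are
 * read (presumably one per vertex, since the two vertices in the thread may
 * compute different addresses) and merged; otherwise the single oword is
 * replicated into both halves of the register.
 */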
708 static struct brw_reg
709 get_constant(struct brw_vs_compile *c,
710 const struct prog_instruction *inst,
711 GLuint argIndex)
712 {
713 const struct prog_src_register *src = &inst->SrcReg[argIndex];
714 struct brw_compile *p = &c->func;
715 struct brw_reg const_reg;
716 struct brw_reg const2_reg;
717 const GLboolean relAddr = src->RelAddr;
718
719 assert(argIndex < 3);
720
721 if (c->current_const[argIndex].index != src->Index || relAddr) {
722 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
723
724 c->current_const[argIndex].index = src->Index;
725
726 #if 0
727 printf(" fetch const[%d] for arg %d into reg %d\n",
728 src->Index, argIndex, c->current_const[argIndex].reg.nr);
729 #endif
730 /* need to fetch the constant now */
731 brw_dp_READ_4_vs(p,
732 c->current_const[argIndex].reg,/* writeback dest */
733 0, /* oword */
734 relAddr, /* relative indexing? */
735 addrReg, /* address register */
736 16 * src->Index, /* byte offset */
737 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
738 );
739
740 if (relAddr) {
741 /* second read */
742 const2_reg = get_tmp(c);
743
744 /* use upper half of address reg for second read */
745 addrReg = stride(addrReg, 0, 4, 0);
746 addrReg.subnr = 16;
747
748 brw_dp_READ_4_vs(p,
749 const2_reg, /* writeback dest */
750 1, /* oword */
751 relAddr, /* relative indexing? */
752 addrReg, /* address register */
753 16 * src->Index, /* byte offset */
754 SURF_INDEX_VERT_CONST_BUFFER
755 );
756 }
757 }
758
759 const_reg = c->current_const[argIndex].reg;
760
761 if (relAddr) {
762 /* merge the two Owords into the constant register */
763 /* const_reg[7..4] = const2_reg[7..4] */
764 brw_MOV(p,
765 suboffset(stride(const_reg, 0, 4, 1), 4),
766 suboffset(stride(const2_reg, 0, 4, 1), 4));
767 release_tmp(c, const2_reg);
768 }
769 else {
770 /* replicate lower four floats into upper half (to get XYZWXYZW) */
771 const_reg = stride(const_reg, 0, 4, 0);
772 const_reg.subnr = 0;
773 }
774
775 return const_reg;
776 }
777
778
779
780 /* TODO: relative addressing!
781 */
782 static struct brw_reg get_reg( struct brw_vs_compile *c,
783 gl_register_file file,
784 GLuint index )
785 {
786 switch (file) {
787 case PROGRAM_TEMPORARY:
788 case PROGRAM_INPUT:
789 case PROGRAM_OUTPUT:
790 assert(c->regs[file][index].nr != 0);
791 return c->regs[file][index];
792 case PROGRAM_STATE_VAR:
793 case PROGRAM_CONSTANT:
794 case PROGRAM_UNIFORM:
795 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
796 return c->regs[PROGRAM_STATE_VAR][index];
797 case PROGRAM_ADDRESS:
798 assert(index == 0);
799 return c->regs[file][index];
800
801 case PROGRAM_UNDEFINED: /* undef values */
802 return brw_null_reg();
803
804 case PROGRAM_LOCAL_PARAM:
805 case PROGRAM_ENV_PARAM:
806 case PROGRAM_WRITE_ONLY:
807 default:
808 assert(0);
809 return brw_null_reg();
810 }
811 }
812
813
814 /**
815 * Indirect addressing: get reg[[arg] + offset].
816 */
817 static struct brw_reg deref( struct brw_vs_compile *c,
818 struct brw_reg arg,
819 GLint offset)
820 {
821 struct brw_compile *p = &c->func;
822 struct brw_reg tmp = vec4(get_tmp(c));
823 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
824 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
825 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
826 struct brw_reg indirect = brw_vec4_indirect(0,0);
827
828 {
829 brw_push_insn_state(p);
830 brw_set_access_mode(p, BRW_ALIGN_1);
831
832 /* This is pretty clunky - load the address register twice and
833 * fetch each 4-dword value in turn. There must be a way to do
834 * this in a single pass, but I couldn't get it to work.
835 */
836 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
837 brw_MOV(p, tmp, indirect);
838
839 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
840 brw_MOV(p, suboffset(tmp, 4), indirect);
841
842 brw_pop_insn_state(p);
843 }
844
845 /* NOTE: tmp not released */
846 return vec8(tmp);
847 }
848
849
850 /**
851 * Get brw reg corresponding to the instruction's [argIndex] src reg.
852 * TODO: relative addressing!
853 */
854 static struct brw_reg
855 get_src_reg( struct brw_vs_compile *c,
856 const struct prog_instruction *inst,
857 GLuint argIndex )
858 {
859 const GLuint file = inst->SrcReg[argIndex].File;
860 const GLint index = inst->SrcReg[argIndex].Index;
861 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
862
863 switch (file) {
864 case PROGRAM_TEMPORARY:
865 case PROGRAM_INPUT:
866 case PROGRAM_OUTPUT:
867 if (relAddr) {
868 return deref(c, c->regs[file][0], index);
869 }
870 else {
871 assert(c->regs[file][index].nr != 0);
872 return c->regs[file][index];
873 }
874
875 case PROGRAM_STATE_VAR:
876 case PROGRAM_CONSTANT:
877 case PROGRAM_UNIFORM:
878 if (c->vp->use_const_buffer) {
879 return get_constant(c, inst, argIndex);
880 }
881 else if (relAddr) {
882 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
883 }
884 else {
885 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
886 return c->regs[PROGRAM_STATE_VAR][index];
887 }
888 case PROGRAM_ADDRESS:
889 assert(index == 0);
890 return c->regs[file][index];
891
892 case PROGRAM_UNDEFINED:
893 /* this is a normal case since we loop over all three src args */
894 return brw_null_reg();
895
896 case PROGRAM_LOCAL_PARAM:
897 case PROGRAM_ENV_PARAM:
898 case PROGRAM_WRITE_ONLY:
899 default:
900 assert(0);
901 return brw_null_reg();
902 }
903 }
904
905
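/* ARL: the address register is used as a byte offset by the indirect
 * (deref) reads, so round the float source to an integer and scale it by
 * 16 (one vec4 = 16 bytes).
 */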
906 static void emit_arl( struct brw_vs_compile *c,
907 struct brw_reg dst,
908 struct brw_reg arg0 )
909 {
910 struct brw_compile *p = &c->func;
911 struct brw_reg tmp = dst;
912 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
913
914 if (need_tmp)
915 tmp = get_tmp(c);
916
917 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
918 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
919
920 if (need_tmp)
921 release_tmp(c, tmp);
922 }
923
924
925 /**
926 * Return the brw reg for the given instruction's src argument.
927 * Will return mangled results for SWZ op. The emit_swz() function
928 * ignores this result and recalculates taking extended swizzles into
929 * account.
930 */
931 static struct brw_reg get_arg( struct brw_vs_compile *c,
932 const struct prog_instruction *inst,
933 GLuint argIndex )
934 {
935 const struct prog_src_register *src = &inst->SrcReg[argIndex];
936 struct brw_reg reg;
937
938 if (src->File == PROGRAM_UNDEFINED)
939 return brw_null_reg();
940
941 reg = get_src_reg(c, inst, argIndex);
942
943 /* Convert 3-bit swizzle to 2-bit.
944 */
945 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
946 GET_SWZ(src->Swizzle, 1),
947 GET_SWZ(src->Swizzle, 2),
948 GET_SWZ(src->Swizzle, 3));
949
950 /* Note this is ok for non-swizzle instructions:
951 */
952 reg.negate = src->Negate ? 1 : 0;
953
954 return reg;
955 }
956
957
958 /**
959 * Get brw register for the given program dest register.
960 */
961 static struct brw_reg get_dst( struct brw_vs_compile *c,
962 struct prog_dst_register dst )
963 {
964 struct brw_reg reg;
965
966 switch (dst.File) {
967 case PROGRAM_TEMPORARY:
968 case PROGRAM_OUTPUT:
969 assert(c->regs[dst.File][dst.Index].nr != 0);
970 reg = c->regs[dst.File][dst.Index];
971 break;
972 case PROGRAM_ADDRESS:
973 assert(dst.Index == 0);
974 reg = c->regs[dst.File][dst.Index];
975 break;
976 case PROGRAM_UNDEFINED:
977 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
978 reg = brw_null_reg();
979 break;
980 default:
981 assert(0);
982 reg = brw_null_reg();
983 }
984
985 reg.dw1.bits.writemask = dst.WriteMask;
986
987 return reg;
988 }
989
990
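/* Extended SWZ: components selecting X/Y/Z/W are gathered with a single
 * swizzled MOV, ZERO and ONE components are filled with immediate MOVs,
 * and src.Negate (a per-component mask for SWZ) is applied with a final
 * negating MOV over just those components.
 */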
991 static void emit_swz( struct brw_vs_compile *c,
992 struct brw_reg dst,
993 const struct prog_instruction *inst)
994 {
995 const GLuint argIndex = 0;
996 const struct prog_src_register src = inst->SrcReg[argIndex];
997 struct brw_compile *p = &c->func;
998 GLuint zeros_mask = 0;
999 GLuint ones_mask = 0;
1000 GLuint src_mask = 0;
1001 GLubyte src_swz[4];
1002 GLboolean need_tmp = (src.Negate &&
1003 dst.file != BRW_GENERAL_REGISTER_FILE);
1004 struct brw_reg tmp = dst;
1005 GLuint i;
1006
1007 if (need_tmp)
1008 tmp = get_tmp(c);
1009
1010 for (i = 0; i < 4; i++) {
1011 if (dst.dw1.bits.writemask & (1<<i)) {
1012 GLubyte s = GET_SWZ(src.Swizzle, i);
1013 switch (s) {
1014 case SWIZZLE_X:
1015 case SWIZZLE_Y:
1016 case SWIZZLE_Z:
1017 case SWIZZLE_W:
1018 src_mask |= 1<<i;
1019 src_swz[i] = s;
1020 break;
1021 case SWIZZLE_ZERO:
1022 zeros_mask |= 1<<i;
1023 break;
1024 case SWIZZLE_ONE:
1025 ones_mask |= 1<<i;
1026 break;
1027 }
1028 }
1029 }
1030
1031 /* Do src first, in case dst aliases src:
1032 */
1033 if (src_mask) {
1034 struct brw_reg arg0;
1035
1036 arg0 = get_src_reg(c, inst, argIndex);
1037
1038 arg0 = brw_swizzle(arg0,
1039 src_swz[0], src_swz[1],
1040 src_swz[2], src_swz[3]);
1041
1042 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1043 }
1044
1045 if (zeros_mask)
1046 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1047
1048 if (ones_mask)
1049 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1050
1051 if (src.Negate)
1052 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1053
1054 if (need_tmp) {
1055 brw_MOV(p, dst, tmp);
1056 release_tmp(c, tmp);
1057 }
1058 }
1059
1060
1061 /**
1062 * Post-vertex-program processing. Send the results to the URB.
1063 */
1064 static void emit_vertex_write( struct brw_vs_compile *c)
1065 {
1066 struct brw_compile *p = &c->func;
1067 struct brw_reg m0 = brw_message_reg(0);
1068 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1069 struct brw_reg ndc;
1070
1071 if (c->key.copy_edgeflag) {
1072 brw_MOV(p,
1073 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1074 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1075 }
1076
1077 /* Build ndc coords */
1078 ndc = get_tmp(c);
1079 /* ndc = 1.0 / pos.w */
1080 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1081 /* ndc.xyz = pos * ndc */
1082 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1083
1084 /* Update the header for point size, user clipping flags, and -ve rhw
1085 * workaround.
1086 */
1087 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1088 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1089 {
1090 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1091 GLuint i;
1092
1093 brw_MOV(p, header1, brw_imm_ud(0));
1094
1095 brw_set_access_mode(p, BRW_ALIGN_16);
1096
1097 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1098 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1099 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1100 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1101 }
1102
1103 for (i = 0; i < c->key.nr_userclip; i++) {
1104 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1105 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1106 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1107 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1108 }
1109
1110 /* i965 clipping workaround:
1111 * 1) Test for -ve rhw
1112 * 2) If set,
1113 * set ndc = (0,0,0,0)
1114 * set ucp[6] = 1
1115 *
1116 * Later, clipping will detect ucp[6] and ensure the primitive is
1117 * clipped against all fixed planes.
1118 */
1119 if (!BRW_IS_G4X(p->brw)) {
1120 brw_CMP(p,
1121 vec8(brw_null_reg()),
1122 BRW_CONDITIONAL_L,
1123 brw_swizzle1(ndc, 3),
1124 brw_imm_f(0));
1125
1126 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1127 brw_MOV(p, ndc, brw_imm_f(0));
1128 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1129 }
1130
1131 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1132 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1133 brw_set_access_mode(p, BRW_ALIGN_16);
1134
1135 release_tmp(c, header1);
1136 }
1137 else {
1138 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1139 }
1140
1141 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1142 * of zeros followed by two sets of NDC coordinates:
1143 */
1144 brw_set_access_mode(p, BRW_ALIGN_1);
1145 brw_MOV(p, offset(m0, 2), ndc);
1146 brw_MOV(p, offset(m0, 3), pos);
1147
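/* Message layout for the URB write: m0 is the thread header (copied from
 * r0), m1 holds the point size / clip flag word written above, m2/m3 are
 * the NDC and clip-space positions, and the remaining outputs were already
 * written to m4 and up by the program itself.
 */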
1148 brw_urb_WRITE(p,
1149 brw_null_reg(), /* dest */
1150 0, /* starting mrf reg nr */
1151 c->r0, /* src */
1152 0, /* allocate */
1153 1, /* used */
1154 c->nr_outputs + 3, /* msg len */
1155 0, /* response len */
1156 1, /* eot */
1157 1, /* writes complete */
1158 0, /* urb destination offset */
1159 BRW_URB_SWIZZLE_INTERLEAVE);
1160 }
1161
1162
1163 /**
1164 * Called after code generation to resolve subroutine calls and the
1165 * END instruction.
1166 * \param end_inst points to brw code for END instruction
1167 * \param last_inst points to last instruction emitted before vertex write
1168 */
1169 static void
1170 post_vs_emit( struct brw_vs_compile *c,
1171 struct brw_instruction *end_inst,
1172 struct brw_instruction *last_inst )
1173 {
1174 GLint offset;
1175
1176 brw_resolve_cals(&c->func);
1177
1178 /* patch up the END code to jump past subroutines, etc */
1179 offset = last_inst - end_inst;
1180 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1181 }
1182
1183
1184 /* Emit the vertex program instructions here.
1185 */
1186 void brw_vs_emit(struct brw_vs_compile *c )
1187 {
1188 #define MAX_IF_DEPTH 32
1189 #define MAX_LOOP_DEPTH 32
1190 struct brw_compile *p = &c->func;
1191 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1192 GLuint insn, if_depth = 0, loop_depth = 0;
1193 GLuint end_offset = 0;
1194 struct brw_instruction *end_inst, *last_inst;
1195 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1196 const struct brw_indirect stack_index = brw_indirect(0, 0);
1197 GLuint index;
1198 GLuint file;
1199
1200 if (INTEL_DEBUG & DEBUG_VS) {
1201 _mesa_printf("vs-emit:\n");
1202 _mesa_print_program(&c->vp->program.Base);
1203 _mesa_printf("\n");
1204 }
1205
1206 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1207 brw_set_access_mode(p, BRW_ALIGN_16);
1208
1209 /* Message registers can't be read, so copy outputs that are also
1210 * used as source registers into GRF registers. */
1211 for (insn = 0; insn < nr_insns; insn++) {
1212 GLuint i;
1213 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1214 for (i = 0; i < 3; i++) {
1215 struct prog_src_register *src = &inst->SrcReg[i];
1216 GLuint index = src->Index;
1217 GLuint file = src->File;
1218 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1219 c->output_regs[index].used_in_src = GL_TRUE;
1220 }
1221 }
1222
1223 /* Static register allocation
1224 */
1225 brw_vs_alloc_regs(c);
1226 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1227
1228 for (insn = 0; insn < nr_insns; insn++) {
1229
1230 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1231 struct brw_reg args[3], dst;
1232 GLuint i;
1233
1234 #if 0
1235 printf("%d: ", insn);
1236 _mesa_print_instruction(inst);
1237 #endif
1238
1239 /* Get argument regs. SWZ is special and does this itself.
1240 */
1241 if (inst->Opcode != OPCODE_SWZ)
1242 for (i = 0; i < 3; i++) {
1243 const struct prog_src_register *src = &inst->SrcReg[i];
1244 index = src->Index;
1245 file = src->File;
1246 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1247 args[i] = c->output_regs[index].reg;
1248 else
1249 args[i] = get_arg(c, inst, i);
1250 }
1251
1252 /* Get dest regs. Note that it is possible for a reg to be both
1253 * dst and arg, given the static allocation of registers. So
1254 * care needs to be taken emitting multi-operation instructions.
1255 */
1256 index = inst->DstReg.Index;
1257 file = inst->DstReg.File;
1258 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1259 dst = c->output_regs[index].reg;
1260 else
1261 dst = get_dst(c, inst->DstReg);
1262
1263 if (inst->SaturateMode != SATURATE_OFF) {
1264 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1265 inst->SaturateMode);
1266 }
1267
1268 switch (inst->Opcode) {
1269 case OPCODE_ABS:
1270 brw_MOV(p, dst, brw_abs(args[0]));
1271 break;
1272 case OPCODE_ADD:
1273 brw_ADD(p, dst, args[0], args[1]);
1274 break;
1275 case OPCODE_COS:
1276 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1277 break;
1278 case OPCODE_DP3:
1279 brw_DP3(p, dst, args[0], args[1]);
1280 break;
1281 case OPCODE_DP4:
1282 brw_DP4(p, dst, args[0], args[1]);
1283 break;
1284 case OPCODE_DPH:
1285 brw_DPH(p, dst, args[0], args[1]);
1286 break;
1287 case OPCODE_NRM3:
1288 emit_nrm(c, dst, args[0], 3);
1289 break;
1290 case OPCODE_NRM4:
1291 emit_nrm(c, dst, args[0], 4);
1292 break;
1293 case OPCODE_DST:
1294 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1295 break;
1296 case OPCODE_EXP:
1297 unalias1(c, dst, args[0], emit_exp_noalias);
1298 break;
1299 case OPCODE_EX2:
1300 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1301 break;
1302 case OPCODE_ARL:
1303 emit_arl(c, dst, args[0]);
1304 break;
1305 case OPCODE_FLR:
1306 brw_RNDD(p, dst, args[0]);
1307 break;
1308 case OPCODE_FRC:
1309 brw_FRC(p, dst, args[0]);
1310 break;
1311 case OPCODE_LOG:
1312 unalias1(c, dst, args[0], emit_log_noalias);
1313 break;
1314 case OPCODE_LG2:
1315 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1316 break;
1317 case OPCODE_LIT:
1318 unalias1(c, dst, args[0], emit_lit_noalias);
1319 break;
1320 case OPCODE_LRP:
1321 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1322 break;
1323 case OPCODE_MAD:
1324 brw_MOV(p, brw_acc_reg(), args[2]);
1325 brw_MAC(p, dst, args[0], args[1]);
1326 break;
1327 case OPCODE_MAX:
1328 emit_max(p, dst, args[0], args[1]);
1329 break;
1330 case OPCODE_MIN:
1331 emit_min(p, dst, args[0], args[1]);
1332 break;
1333 case OPCODE_MOV:
1334 brw_MOV(p, dst, args[0]);
1335 break;
1336 case OPCODE_MUL:
1337 brw_MUL(p, dst, args[0], args[1]);
1338 break;
1339 case OPCODE_POW:
1340 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1341 break;
1342 case OPCODE_RCP:
1343 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1344 break;
1345 case OPCODE_RSQ:
1346 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1347 break;
1348
1349 case OPCODE_SEQ:
1350 emit_seq(p, dst, args[0], args[1]);
1351 break;
1352 case OPCODE_SIN:
1353 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1354 break;
1355 case OPCODE_SNE:
1356 emit_sne(p, dst, args[0], args[1]);
1357 break;
1358 case OPCODE_SGE:
1359 emit_sge(p, dst, args[0], args[1]);
1360 break;
1361 case OPCODE_SGT:
1362 emit_sgt(p, dst, args[0], args[1]);
1363 break;
1364 case OPCODE_SLT:
1365 emit_slt(p, dst, args[0], args[1]);
1366 break;
1367 case OPCODE_SLE:
1368 emit_sle(p, dst, args[0], args[1]);
1369 break;
1370 case OPCODE_SUB:
1371 brw_ADD(p, dst, args[0], negate(args[1]));
1372 break;
1373 case OPCODE_SWZ:
1374 /* The args[0] value can't be used here as it won't have
1375 * correctly encoded the full swizzle:
1376 */
1377 emit_swz(c, dst, inst);
1378 break;
1379 case OPCODE_TRUNC:
1380 /* round toward zero */
1381 brw_RNDZ(p, dst, args[0]);
1382 break;
1383 case OPCODE_XPD:
1384 emit_xpd(p, dst, args[0], args[1]);
1385 break;
1386 case OPCODE_IF:
1387 assert(if_depth < MAX_IF_DEPTH);
1388 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1389 break;
1390 case OPCODE_ELSE:
1391 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1392 break;
1393 case OPCODE_ENDIF:
1394 assert(if_depth > 0);
1395 brw_ENDIF(p, if_inst[--if_depth]);
1396 break;
1397 #if 0
1398 case OPCODE_BGNLOOP:
1399 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1400 break;
1401 case OPCODE_BRK:
1402 brw_BREAK(p);
1403 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1404 break;
1405 case OPCODE_CONT:
1406 brw_CONT(p);
1407 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1408 break;
1409 case OPCODE_ENDLOOP:
1410 {
1411 struct brw_instruction *inst0, *inst1;
1412 loop_depth--;
1413 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1414 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1415 while (inst0 > loop_inst[loop_depth]) {
1416 inst0--;
1417 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1418 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1419 inst0->bits3.if_else.pop_count = 0;
1420 }
1421 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1422 inst0->bits3.if_else.jump_count = inst1 - inst0;
1423 inst0->bits3.if_else.pop_count = 0;
1424 }
1425 }
1426 }
1427 break;
1428 #else
1429 (void) loop_inst;
1430 (void) loop_depth;
1431 #endif
1432 case OPCODE_BRA:
1433 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1434 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1435 brw_set_predicate_control_flag_value(p, 0xff);
1436 break;
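/* Subroutine call: store the return IP on the in-GRF stack (the IP of the
 * storing ADD plus three instructions, i.e. the instruction just past the
 * jump below), bump the stack pointer by one dword, then emit a placeholder
 * jump that brw_resolve_cals() later patches to point at the subroutine
 * label.  OPCODE_RET pops the saved address back into the IP register.
 */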
1437 case OPCODE_CAL:
1438 brw_set_access_mode(p, BRW_ALIGN_1);
1439 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1440 brw_set_access_mode(p, BRW_ALIGN_16);
1441 brw_ADD(p, get_addr_reg(stack_index),
1442 get_addr_reg(stack_index), brw_imm_d(4));
1443 brw_save_call(p, inst->Comment, p->nr_insn);
1444 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1445 break;
1446 case OPCODE_RET:
1447 brw_ADD(p, get_addr_reg(stack_index),
1448 get_addr_reg(stack_index), brw_imm_d(-4));
1449 brw_set_access_mode(p, BRW_ALIGN_1);
1450 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1451 brw_set_access_mode(p, BRW_ALIGN_16);
1452 break;
1453 case OPCODE_END:
1454 end_offset = p->nr_insn;
1455 /* this instruction will get patched later to jump past subroutine
1456 * code, etc.
1457 */
1458 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1459 break;
1460 case OPCODE_PRINT:
1461 /* no-op */
1462 break;
1463 case OPCODE_BGNSUB:
1464 brw_save_label(p, inst->Comment, p->nr_insn);
1465 break;
1466 case OPCODE_ENDSUB:
1467 /* no-op */
1468 break;
1469 default:
1470 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1471 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1472 _mesa_opcode_string(inst->Opcode) :
1473 "unknown");
1474 }
1475
1476 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1477 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1478 && c->output_regs[inst->DstReg.Index].used_in_src) {
1479 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1480 }
1481
1482 /* Result color clamping.
1483 *
1484 * When destination register is an output register and
1485 * it's primary/secondary front/back color, we have to clamp
1486 * the result to [0,1]. This is done by enabling the
1487 * saturation bit for the last instruction.
1488 *
1489 * We don't use brw_set_saturate() as it modifies
1490 * p->current->header.saturate, which affects all the subsequent
1491 * instructions. Instead, we directly modify the header
1492 * of the last (already stored) instruction.
1493 */
1494 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1495 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1496 || (inst->DstReg.Index == VERT_RESULT_COL1)
1497 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1498 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1499 p->store[p->nr_insn-1].header.saturate = 1;
1500 }
1501 }
1502
1503 release_tmps(c);
1504 }
1505
1506 end_inst = &p->store[end_offset];
1507 last_inst = &p->store[p->nr_insn];
1508
1509 /* The END instruction will be patched to jump to this code */
1510 emit_vertex_write(c);
1511
1512 post_vs_emit(c, end_inst, last_inst);
1513 }