Merge branch 'mesa_7_6_branch'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
/* Release every scratch GRF allocated since brw_vs_alloc_regs()
 * established first_tmp (typically called once per emitted opcode).
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
61
62
63 /**
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;    /* reg: next free GRF; mrf: next free message reg */
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      for (i = 0; i < c->key.nr_userclip; i++) {
         /* Two vec4 clip planes are packed per GRF. */
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         /* Two vec4 parameters packed per GRF, as with the clip planes. */
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF used for non-position outputs differs per generation. */
   if (BRW_IS_IGDNG(c->func.brw))
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & (1 << i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            /* Position stays in a GRF: it is read again later (e.g. by
             * emit_vertex_write() for NDC and clip-flag computation).
             */
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
            mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
            if (mrf < 16) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* too many vertex results to fit in MRF, use GRF for overflow */
               if (!c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging GRFs for constants fetched from the constant buffer,
       * one per source-argument slot (see get_constant()); index -1
       * marks each slot as not yet holding any constant.
       */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* GRF copies for outputs that are also read as sources
    * (used_in_src), since MRF outputs cannot be read back.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Two GRFs for the shader's stack register. */
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (BRW_IS_IGDNG(c->func.brw))
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
254
255
256 /**
257 * If an instruction uses a temp reg both as a src and the dest, we
258 * sometimes need to allocate an intermediate temporary.
259 */
260 static void unalias1( struct brw_vs_compile *c,
261 struct brw_reg dst,
262 struct brw_reg arg0,
263 void (*func)( struct brw_vs_compile *,
264 struct brw_reg,
265 struct brw_reg ))
266 {
267 if (dst.file == arg0.file && dst.nr == arg0.nr) {
268 struct brw_compile *p = &c->func;
269 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
270 func(c, tmp, arg0);
271 brw_MOV(p, dst, tmp);
272 release_tmp(c, tmp);
273 }
274 else {
275 func(c, dst, arg0);
276 }
277 }
278
/**
 * \sa unalias1
 * Checks if a 2-operand instruction needs an intermediate temporary.
 */
283 static void unalias2( struct brw_vs_compile *c,
284 struct brw_reg dst,
285 struct brw_reg arg0,
286 struct brw_reg arg1,
287 void (*func)( struct brw_vs_compile *,
288 struct brw_reg,
289 struct brw_reg,
290 struct brw_reg ))
291 {
292 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
293 (dst.file == arg1.file && dst.nr == arg1.nr)) {
294 struct brw_compile *p = &c->func;
295 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
296 func(c, tmp, arg0, arg1);
297 brw_MOV(p, dst, tmp);
298 release_tmp(c, tmp);
299 }
300 else {
301 func(c, dst, arg0, arg1);
302 }
303 }
304
/**
 * \sa unalias2
 * Checks if a 3-operand instruction needs an intermediate temporary.
 */
309 static void unalias3( struct brw_vs_compile *c,
310 struct brw_reg dst,
311 struct brw_reg arg0,
312 struct brw_reg arg1,
313 struct brw_reg arg2,
314 void (*func)( struct brw_vs_compile *,
315 struct brw_reg,
316 struct brw_reg,
317 struct brw_reg,
318 struct brw_reg ))
319 {
320 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
321 (dst.file == arg1.file && dst.nr == arg1.nr) ||
322 (dst.file == arg2.file && dst.nr == arg2.nr)) {
323 struct brw_compile *p = &c->func;
324 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
325 func(c, tmp, arg0, arg1, arg2);
326 brw_MOV(p, dst, tmp);
327 release_tmp(c, tmp);
328 }
329 else {
330 func(c, dst, arg0, arg1, arg2);
331 }
332 }
333
/**
 * Common helper for the SEQ/SNE/SLT/SLE/SGT/SGE opcodes:
 * per channel, dst = 1.0 where (arg0 <cond> arg1) holds, else 0.0.
 *
 * NOTE(review): the predication is implicit -- the CMP to a null
 * register updates the flag register and the following MOV of 1.0 is
 * presumably executed under that predicate (established inside
 * brw_CMP); the final call resets the flag value so subsequent
 * instructions are unaffected.  Confirm against brw_eu_emit.c.
 */
static void emit_sop( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
345
346 static void emit_seq( struct brw_compile *p,
347 struct brw_reg dst,
348 struct brw_reg arg0,
349 struct brw_reg arg1 )
350 {
351 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
352 }
353
354 static void emit_sne( struct brw_compile *p,
355 struct brw_reg dst,
356 struct brw_reg arg0,
357 struct brw_reg arg1 )
358 {
359 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
360 }
361 static void emit_slt( struct brw_compile *p,
362 struct brw_reg dst,
363 struct brw_reg arg0,
364 struct brw_reg arg1 )
365 {
366 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
367 }
368
369 static void emit_sle( struct brw_compile *p,
370 struct brw_reg dst,
371 struct brw_reg arg0,
372 struct brw_reg arg1 )
373 {
374 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
375 }
376
377 static void emit_sgt( struct brw_compile *p,
378 struct brw_reg dst,
379 struct brw_reg arg0,
380 struct brw_reg arg1 )
381 {
382 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
383 }
384
385 static void emit_sge( struct brw_compile *p,
386 struct brw_reg dst,
387 struct brw_reg arg0,
388 struct brw_reg arg1 )
389 {
390 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
391 }
392
393 static void emit_max( struct brw_compile *p,
394 struct brw_reg dst,
395 struct brw_reg arg0,
396 struct brw_reg arg1 )
397 {
398 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
399 brw_SEL(p, dst, arg1, arg0);
400 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
401 }
402
403 static void emit_min( struct brw_compile *p,
404 struct brw_reg dst,
405 struct brw_reg arg0,
406 struct brw_reg arg1 )
407 {
408 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
409 brw_SEL(p, dst, arg0, arg1);
410 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
411 }
412
413
/**
 * Emit a one-operand math-unit instruction (EXP, LOG, RSQ, INV, ...)
 * with the requested precision, staging the result through a scratch
 * GRF when the destination cannot take it directly.
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Stage through a temp when dst is partially writemasked or is not
    * a GRF (the math result arrives via SEND writeback).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,        /* message register -- NOTE(review): assumed base MRF; confirm against brw_math() */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
449
450
/**
 * Emit a two-operand math-unit instruction (e.g. POW), staging the
 * result through a scratch GRF when the destination cannot take it
 * directly (same SEND-writemask kludge as emit_math1()).
 *
 * NOTE(review): the second operand is staged in MRF 3, which is
 * presumably paired with the message register number (2) passed to
 * brw_math() below -- confirm against brw_eu_emit.c.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Load the second operand into the message payload. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
482
483
/**
 * Emit code for the EXP opcode (caller guarantees dst does not alias
 * arg0):
 *   result.x = 2^floor(arg0.x)   (built directly in the float bit pattern)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (via the math unit)
 *   result.w = 1.0
 * Each component is only computed if its writemask bit is set.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
539
540
/**
 * Emit code for the LOG opcode (caller guarantees dst does not alias
 * arg0):
 *   result.x = unbiased exponent of |arg0.x|
 *   result.y = mantissa of |arg0.x| as a float in [1, 2)
 *   result.z = log2(|arg0.x|)   (exponent + LOG2 of the mantissa)
 *   result.w = 1.0
 * Each component is only computed if its writemask bit is set.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The bit-twiddling below reads tmp back as a source, so a non-GRF
    * or partially-writemasked destination needs a scratch register.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* result[0]: clear the sign bit, extract the biased exponent
       * field, and unbias it.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* result[1]: keep the mantissa bits and force the exponent field
       * to the bias (127), producing a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
630
631
632 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
633 */
634 static void emit_dst_noalias( struct brw_vs_compile *c,
635 struct brw_reg dst,
636 struct brw_reg arg0,
637 struct brw_reg arg1)
638 {
639 struct brw_compile *p = &c->func;
640
641 /* There must be a better way to do this:
642 */
643 if (dst.dw1.bits.writemask & WRITEMASK_X)
644 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
645 if (dst.dw1.bits.writemask & WRITEMASK_Y)
646 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
647 if (dst.dw1.bits.writemask & WRITEMASK_Z)
648 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
649 if (dst.dw1.bits.writemask & WRITEMASK_W)
650 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
651 }
652
653
/**
 * Cross product: dst = t X u, via the identity
 *   dst = t.yzx * u.zxy - t.zxy * u.yzx
 *
 * NOTE(review): the MUL writes a null destination -- this presumably
 * relies on the product landing in the implicit accumulator, which the
 * MAC then combines with its own product.  Confirm against
 * brw_eu_emit.c / the EU ISA docs.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
662
663
664 static void emit_lit_noalias( struct brw_vs_compile *c,
665 struct brw_reg dst,
666 struct brw_reg arg0 )
667 {
668 struct brw_compile *p = &c->func;
669 struct brw_instruction *if_insn;
670 struct brw_reg tmp = dst;
671 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
672
673 if (need_tmp)
674 tmp = get_tmp(c);
675
676 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
677 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
678
679 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
680 * to get all channels active inside the IF. In the clipping code
681 * we run with NoMask, so it's not an option and we can use
682 * BRW_EXECUTE_1 for all comparisions.
683 */
684 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
685 if_insn = brw_IF(p, BRW_EXECUTE_8);
686 {
687 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
688
689 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
690 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
691 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
692
693 emit_math2(c,
694 BRW_MATH_FUNCTION_POW,
695 brw_writemask(dst, WRITEMASK_Z),
696 brw_swizzle1(tmp, 2),
697 brw_swizzle1(arg0, 3),
698 BRW_MATH_PRECISION_PARTIAL);
699 }
700
701 brw_ENDIF(p, if_insn);
702
703 release_tmp(c, tmp);
704 }
705
/**
 * Emit LRP: dst = arg0 * arg1 + (1 - arg0) * arg2; the caller
 * guarantees dst aliases none of the sources.
 *
 * NOTE(review): the MUL to a null destination presumably leaves its
 * product in the implicit accumulator for the MAC to add -- confirm
 * against brw_eu_emit.c.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   /* dst = 1 - arg0 */
   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   /* acc = (1 - arg0) * arg2 */
   brw_MUL(p, brw_null_reg(), dst, arg2);
   /* dst = arg0 * arg1 + acc */
   brw_MAC(p, dst, arg0, arg1);
}
718
719 /** 3 or 4-component vector normalization */
720 static void emit_nrm( struct brw_vs_compile *c,
721 struct brw_reg dst,
722 struct brw_reg arg0,
723 int num_comps)
724 {
725 struct brw_compile *p = &c->func;
726 struct brw_reg tmp = get_tmp(c);
727
728 /* tmp = dot(arg0, arg0) */
729 if (num_comps == 3)
730 brw_DP3(p, tmp, arg0, arg0);
731 else
732 brw_DP4(p, tmp, arg0, arg0);
733
734 /* tmp = 1 / sqrt(tmp) */
735 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
736
737 /* dst = arg0 * tmp */
738 brw_MUL(p, dst, arg0, tmp);
739
740 release_tmp(c, tmp);
741 }
742
743
/**
 * Fetch a vertex-program constant from the VS constant buffer into the
 * staging GRF reserved for source argument \p argIndex, and return a
 * register reference to it.
 *
 * Reads are cached per argument slot: if the staging register already
 * holds src->Index and addressing is absolute, no new read is issued.
 * With relative addressing two owords are fetched -- the second
 * through the upper half of the address register -- and merged;
 * NOTE(review): presumably to cover indirect accesses whose resolved
 * addresses differ between the two vertices processed together.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;
   const GLboolean relAddr = src->RelAddr;

   /* Only three staging registers were allocated (one per source arg). */
   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index || relAddr) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * src->Index,         /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
813
814
815
816 /* TODO: relative addressing!
817 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
			       gl_register_file file,
			       GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      /* Register must have been allocated in brw_vs_alloc_regs(). */
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* All constant-like files share the PROGRAM_STATE_VAR slots. */
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      /* Only a single address register is supported here. */
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED: /* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      /* Unsupported register file for this backend. */
      assert(0);
      return brw_null_reg();
   }
}
848
849
850 /**
851 * Indirect addressing: get reg[[arg] + offset].
852 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* Static part of the address: 32 bytes per GRF plus 16 bytes per
    * vec4 step of the constant offset; the dynamic part comes from the
    * vertex program's address register at runtime.
    */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      /* Second vertex's value: use the other half of the address reg. */
      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
884
885
886 /**
887 * Get brw reg corresponding to the instruction's [argIndex] src reg.
888 * TODO: relative addressing!
889 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* Address-register-relative access into the file. */
         return deref(c, c->regs[file][0], index);
      }
      else {
         /* Direct: must have been allocated in brw_vs_alloc_regs(). */
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
      if (c->vp->use_const_buffer) {
         /* Constants live in a real constant buffer: fetch on demand. */
         return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         /* Constants were preloaded into the GRF via the curbe. */
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
940
941
942 static void emit_arl( struct brw_vs_compile *c,
943 struct brw_reg dst,
944 struct brw_reg arg0 )
945 {
946 struct brw_compile *p = &c->func;
947 struct brw_reg tmp = dst;
948 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
949
950 if (need_tmp)
951 tmp = get_tmp(c);
952
953 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
954 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
955
956 if (need_tmp)
957 release_tmp(c, tmp);
958 }
959
960
961 /**
962 * Return the brw reg for the given instruction's src argument.
963 * Will return mangled results for SWZ op. The emit_swz() function
964 * ignores this result and recalculates taking extended swizzles into
965 * account.
966 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
			       const struct prog_instruction *inst,
			       GLuint argIndex )
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_reg reg;

   if (src->File == PROGRAM_UNDEFINED)
      return brw_null_reg();

   reg = get_src_reg(c, inst, argIndex);

   /* Convert 3-bit swizzle to 2-bit.
    * (Extended terms like SWIZZLE_ZERO/ONE do not fit in the hardware
    * encoding; for SWZ this mangled result is discarded by emit_swz().)
    */
   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
				       GET_SWZ(src->Swizzle, 1),
				       GET_SWZ(src->Swizzle, 2),
				       GET_SWZ(src->Swizzle, 3));

   /* Note this is ok for non-swizzle instructions:
    */
   reg.negate = src->Negate ? 1 : 0;

   return reg;
}
992
993
994 /**
995 * Get brw register for the given program dest register.
996 */
static struct brw_reg get_dst( struct brw_vs_compile *c,
			       struct prog_dst_register dst )
{
   struct brw_reg reg;

   switch (dst.File) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_OUTPUT:
      /* Register must have been allocated in brw_vs_alloc_regs(). */
      assert(c->regs[dst.File][dst.Index].nr != 0);
      reg = c->regs[dst.File][dst.Index];
      break;
   case PROGRAM_ADDRESS:
      /* Only a single address register is supported. */
      assert(dst.Index == 0);
      reg = c->regs[dst.File][dst.Index];
      break;
   case PROGRAM_UNDEFINED:
      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
      reg = brw_null_reg();
      break;
   default:
      assert(0);
      reg = brw_null_reg();
   }

   /* Propagate the instruction's writemask onto the returned reg. */
   reg.dw1.bits.writemask = dst.WriteMask;

   return reg;
}
1025
1026
/**
 * Emit code for the SWZ opcode, which supports the extended swizzle
 * terms ZERO and ONE in addition to plain component selection, plus
 * per-component negation.
 *
 * The write is split into up to three MOVs -- components taken from
 * the source, components forced to 0, components forced to 1 -- and a
 * final negate pass applied under src.Negate used as a writemask.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate pass reads the destination back, so a non-GRF dst
    * must be staged in a scratch register first.
    */
   GLboolean need_tmp = (src.Negate &&
			 dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written component: from-source, zero, or one. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* NOTE(review): src.Negate is used directly as a writemask here,
    * i.e. it is assumed to be a per-component XYZW bitmask.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1095
1096
1097 /**
1098 * Post-vertex-program processing. Send the results to the URB.
1099 */
1100 static void emit_vertex_write( struct brw_vs_compile *c)
1101 {
1102 struct brw_compile *p = &c->func;
1103 struct brw_reg m0 = brw_message_reg(0);
1104 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1105 struct brw_reg ndc;
1106 int eot;
1107 GLuint len_vertext_header = 2;
1108
1109 if (c->key.copy_edgeflag) {
1110 brw_MOV(p,
1111 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1112 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1113 }
1114
1115 /* Build ndc coords */
1116 ndc = get_tmp(c);
1117 /* ndc = 1.0 / pos.w */
1118 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1119 /* ndc.xyz = pos * ndc */
1120 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1121
1122 /* Update the header for point size, user clipping flags, and -ve rhw
1123 * workaround.
1124 */
1125 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1126 c->key.nr_userclip || BRW_IS_965(p->brw))
1127 {
1128 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1129 GLuint i;
1130
1131 brw_MOV(p, header1, brw_imm_ud(0));
1132
1133 brw_set_access_mode(p, BRW_ALIGN_16);
1134
1135 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1136 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1137 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1138 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1139 }
1140
1141 for (i = 0; i < c->key.nr_userclip; i++) {
1142 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1143 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1144 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1145 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1146 }
1147
1148 /* i965 clipping workaround:
1149 * 1) Test for -ve rhw
1150 * 2) If set,
1151 * set ndc = (0,0,0,0)
1152 * set ucp[6] = 1
1153 *
1154 * Later, clipping will detect ucp[6] and ensure the primitive is
1155 * clipped against all fixed planes.
1156 */
1157 if (BRW_IS_965(p->brw)) {
1158 brw_CMP(p,
1159 vec8(brw_null_reg()),
1160 BRW_CONDITIONAL_L,
1161 brw_swizzle1(ndc, 3),
1162 brw_imm_f(0));
1163
1164 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1165 brw_MOV(p, ndc, brw_imm_f(0));
1166 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1167 }
1168
1169 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1170 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1171 brw_set_access_mode(p, BRW_ALIGN_16);
1172
1173 release_tmp(c, header1);
1174 }
1175 else {
1176 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1177 }
1178
1179 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1180 * of zeros followed by two sets of NDC coordinates:
1181 */
1182 brw_set_access_mode(p, BRW_ALIGN_1);
1183 brw_MOV(p, offset(m0, 2), ndc);
1184
1185 if (BRW_IS_IGDNG(p->brw)) {
1186 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1187 brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1188 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1189 * Seems it is useless for us.
1190 * m6 is used for aligning, so that the remainder of vertex element is
1191 * reg-aligned.
1192 */
1193 brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1194 len_vertext_header = 6;
1195 } else {
1196 brw_MOV(p, offset(m0, 3), pos);
1197 len_vertext_header = 2;
1198 }
1199
1200 eot = (c->first_overflow_output == 0);
1201
1202 brw_urb_WRITE(p,
1203 brw_null_reg(), /* dest */
1204 0, /* starting mrf reg nr */
1205 c->r0, /* src */
1206 0, /* allocate */
1207 1, /* used */
1208 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1209 0, /* response len */
1210 eot, /* eot */
1211 eot, /* writes complete */
1212 0, /* urb destination offset */
1213 BRW_URB_SWIZZLE_INTERLEAVE);
1214
1215 if (c->first_overflow_output > 0) {
1216 /* Not all of the vertex outputs/results fit into the MRF.
1217 * Move the overflowed attributes from the GRF to the MRF and
1218 * issue another brw_urb_WRITE().
1219 */
1220 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1221 * at mrf[4] atm...
1222 */
1223 GLuint i, mrf = 0;
1224 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1225 if (c->prog_data.outputs_written & (1 << i)) {
1226 /* move from GRF to MRF */
1227 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1228 mrf++;
1229 }
1230 }
1231
1232 brw_urb_WRITE(p,
1233 brw_null_reg(), /* dest */
1234 4, /* starting mrf reg nr */
1235 c->r0, /* src */
1236 0, /* allocate */
1237 1, /* used */
1238 mrf+1, /* msg len */
1239 0, /* response len */
1240 1, /* eot */
1241 1, /* writes complete */
1242 BRW_MAX_MRF-1, /* urb destination offset */
1243 BRW_URB_SWIZZLE_INTERLEAVE);
1244 }
1245 }
1246
1247
1248 /**
1249 * Called after code generation to resolve subroutine calls and the
1250 * END instruction.
1251 * \param end_inst points to brw code for END instruction
1252 * \param last_inst points to last instruction emitted before vertex write
1253 */
1254 static void
1255 post_vs_emit( struct brw_vs_compile *c,
1256 struct brw_instruction *end_inst,
1257 struct brw_instruction *last_inst )
1258 {
1259 GLint offset;
1260
1261 brw_resolve_cals(&c->func);
1262
1263 /* patch up the END code to jump past subroutines, etc */
1264 offset = last_inst - end_inst;
1265 if (offset > 1) {
1266 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1267 } else {
1268 end_inst->header.opcode = BRW_OPCODE_NOP;
1269 }
1270 }
1271
1272 static uint32_t
1273 get_predicate(const struct prog_instruction *inst)
1274 {
1275 if (inst->DstReg.CondMask == COND_TR)
1276 return BRW_PREDICATE_NONE;
1277
1278 /* All of GLSL only produces predicates for COND_NE and one channel per
1279 * vector. Fail badly if someone starts doing something else, as it might
1280 * mean infinite looping or something.
1281 *
1282 * We'd like to support all the condition codes, but our hardware doesn't
1283 * quite match the Mesa IR, which is modeled after the NV extensions. For
1284 * those, the instruction may update the condition codes or not, then any
1285 * later instruction may use one of those condition codes. For gen4, the
1286 * instruction may update the flags register based on one of the condition
1287 * codes output by the instruction, and then further instructions may
1288 * predicate on that. We can probably support this, but it won't
1289 * necessarily be easy.
1290 */
1291 assert(inst->DstReg.CondMask == COND_NE);
1292
1293 switch (inst->DstReg.CondSwizzle) {
1294 case SWIZZLE_XXXX:
1295 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1296 case SWIZZLE_YYYY:
1297 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1298 case SWIZZLE_ZZZZ:
1299 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1300 case SWIZZLE_WWWW:
1301 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1302 default:
1303 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1304 inst->DstReg.CondMask);
1305 return BRW_PREDICATE_NORMAL;
1306 }
1307 }
1308
1309 /* Emit the vertex program instructions here.
1310 */
1311 void brw_vs_emit(struct brw_vs_compile *c )
1312 {
1313 #define MAX_IF_DEPTH 32
1314 #define MAX_LOOP_DEPTH 32
1315 struct brw_compile *p = &c->func;
1316 struct brw_context *brw = p->brw;
1317 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1318 GLuint insn, if_depth = 0, loop_depth = 0;
1319 GLuint end_offset = 0;
1320 struct brw_instruction *end_inst, *last_inst;
1321 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1322 const struct brw_indirect stack_index = brw_indirect(0, 0);
1323 GLuint index;
1324 GLuint file;
1325
1326 if (INTEL_DEBUG & DEBUG_VS) {
1327 _mesa_printf("vs-mesa:\n");
1328 _mesa_print_program(&c->vp->program.Base);
1329 _mesa_printf("\n");
1330 }
1331
1332 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1333 brw_set_access_mode(p, BRW_ALIGN_16);
1334
1335 /* Message registers can't be read, so copy the output into GRF register
1336 if they are used in source registers */
1337 for (insn = 0; insn < nr_insns; insn++) {
1338 GLuint i;
1339 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1340 for (i = 0; i < 3; i++) {
1341 struct prog_src_register *src = &inst->SrcReg[i];
1342 GLuint index = src->Index;
1343 GLuint file = src->File;
1344 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1345 c->output_regs[index].used_in_src = GL_TRUE;
1346 }
1347 }
1348
1349 /* Static register allocation
1350 */
1351 brw_vs_alloc_regs(c);
1352 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1353
1354 for (insn = 0; insn < nr_insns; insn++) {
1355
1356 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1357 struct brw_reg args[3], dst;
1358 GLuint i;
1359
1360 #if 0
1361 printf("%d: ", insn);
1362 _mesa_print_instruction(inst);
1363 #endif
1364
1365 /* Get argument regs. SWZ is special and does this itself.
1366 */
1367 if (inst->Opcode != OPCODE_SWZ)
1368 for (i = 0; i < 3; i++) {
1369 const struct prog_src_register *src = &inst->SrcReg[i];
1370 index = src->Index;
1371 file = src->File;
1372 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1373 args[i] = c->output_regs[index].reg;
1374 else
1375 args[i] = get_arg(c, inst, i);
1376 }
1377
1378 /* Get dest regs. Note that it is possible for a reg to be both
1379 * dst and arg, given the static allocation of registers. So
1380 * care needs to be taken emitting multi-operation instructions.
1381 */
1382 index = inst->DstReg.Index;
1383 file = inst->DstReg.File;
1384 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1385 dst = c->output_regs[index].reg;
1386 else
1387 dst = get_dst(c, inst->DstReg);
1388
1389 if (inst->SaturateMode != SATURATE_OFF) {
1390 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1391 inst->SaturateMode);
1392 }
1393
1394 switch (inst->Opcode) {
1395 case OPCODE_ABS:
1396 brw_MOV(p, dst, brw_abs(args[0]));
1397 break;
1398 case OPCODE_ADD:
1399 brw_ADD(p, dst, args[0], args[1]);
1400 break;
1401 case OPCODE_COS:
1402 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1403 break;
1404 case OPCODE_DP3:
1405 brw_DP3(p, dst, args[0], args[1]);
1406 break;
1407 case OPCODE_DP4:
1408 brw_DP4(p, dst, args[0], args[1]);
1409 break;
1410 case OPCODE_DPH:
1411 brw_DPH(p, dst, args[0], args[1]);
1412 break;
1413 case OPCODE_NRM3:
1414 emit_nrm(c, dst, args[0], 3);
1415 break;
1416 case OPCODE_NRM4:
1417 emit_nrm(c, dst, args[0], 4);
1418 break;
1419 case OPCODE_DST:
1420 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1421 break;
1422 case OPCODE_EXP:
1423 unalias1(c, dst, args[0], emit_exp_noalias);
1424 break;
1425 case OPCODE_EX2:
1426 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1427 break;
1428 case OPCODE_ARL:
1429 emit_arl(c, dst, args[0]);
1430 break;
1431 case OPCODE_FLR:
1432 brw_RNDD(p, dst, args[0]);
1433 break;
1434 case OPCODE_FRC:
1435 brw_FRC(p, dst, args[0]);
1436 break;
1437 case OPCODE_LOG:
1438 unalias1(c, dst, args[0], emit_log_noalias);
1439 break;
1440 case OPCODE_LG2:
1441 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1442 break;
1443 case OPCODE_LIT:
1444 unalias1(c, dst, args[0], emit_lit_noalias);
1445 break;
1446 case OPCODE_LRP:
1447 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1448 break;
1449 case OPCODE_MAD:
1450 brw_MOV(p, brw_acc_reg(), args[2]);
1451 brw_MAC(p, dst, args[0], args[1]);
1452 break;
1453 case OPCODE_MAX:
1454 emit_max(p, dst, args[0], args[1]);
1455 break;
1456 case OPCODE_MIN:
1457 emit_min(p, dst, args[0], args[1]);
1458 break;
1459 case OPCODE_MOV:
1460 brw_MOV(p, dst, args[0]);
1461 break;
1462 case OPCODE_MUL:
1463 brw_MUL(p, dst, args[0], args[1]);
1464 break;
1465 case OPCODE_POW:
1466 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1467 break;
1468 case OPCODE_RCP:
1469 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1470 break;
1471 case OPCODE_RSQ:
1472 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1473 break;
1474
1475 case OPCODE_SEQ:
1476 emit_seq(p, dst, args[0], args[1]);
1477 break;
1478 case OPCODE_SIN:
1479 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1480 break;
1481 case OPCODE_SNE:
1482 emit_sne(p, dst, args[0], args[1]);
1483 break;
1484 case OPCODE_SGE:
1485 emit_sge(p, dst, args[0], args[1]);
1486 break;
1487 case OPCODE_SGT:
1488 emit_sgt(p, dst, args[0], args[1]);
1489 break;
1490 case OPCODE_SLT:
1491 emit_slt(p, dst, args[0], args[1]);
1492 break;
1493 case OPCODE_SLE:
1494 emit_sle(p, dst, args[0], args[1]);
1495 break;
1496 case OPCODE_SUB:
1497 brw_ADD(p, dst, args[0], negate(args[1]));
1498 break;
1499 case OPCODE_SWZ:
1500 /* The args[0] value can't be used here as it won't have
1501 * correctly encoded the full swizzle:
1502 */
1503 emit_swz(c, dst, inst);
1504 break;
1505 case OPCODE_TRUNC:
1506 /* round toward zero */
1507 brw_RNDZ(p, dst, args[0]);
1508 break;
1509 case OPCODE_XPD:
1510 emit_xpd(p, dst, args[0], args[1]);
1511 break;
1512 case OPCODE_IF:
1513 assert(if_depth < MAX_IF_DEPTH);
1514 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1515 /* Note that brw_IF smashes the predicate_control field. */
1516 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1517 if_depth++;
1518 break;
1519 case OPCODE_ELSE:
1520 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1521 break;
1522 case OPCODE_ENDIF:
1523 assert(if_depth > 0);
1524 brw_ENDIF(p, if_inst[--if_depth]);
1525 break;
1526 case OPCODE_BGNLOOP:
1527 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1528 break;
1529 case OPCODE_BRK:
1530 brw_set_predicate_control(p, get_predicate(inst));
1531 brw_BREAK(p);
1532 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1533 break;
1534 case OPCODE_CONT:
1535 brw_set_predicate_control(p, get_predicate(inst));
1536 brw_CONT(p);
1537 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1538 break;
1539 case OPCODE_ENDLOOP:
1540 {
1541 struct brw_instruction *inst0, *inst1;
1542 GLuint br = 1;
1543
1544 loop_depth--;
1545
1546 if (BRW_IS_IGDNG(brw))
1547 br = 2;
1548
1549 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1550 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1551 while (inst0 > loop_inst[loop_depth]) {
1552 inst0--;
1553 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1554 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1555 inst0->bits3.if_else.pop_count = 0;
1556 }
1557 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1558 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1559 inst0->bits3.if_else.pop_count = 0;
1560 }
1561 }
1562 }
1563 break;
1564 case OPCODE_BRA:
1565 brw_set_predicate_control(p, get_predicate(inst));
1566 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1567 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1568 break;
1569 case OPCODE_CAL:
1570 brw_set_access_mode(p, BRW_ALIGN_1);
1571 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1572 brw_set_access_mode(p, BRW_ALIGN_16);
1573 brw_ADD(p, get_addr_reg(stack_index),
1574 get_addr_reg(stack_index), brw_imm_d(4));
1575 brw_save_call(p, inst->Comment, p->nr_insn);
1576 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1577 break;
1578 case OPCODE_RET:
1579 brw_ADD(p, get_addr_reg(stack_index),
1580 get_addr_reg(stack_index), brw_imm_d(-4));
1581 brw_set_access_mode(p, BRW_ALIGN_1);
1582 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1583 brw_set_access_mode(p, BRW_ALIGN_16);
1584 break;
1585 case OPCODE_END:
1586 end_offset = p->nr_insn;
1587 /* this instruction will get patched later to jump past subroutine
1588 * code, etc.
1589 */
1590 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1591 break;
1592 case OPCODE_PRINT:
1593 /* no-op */
1594 break;
1595 case OPCODE_BGNSUB:
1596 brw_save_label(p, inst->Comment, p->nr_insn);
1597 break;
1598 case OPCODE_ENDSUB:
1599 /* no-op */
1600 break;
1601 default:
1602 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1603 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1604 _mesa_opcode_string(inst->Opcode) :
1605 "unknown");
1606 }
1607
1608 /* Set the predication update on the last instruction of the native
1609 * instruction sequence.
1610 *
1611 * This would be problematic if it was set on a math instruction,
1612 * but that shouldn't be the case with the current GLSL compiler.
1613 */
1614 if (inst->CondUpdate) {
1615 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1616
1617 assert(hw_insn->header.destreg__conditionalmod == 0);
1618 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1619 }
1620
1621 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1622 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1623 && c->output_regs[inst->DstReg.Index].used_in_src) {
1624 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1625 }
1626
1627 /* Result color clamping.
1628 *
1629 * When destination register is an output register and
1630 * it's primary/secondary front/back color, we have to clamp
1631 * the result to [0,1]. This is done by enabling the
1632 * saturation bit for the last instruction.
1633 *
1634 * We don't use brw_set_saturate() as it modifies
1635 * p->current->header.saturate, which affects all the subsequent
1636 * instructions. Instead, we directly modify the header
1637 * of the last (already stored) instruction.
1638 */
1639 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1640 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1641 || (inst->DstReg.Index == VERT_RESULT_COL1)
1642 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1643 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1644 p->store[p->nr_insn-1].header.saturate = 1;
1645 }
1646 }
1647
1648 release_tmps(c);
1649 }
1650
1651 end_inst = &p->store[end_offset];
1652 last_inst = &p->store[p->nr_insn];
1653
1654 /* The END instruction will be patched to jump to this code */
1655 emit_vertex_write(c);
1656
1657 post_vs_emit(c, end_inst, last_inst);
1658
1659 if (INTEL_DEBUG & DEBUG_VS) {
1660 int i;
1661
1662 _mesa_printf("vs-native:\n");
1663 for (i = 0; i < p->nr_insn; i++)
1664 brw_disasm(stderr, &p->store[i]);
1665 _mesa_printf("\n");
1666 }
1667 }