f7b07266362fefecbba6fa516c5c865871da22ae
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order (each advances 'reg'): r0, user clip planes (curbe),
 * shader constants (curbe, unless a real constant buffer is used), vertex
 * inputs, outputs that overflow the MRFs, program temporaries, address
 * reg(s), constant-buffer staging regs, shadow copies of outputs read as
 * sources, the subroutine stack, and finally the scratch area handed out
 * by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two planes are packed per register (4 floats each). */
      for (i = 0; i < c->key.nr_userclip; i++) {
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         /* two vec4 params per register, same packing as clip planes */
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First free MRF differs per chipset — IGDNG has a larger vertex
    * header in the URB write message.
    */
   if (BRW_IS_IGDNG(c->func.brw))
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & (1 << i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            /* position needs post-processing (NDC), so keep it in a GRF */
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
            mrf++;      /* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
            if (mrf < 16) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* too many vertex results to fit in MRF, use GRF for overflow */
               if (!c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
                                             reg,
                                             0,
                                             BRW_REGISTER_TYPE_D,
                                             BRW_VERTICAL_STRIDE_8,
                                             BRW_WIDTH_8,
                                             BRW_HORIZONTAL_STRIDE_1,
                                             BRW_SWIZZLE_XXXX,
                                             WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging registers used by get_constant() — one per source arg. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Shadow copies for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Two registers for the subroutine call/return stack. */
   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;      /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (BRW_IS_IGDNG(c->func.brw))
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
254
255
256 /**
257 * If an instruction uses a temp reg both as a src and the dest, we
258 * sometimes need to allocate an intermediate temporary.
259 */
260 static void unalias1( struct brw_vs_compile *c,
261 struct brw_reg dst,
262 struct brw_reg arg0,
263 void (*func)( struct brw_vs_compile *,
264 struct brw_reg,
265 struct brw_reg ))
266 {
267 if (dst.file == arg0.file && dst.nr == arg0.nr) {
268 struct brw_compile *p = &c->func;
269 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
270 func(c, tmp, arg0);
271 brw_MOV(p, dst, tmp);
272 release_tmp(c, tmp);
273 }
274 else {
275 func(c, dst, arg0);
276 }
277 }
278
279 /**
280 * \sa unalias2
281 * Checkes if 2-operand instruction needs an intermediate temporary.
282 */
283 static void unalias2( struct brw_vs_compile *c,
284 struct brw_reg dst,
285 struct brw_reg arg0,
286 struct brw_reg arg1,
287 void (*func)( struct brw_vs_compile *,
288 struct brw_reg,
289 struct brw_reg,
290 struct brw_reg ))
291 {
292 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
293 (dst.file == arg1.file && dst.nr == arg1.nr)) {
294 struct brw_compile *p = &c->func;
295 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
296 func(c, tmp, arg0, arg1);
297 brw_MOV(p, dst, tmp);
298 release_tmp(c, tmp);
299 }
300 else {
301 func(c, dst, arg0, arg1);
302 }
303 }
304
305 /**
306 * \sa unalias2
307 * Checkes if 3-operand instruction needs an intermediate temporary.
308 */
309 static void unalias3( struct brw_vs_compile *c,
310 struct brw_reg dst,
311 struct brw_reg arg0,
312 struct brw_reg arg1,
313 struct brw_reg arg2,
314 void (*func)( struct brw_vs_compile *,
315 struct brw_reg,
316 struct brw_reg,
317 struct brw_reg,
318 struct brw_reg ))
319 {
320 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
321 (dst.file == arg1.file && dst.nr == arg1.nr) ||
322 (dst.file == arg2.file && dst.nr == arg2.nr)) {
323 struct brw_compile *p = &c->func;
324 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
325 func(c, tmp, arg0, arg1, arg2);
326 brw_MOV(p, dst, tmp);
327 release_tmp(c, tmp);
328 }
329 else {
330 func(c, dst, arg0, arg1, arg2);
331 }
332 }
333
/**
 * Shared body for the SLT/SLE/SGT/SGE/SEQ/SNE opcodes: each enabled
 * channel of dst becomes 1.0 where (arg0 <cond> arg1) holds, 0.0
 * elsewhere.
 *
 * Statement order is significant: dst is cleared to 0.0, the CMP
 * establishes the flag register, and the MOV of 1.0 is presumably
 * predicated on those flag bits — confirm against brw_CMP()'s
 * predication side effects in brw_eu_emit.c.
 * NOTE(review): the trailing call records an all-ones flag value,
 * apparently to reset predication state for subsequent instructions.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
347
348 static void emit_seq( struct brw_vs_compile *c,
349 struct brw_reg dst,
350 struct brw_reg arg0,
351 struct brw_reg arg1 )
352 {
353 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
354 }
355
356 static void emit_sne( struct brw_vs_compile *c,
357 struct brw_reg dst,
358 struct brw_reg arg0,
359 struct brw_reg arg1 )
360 {
361 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
362 }
363 static void emit_slt( struct brw_vs_compile *c,
364 struct brw_reg dst,
365 struct brw_reg arg0,
366 struct brw_reg arg1 )
367 {
368 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
369 }
370
371 static void emit_sle( struct brw_vs_compile *c,
372 struct brw_reg dst,
373 struct brw_reg arg0,
374 struct brw_reg arg1 )
375 {
376 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
377 }
378
379 static void emit_sgt( struct brw_vs_compile *c,
380 struct brw_reg dst,
381 struct brw_reg arg0,
382 struct brw_reg arg1 )
383 {
384 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
385 }
386
387 static void emit_sge( struct brw_vs_compile *c,
388 struct brw_reg dst,
389 struct brw_reg arg0,
390 struct brw_reg arg1 )
391 {
392 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
393 }
394
/**
 * MAX: dst = componentwise max(arg0, arg1).
 * The CMP sets the flag register where arg0 < arg1; the SEL then yields
 * arg1 on those channels (SEL is presumably predicated by the preceding
 * CMP — confirm in brw_eu_emit.c).  Predication is cleared afterwards so
 * later instructions are unaffected.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
404
/**
 * MIN: dst = componentwise min(arg0, arg1).
 * Same CMP/SEL pattern as emit_max() but with the SEL operand order
 * swapped, so channels where arg0 < arg1 keep arg0.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
414
415
/**
 * Emit a one-operand math-box operation (EXP, LOG, RSQ, INV, ...).
 *
 * \param function   one of the BRW_MATH_FUNCTION_* codes
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Stage through a full-writemask GRF when the real dest is partially
    * masked or not a GRF (see kludge note above).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,          /* NOTE(review): presumably the message reg number — confirm in brw_math() */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
451
452
/**
 * Emit a two-operand math-box operation (e.g. POW).  The second operand
 * is loaded into message reg 3 before the math SEND.
 * NOTE(review): the hard-coded m3 assumes the math message payload
 * starts at m2 — confirm against the chipset's math message layout.
 * Same writemask/GRF staging kludge as emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* second operand travels in the message payload */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
484
485
/**
 * EXP opcode, per channel of the writemask:
 *   result.x = 2^floor(arg0.x)   (built by bit manipulation)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (math box)
 *   result.w = 1.0
 * "noalias": dst must not overlap arg0 — callers go through unalias1().
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       * (shifting the biased exponent into bits 30:23 of an IEEE float)
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
541
542
/**
 * LOG opcode:
 *   result.x = unbiased exponent of |arg0.x|  (~floor(log2))
 *   result.y = mantissa of |arg0.x| mapped into [1, 2)
 *   result.z = log2(|arg0.x|)  (math box, plus the exponent term)
 *   result.w = 1.0
 * "noalias": dst must not overlap arg0 — callers go through unalias1().
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The Z result reads the X/Y intermediates via swizzles, so stage
    * through a full GRF when the dest is masked or not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* mask off the sign bit, then shift the exponent field down */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* remove the IEEE exponent bias */
      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* keep the mantissa bits, then force an exponent of 0 (bias 127) */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
632
633
634 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
635 */
636 static void emit_dst_noalias( struct brw_vs_compile *c,
637 struct brw_reg dst,
638 struct brw_reg arg0,
639 struct brw_reg arg1)
640 {
641 struct brw_compile *p = &c->func;
642
643 /* There must be a better way to do this:
644 */
645 if (dst.dw1.bits.writemask & WRITEMASK_X)
646 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
647 if (dst.dw1.bits.writemask & WRITEMASK_Y)
648 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
649 if (dst.dw1.bits.writemask & WRITEMASK_Z)
650 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
651 if (dst.dw1.bits.writemask & WRITEMASK_W)
652 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
653 }
654
655
/**
 * XPD: dst = cross(t, u), using the implicit accumulator:
 *   acc = t.yzxw * u.zxyw           (MUL writes the accumulator)
 *   dst = acc - t.zxyw * u.yzxw     (MAC adds its product to acc)
 * The MUL/MAC pair must stay adjacent — the accumulator carries the
 * intermediate product between them.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
664
665
666 static void emit_lit_noalias( struct brw_vs_compile *c,
667 struct brw_reg dst,
668 struct brw_reg arg0 )
669 {
670 struct brw_compile *p = &c->func;
671 struct brw_instruction *if_insn;
672 struct brw_reg tmp = dst;
673 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
674
675 if (need_tmp)
676 tmp = get_tmp(c);
677
678 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
679 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
680
681 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
682 * to get all channels active inside the IF. In the clipping code
683 * we run with NoMask, so it's not an option and we can use
684 * BRW_EXECUTE_1 for all comparisions.
685 */
686 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
687 if_insn = brw_IF(p, BRW_EXECUTE_8);
688 {
689 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
690
691 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
692 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
693 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
694
695 emit_math2(c,
696 BRW_MATH_FUNCTION_POW,
697 brw_writemask(dst, WRITEMASK_Z),
698 brw_swizzle1(tmp, 2),
699 brw_swizzle1(arg0, 3),
700 BRW_MATH_PRECISION_PARTIAL);
701 }
702
703 brw_ENDIF(p, if_insn);
704
705 release_tmp(c, tmp);
706 }
707
/**
 * LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, assuming dst aliases no
 * source (callers go through unalias3).
 * Uses the implicit accumulator: the ADD leaves (1 - arg0) in dst, the
 * MUL puts (1 - arg0) * arg2 in the accumulator, and the MAC adds
 * arg0 * arg1 on top.  The three instructions must stay in this order.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
720
721 /** 3 or 4-component vector normalization */
722 static void emit_nrm( struct brw_vs_compile *c,
723 struct brw_reg dst,
724 struct brw_reg arg0,
725 int num_comps)
726 {
727 struct brw_compile *p = &c->func;
728 struct brw_reg tmp = get_tmp(c);
729
730 /* tmp = dot(arg0, arg0) */
731 if (num_comps == 3)
732 brw_DP3(p, tmp, arg0, arg0);
733 else
734 brw_DP4(p, tmp, arg0, arg0);
735
736 /* tmp = 1 / sqrt(tmp) */
737 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
738
739 /* dst = arg0 * tmp */
740 brw_MUL(p, dst, arg0, tmp);
741
742 release_tmp(c, tmp);
743 }
744
745
/**
 * Fetch a program constant from the constant buffer into the staging
 * register reserved for source argument argIndex, emitting the dataport
 * read only when the cached index doesn't already match (or when the
 * access is relative-addressed and so can never be cached).
 *
 * For relative addressing two owords are fetched — the second through
 * the upper half of the address register — and merged so the register
 * holds two potentially different constants; otherwise the single oword
 * is replicated to give XYZWXYZW for both vertices.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;
   const GLboolean relAddr = src->RelAddr;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index || relAddr) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * src->Index,         /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
815
816
817
818 /* TODO: relative addressing!
819 */
820 static struct brw_reg get_reg( struct brw_vs_compile *c,
821 gl_register_file file,
822 GLuint index )
823 {
824 switch (file) {
825 case PROGRAM_TEMPORARY:
826 case PROGRAM_INPUT:
827 case PROGRAM_OUTPUT:
828 assert(c->regs[file][index].nr != 0);
829 return c->regs[file][index];
830 case PROGRAM_STATE_VAR:
831 case PROGRAM_CONSTANT:
832 case PROGRAM_UNIFORM:
833 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
834 return c->regs[PROGRAM_STATE_VAR][index];
835 case PROGRAM_ADDRESS:
836 assert(index == 0);
837 return c->regs[file][index];
838
839 case PROGRAM_UNDEFINED: /* undef values */
840 return brw_null_reg();
841
842 case PROGRAM_LOCAL_PARAM:
843 case PROGRAM_ENV_PARAM:
844 case PROGRAM_WRITE_ONLY:
845 default:
846 assert(0);
847 return brw_null_reg();
848 }
849 }
850
851
/**
 * Indirect addressing: get reg[[arg] + offset].
 * The register-file byte offset is computed from arg's GRF location
 * (32 bytes per GRF, 16 bytes per vec4 slot for 'offset'), then the
 * hardware address register a0 is loaded with the vertex-program
 * address-reg value plus that offset and dereferenced twice — once per
 * vertex half of the SIMD8 pair.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      /* second vertex: the address value lives 8 words further in */
      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
886
887
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * Handles relative addressing for temporaries/inputs/outputs and for
 * GRF-resident parameters; constant-buffer parameters go through
 * get_constant().
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* base of the file's register block + indirect index */
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         /* constants live in a real buffer — fetch via dataport */
         return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
942
943
944 static void emit_arl( struct brw_vs_compile *c,
945 struct brw_reg dst,
946 struct brw_reg arg0 )
947 {
948 struct brw_compile *p = &c->func;
949 struct brw_reg tmp = dst;
950 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
951
952 if (need_tmp)
953 tmp = get_tmp(c);
954
955 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
956 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
957
958 if (need_tmp)
959 release_tmp(c, tmp);
960 }
961
962
963 /**
964 * Return the brw reg for the given instruction's src argument.
965 * Will return mangled results for SWZ op. The emit_swz() function
966 * ignores this result and recalculates taking extended swizzles into
967 * account.
968 */
969 static struct brw_reg get_arg( struct brw_vs_compile *c,
970 const struct prog_instruction *inst,
971 GLuint argIndex )
972 {
973 const struct prog_src_register *src = &inst->SrcReg[argIndex];
974 struct brw_reg reg;
975
976 if (src->File == PROGRAM_UNDEFINED)
977 return brw_null_reg();
978
979 reg = get_src_reg(c, inst, argIndex);
980
981 /* Convert 3-bit swizzle to 2-bit.
982 */
983 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
984 GET_SWZ(src->Swizzle, 1),
985 GET_SWZ(src->Swizzle, 2),
986 GET_SWZ(src->Swizzle, 3));
987
988 /* Note this is ok for non-swizzle instructions:
989 */
990 reg.negate = src->Negate ? 1 : 0;
991
992 return reg;
993 }
994
995
996 /**
997 * Get brw register for the given program dest register.
998 */
999 static struct brw_reg get_dst( struct brw_vs_compile *c,
1000 struct prog_dst_register dst )
1001 {
1002 struct brw_reg reg;
1003
1004 switch (dst.File) {
1005 case PROGRAM_TEMPORARY:
1006 case PROGRAM_OUTPUT:
1007 assert(c->regs[dst.File][dst.Index].nr != 0);
1008 reg = c->regs[dst.File][dst.Index];
1009 break;
1010 case PROGRAM_ADDRESS:
1011 assert(dst.Index == 0);
1012 reg = c->regs[dst.File][dst.Index];
1013 break;
1014 case PROGRAM_UNDEFINED:
1015 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1016 reg = brw_null_reg();
1017 break;
1018 default:
1019 assert(0);
1020 reg = brw_null_reg();
1021 }
1022
1023 reg.dw1.bits.writemask = dst.WriteMask;
1024
1025 return reg;
1026 }
1027
1028
/**
 * SWZ: extended swizzle.  Partitions the destination channels by what
 * their source selector asks for — a real source component, the
 * constant 0, or the constant 1 — and emits one masked MOV per
 * partition, plus a final negation pass over the channels flagged in
 * src.Negate (a per-channel mask for this opcode).
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* the negate MOV reads dst, which only works from a GRF */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* classify each enabled channel by its extended-swizzle selector */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is used directly as a channel writemask here */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1097
1098
1099 /**
1100 * Post-vertex-program processing. Send the results to the URB.
1101 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   /* Number of message regs of vertex header emitted below: 2 on original
    * gen4, 6 on IGDNG.  (Name kept as-is; "vertext" is a pre-existing typo.)
    */
   GLuint len_vertext_header = 2;

   /* Copy the edge-flag vertex attribute straight through to the
    * edge-flag output when the compile key requests it.
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   /* Build ndc coords */
   ndc = get_tmp(c);
   /* ndc = 1.0 / pos.w */
   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
   /* ndc.xyz = pos * ndc */
   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip || BRW_IS_965(p->brw))
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 /* Point size goes into header1.w as an 11-bit fixed-point field:
	  * scale by 1<<11, then mask to bits 8..18 of the dword.
	  */
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One clip flag bit per user clip plane: DP4 against the plane sets
       * the flag register (conditional mod L), and the OR merging bit i is
       * predicated on that result before predication is reset.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (BRW_IS_965(p->brw)) {
	 /* CMP sets the flag register for ndc.w < 0; the following OR and
	  * MOV are applied under that predicate, then predication is reset.
	  */
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size / user clip / workaround needed: header dword is 0. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);

   if (BRW_IS_IGDNG(p->brw)) {
       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
       brw_MOV(p, offset(m0, 3), pos);  /* a portion of vertex header */
       /* m4, m5 contain the distances from vertex to the user clip planeXXX.
        * Seems it is useless for us.
        * m6 is used for aligning, so that the remainder of vertex element is
        * reg-aligned.
        */
      brw_MOV(p, offset(m0, 7), pos);  /* the remainder of vertex element */
      len_vertext_header = 6;
   } else {
      brw_MOV(p, offset(m0, 3), pos);
      len_vertext_header = 2;
   }

   /* Terminate the thread with this write only if everything fits in one
    * message; otherwise the overflow write below carries the EOT.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		/* starting mrf reg nr */
		 c->r0,		/* src */
		 0,		/* allocate */
		 1,		/* used */
		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
		 0,		/* response len */
		 eot, 		/* eot */
		 eot, 		/* writes complete */
		 0, 		/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & (1 << i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1248
1249
1250 /**
1251 * Called after code generation to resolve subroutine calls and the
1252 * END instruction.
1253 * \param end_inst points to brw code for END instruction
1254 * \param last_inst points to last instruction emitted before vertex write
1255 */
1256 static void
1257 post_vs_emit( struct brw_vs_compile *c,
1258 struct brw_instruction *end_inst,
1259 struct brw_instruction *last_inst )
1260 {
1261 GLint offset;
1262
1263 brw_resolve_cals(&c->func);
1264
1265 /* patch up the END code to jump past subroutines, etc */
1266 offset = last_inst - end_inst;
1267 if (offset > 1) {
1268 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1269 } else {
1270 end_inst->header.opcode = BRW_OPCODE_NOP;
1271 }
1272 }
1273
1274 static GLboolean
1275 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1276 {
1277 struct brw_compile *p = &c->func;
1278 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1279
1280 if (p->nr_insn == 0)
1281 return GL_FALSE;
1282
1283 if (val.address_mode != BRW_ADDRESS_DIRECT)
1284 return GL_FALSE;
1285
1286 switch (prev_insn->header.opcode) {
1287 case BRW_OPCODE_MOV:
1288 case BRW_OPCODE_MAC:
1289 case BRW_OPCODE_MUL:
1290 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1291 prev_insn->header.execution_size == val.width &&
1292 prev_insn->bits1.da1.dest_reg_file == val.file &&
1293 prev_insn->bits1.da1.dest_reg_type == val.type &&
1294 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1295 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1296 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1297 prev_insn->bits1.da16.dest_writemask == 0xf)
1298 return GL_TRUE;
1299 else
1300 return GL_FALSE;
1301 default:
1302 return GL_FALSE;
1303 }
1304 }
1305
1306 static uint32_t
1307 get_predicate(const struct prog_instruction *inst)
1308 {
1309 if (inst->DstReg.CondMask == COND_TR)
1310 return BRW_PREDICATE_NONE;
1311
1312 /* All of GLSL only produces predicates for COND_NE and one channel per
1313 * vector. Fail badly if someone starts doing something else, as it might
1314 * mean infinite looping or something.
1315 *
1316 * We'd like to support all the condition codes, but our hardware doesn't
1317 * quite match the Mesa IR, which is modeled after the NV extensions. For
1318 * those, the instruction may update the condition codes or not, then any
1319 * later instruction may use one of those condition codes. For gen4, the
1320 * instruction may update the flags register based on one of the condition
1321 * codes output by the instruction, and then further instructions may
1322 * predicate on that. We can probably support this, but it won't
1323 * necessarily be easy.
1324 */
1325 assert(inst->DstReg.CondMask == COND_NE);
1326
1327 switch (inst->DstReg.CondSwizzle) {
1328 case SWIZZLE_XXXX:
1329 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1330 case SWIZZLE_YYYY:
1331 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1332 case SWIZZLE_ZZZZ:
1333 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1334 case SWIZZLE_WWWW:
1335 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1336 default:
1337 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1338 inst->DstReg.CondMask);
1339 return BRW_PREDICATE_NORMAL;
1340 }
1341 }
1342
1343 /* Emit the vertex program instructions here.
1344 */
1345 void brw_vs_emit(struct brw_vs_compile *c )
1346 {
1347 #define MAX_IF_DEPTH 32
1348 #define MAX_LOOP_DEPTH 32
1349 struct brw_compile *p = &c->func;
1350 struct brw_context *brw = p->brw;
1351 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1352 GLuint insn, if_depth = 0, loop_depth = 0;
1353 GLuint end_offset = 0;
1354 struct brw_instruction *end_inst, *last_inst;
1355 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1356 const struct brw_indirect stack_index = brw_indirect(0, 0);
1357 GLuint index;
1358 GLuint file;
1359
1360 if (INTEL_DEBUG & DEBUG_VS) {
1361 _mesa_printf("vs-mesa:\n");
1362 _mesa_print_program(&c->vp->program.Base);
1363 _mesa_printf("\n");
1364 }
1365
1366 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1367 brw_set_access_mode(p, BRW_ALIGN_16);
1368
1369 /* Message registers can't be read, so copy the output into GRF register
1370 if they are used in source registers */
1371 for (insn = 0; insn < nr_insns; insn++) {
1372 GLuint i;
1373 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1374 for (i = 0; i < 3; i++) {
1375 struct prog_src_register *src = &inst->SrcReg[i];
1376 GLuint index = src->Index;
1377 GLuint file = src->File;
1378 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1379 c->output_regs[index].used_in_src = GL_TRUE;
1380 }
1381 }
1382
1383 /* Static register allocation
1384 */
1385 brw_vs_alloc_regs(c);
1386 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1387
1388 for (insn = 0; insn < nr_insns; insn++) {
1389
1390 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1391 struct brw_reg args[3], dst;
1392 GLuint i;
1393
1394 #if 0
1395 printf("%d: ", insn);
1396 _mesa_print_instruction(inst);
1397 #endif
1398
1399 /* Get argument regs. SWZ is special and does this itself.
1400 */
1401 if (inst->Opcode != OPCODE_SWZ)
1402 for (i = 0; i < 3; i++) {
1403 const struct prog_src_register *src = &inst->SrcReg[i];
1404 index = src->Index;
1405 file = src->File;
1406 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1407 args[i] = c->output_regs[index].reg;
1408 else
1409 args[i] = get_arg(c, inst, i);
1410 }
1411
1412 /* Get dest regs. Note that it is possible for a reg to be both
1413 * dst and arg, given the static allocation of registers. So
1414 * care needs to be taken emitting multi-operation instructions.
1415 */
1416 index = inst->DstReg.Index;
1417 file = inst->DstReg.File;
1418 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1419 dst = c->output_regs[index].reg;
1420 else
1421 dst = get_dst(c, inst->DstReg);
1422
1423 if (inst->SaturateMode != SATURATE_OFF) {
1424 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1425 inst->SaturateMode);
1426 }
1427
1428 switch (inst->Opcode) {
1429 case OPCODE_ABS:
1430 brw_MOV(p, dst, brw_abs(args[0]));
1431 break;
1432 case OPCODE_ADD:
1433 brw_ADD(p, dst, args[0], args[1]);
1434 break;
1435 case OPCODE_COS:
1436 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1437 break;
1438 case OPCODE_DP3:
1439 brw_DP3(p, dst, args[0], args[1]);
1440 break;
1441 case OPCODE_DP4:
1442 brw_DP4(p, dst, args[0], args[1]);
1443 break;
1444 case OPCODE_DPH:
1445 brw_DPH(p, dst, args[0], args[1]);
1446 break;
1447 case OPCODE_NRM3:
1448 emit_nrm(c, dst, args[0], 3);
1449 break;
1450 case OPCODE_NRM4:
1451 emit_nrm(c, dst, args[0], 4);
1452 break;
1453 case OPCODE_DST:
1454 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1455 break;
1456 case OPCODE_EXP:
1457 unalias1(c, dst, args[0], emit_exp_noalias);
1458 break;
1459 case OPCODE_EX2:
1460 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1461 break;
1462 case OPCODE_ARL:
1463 emit_arl(c, dst, args[0]);
1464 break;
1465 case OPCODE_FLR:
1466 brw_RNDD(p, dst, args[0]);
1467 break;
1468 case OPCODE_FRC:
1469 brw_FRC(p, dst, args[0]);
1470 break;
1471 case OPCODE_LOG:
1472 unalias1(c, dst, args[0], emit_log_noalias);
1473 break;
1474 case OPCODE_LG2:
1475 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1476 break;
1477 case OPCODE_LIT:
1478 unalias1(c, dst, args[0], emit_lit_noalias);
1479 break;
1480 case OPCODE_LRP:
1481 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1482 break;
1483 case OPCODE_MAD:
1484 if (!accumulator_contains(c, args[2]))
1485 brw_MOV(p, brw_acc_reg(), args[2]);
1486 brw_MAC(p, dst, args[0], args[1]);
1487 break;
1488 case OPCODE_MAX:
1489 emit_max(p, dst, args[0], args[1]);
1490 break;
1491 case OPCODE_MIN:
1492 emit_min(p, dst, args[0], args[1]);
1493 break;
1494 case OPCODE_MOV:
1495 brw_MOV(p, dst, args[0]);
1496 break;
1497 case OPCODE_MUL:
1498 brw_MUL(p, dst, args[0], args[1]);
1499 break;
1500 case OPCODE_POW:
1501 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1502 break;
1503 case OPCODE_RCP:
1504 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1505 break;
1506 case OPCODE_RSQ:
1507 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1508 break;
1509
1510 case OPCODE_SEQ:
1511 unalias2(c, dst, args[0], args[1], emit_seq);
1512 break;
1513 case OPCODE_SIN:
1514 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1515 break;
1516 case OPCODE_SNE:
1517 unalias2(c, dst, args[0], args[1], emit_sne);
1518 break;
1519 case OPCODE_SGE:
1520 unalias2(c, dst, args[0], args[1], emit_sge);
1521 break;
1522 case OPCODE_SGT:
1523 unalias2(c, dst, args[0], args[1], emit_sgt);
1524 break;
1525 case OPCODE_SLT:
1526 unalias2(c, dst, args[0], args[1], emit_slt);
1527 break;
1528 case OPCODE_SLE:
1529 unalias2(c, dst, args[0], args[1], emit_sle);
1530 break;
1531 case OPCODE_SUB:
1532 brw_ADD(p, dst, args[0], negate(args[1]));
1533 break;
1534 case OPCODE_SWZ:
1535 /* The args[0] value can't be used here as it won't have
1536 * correctly encoded the full swizzle:
1537 */
1538 emit_swz(c, dst, inst);
1539 break;
1540 case OPCODE_TRUNC:
1541 /* round toward zero */
1542 brw_RNDZ(p, dst, args[0]);
1543 break;
1544 case OPCODE_XPD:
1545 emit_xpd(p, dst, args[0], args[1]);
1546 break;
1547 case OPCODE_IF:
1548 assert(if_depth < MAX_IF_DEPTH);
1549 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1550 /* Note that brw_IF smashes the predicate_control field. */
1551 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1552 if_depth++;
1553 break;
1554 case OPCODE_ELSE:
1555 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1556 break;
1557 case OPCODE_ENDIF:
1558 assert(if_depth > 0);
1559 brw_ENDIF(p, if_inst[--if_depth]);
1560 break;
1561 case OPCODE_BGNLOOP:
1562 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1563 break;
1564 case OPCODE_BRK:
1565 brw_set_predicate_control(p, get_predicate(inst));
1566 brw_BREAK(p);
1567 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1568 break;
1569 case OPCODE_CONT:
1570 brw_set_predicate_control(p, get_predicate(inst));
1571 brw_CONT(p);
1572 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1573 break;
1574 case OPCODE_ENDLOOP:
1575 {
1576 struct brw_instruction *inst0, *inst1;
1577 GLuint br = 1;
1578
1579 loop_depth--;
1580
1581 if (BRW_IS_IGDNG(brw))
1582 br = 2;
1583
1584 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1585 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1586 while (inst0 > loop_inst[loop_depth]) {
1587 inst0--;
1588 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1589 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1590 inst0->bits3.if_else.pop_count = 0;
1591 }
1592 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1593 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1594 inst0->bits3.if_else.pop_count = 0;
1595 }
1596 }
1597 }
1598 break;
1599 case OPCODE_BRA:
1600 brw_set_predicate_control(p, get_predicate(inst));
1601 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1602 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1603 break;
1604 case OPCODE_CAL:
1605 brw_set_access_mode(p, BRW_ALIGN_1);
1606 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1607 brw_set_access_mode(p, BRW_ALIGN_16);
1608 brw_ADD(p, get_addr_reg(stack_index),
1609 get_addr_reg(stack_index), brw_imm_d(4));
1610 brw_save_call(p, inst->Comment, p->nr_insn);
1611 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1612 break;
1613 case OPCODE_RET:
1614 brw_ADD(p, get_addr_reg(stack_index),
1615 get_addr_reg(stack_index), brw_imm_d(-4));
1616 brw_set_access_mode(p, BRW_ALIGN_1);
1617 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1618 brw_set_access_mode(p, BRW_ALIGN_16);
1619 break;
1620 case OPCODE_END:
1621 end_offset = p->nr_insn;
1622 /* this instruction will get patched later to jump past subroutine
1623 * code, etc.
1624 */
1625 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1626 break;
1627 case OPCODE_PRINT:
1628 /* no-op */
1629 break;
1630 case OPCODE_BGNSUB:
1631 brw_save_label(p, inst->Comment, p->nr_insn);
1632 break;
1633 case OPCODE_ENDSUB:
1634 /* no-op */
1635 break;
1636 default:
1637 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1638 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1639 _mesa_opcode_string(inst->Opcode) :
1640 "unknown");
1641 }
1642
1643 /* Set the predication update on the last instruction of the native
1644 * instruction sequence.
1645 *
1646 * This would be problematic if it was set on a math instruction,
1647 * but that shouldn't be the case with the current GLSL compiler.
1648 */
1649 if (inst->CondUpdate) {
1650 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1651
1652 assert(hw_insn->header.destreg__conditionalmod == 0);
1653 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1654 }
1655
1656 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1657 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1658 && c->output_regs[inst->DstReg.Index].used_in_src) {
1659 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1660 }
1661
1662 /* Result color clamping.
1663 *
1664 * When destination register is an output register and
1665 * it's primary/secondary front/back color, we have to clamp
1666 * the result to [0,1]. This is done by enabling the
1667 * saturation bit for the last instruction.
1668 *
1669 * We don't use brw_set_saturate() as it modifies
1670 * p->current->header.saturate, which affects all the subsequent
1671 * instructions. Instead, we directly modify the header
1672 * of the last (already stored) instruction.
1673 */
1674 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1675 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1676 || (inst->DstReg.Index == VERT_RESULT_COL1)
1677 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1678 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1679 p->store[p->nr_insn-1].header.saturate = 1;
1680 }
1681 }
1682
1683 release_tmps(c);
1684 }
1685
1686 end_inst = &p->store[end_offset];
1687 last_inst = &p->store[p->nr_insn];
1688
1689 /* The END instruction will be patched to jump to this code */
1690 emit_vertex_write(c);
1691
1692 post_vs_emit(c, end_inst, last_inst);
1693
1694 if (INTEL_DEBUG & DEBUG_VS) {
1695 int i;
1696
1697 _mesa_printf("vs-native:\n");
1698 for (i = 0; i < p->nr_insn; i++)
1699 brw_disasm(stderr, &p->store[i]);
1700 _mesa_printf("\n");
1701 }
1702 }