1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
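/* Illustrative sketch (not part of the driver): the three helpers above
 * implement a tiny stack-style scratch allocator.  The struct and the
 * model_* names below are invented for illustration; only the logic
 * mirrors get_tmp()/release_tmp()/release_tmps().
 */
#if 0
struct tmp_alloc {
   int first_tmp;    /* first GRF reserved for per-instruction temps */
   int last_tmp;     /* next free temp (bump pointer)                */
   int total_grf;    /* high-water mark of GRF usage                 */
};

static int model_get_tmp(struct tmp_alloc *a)
{
   int reg = a->last_tmp;
   if (++a->last_tmp > a->total_grf)
      a->total_grf = a->last_tmp;      /* grow the high-water mark */
   return reg;
}

static void model_release_tmp(struct tmp_alloc *a, int reg)
{
   if (reg == a->last_tmp - 1)         /* only the most recent temp pops */
      a->last_tmp--;
}

static void model_release_tmps(struct tmp_alloc *a)
{
   a->last_tmp = a->first_tmp;         /* wholesale reset, done per instruction */
}
#endif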
61
62
63 /**
64 * Preallocate GRF registers before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
66 * ahead of time.
67 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
69 {
70 GLuint i, reg = 0, mrf;
71
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The latter is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
76 */
77 if (c->vp->program.Base.Parameters->NumParameters +
78 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
79 c->vp->use_const_buffer = GL_TRUE;
80 else
81 c->vp->use_const_buffer = GL_FALSE;
82
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
84
85 /* r0 -- reserved as usual
86 */
87 c->r0 = brw_vec8_grf(reg, 0);
88 reg++;
89
90 /* User clip planes from curbe:
91 */
92 if (c->key.nr_userclip) {
93 for (i = 0; i < c->key.nr_userclip; i++) {
94 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
95 }
96
97 /* Deal with curbe alignment:
98 */
99 reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
100 }
101
102 /* Vertex program parameters from curbe:
103 */
104 if (c->vp->use_const_buffer) {
105 /* get constants from a real constant buffer */
106 c->prog_data.curb_read_length = 0;
107 c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
108 }
109 else {
110 /* use a section of the GRF for constants */
111 GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
112 for (i = 0; i < nr_params; i++) {
113 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
114 }
115 reg += (nr_params + 1) / 2;
116 c->prog_data.curb_read_length = reg - 1;
117
118 c->prog_data.nr_params = nr_params * 4;
119 }
120
121 /* Allocate input regs:
122 */
123 c->nr_inputs = 0;
124 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
125 if (c->prog_data.inputs_read & (1 << i)) {
126 c->nr_inputs++;
127 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
128 reg++;
129 }
130 }
131
132 /* Allocate outputs. The non-position outputs go straight into message regs.
133 */
134 c->nr_outputs = 0;
135 c->first_output = reg;
136 c->first_overflow_output = 0;
137 mrf = 4;
138 for (i = 0; i < VERT_RESULT_MAX; i++) {
139 if (c->prog_data.outputs_written & (1 << i)) {
140 c->nr_outputs++;
141 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
142 if (i == VERT_RESULT_HPOS) {
143 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
144 reg++;
145 }
146 else if (i == VERT_RESULT_PSIZ) {
147 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
148 reg++;
149 mrf++; /* just a placeholder? XXX fix later stages & remove this */
150 }
151 else {
152 if (mrf < 16) {
153 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
154 mrf++;
155 }
156 else {
157 /* too many vertex results to fit in MRF, use GRF for overflow */
158 if (!c->first_overflow_output)
159 c->first_overflow_output = i;
160 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
161 reg++;
162 }
163 }
164 }
165 }
166
167 /* Allocate program temporaries:
168 */
169 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
170 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
171 reg++;
172 }
173
174 /* Address reg(s). Don't try to use the internal address reg until
175 * deref time.
176 */
177 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
178 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
179 reg,
180 0,
181 BRW_REGISTER_TYPE_D,
182 BRW_VERTICAL_STRIDE_8,
183 BRW_WIDTH_8,
184 BRW_HORIZONTAL_STRIDE_1,
185 BRW_SWIZZLE_XXXX,
186 WRITEMASK_X);
187 reg++;
188 }
189
190 if (c->vp->use_const_buffer) {
191 for (i = 0; i < 3; i++) {
192 c->current_const[i].index = -1;
193 c->current_const[i].reg = brw_vec8_grf(reg, 0);
194 reg++;
195 }
196 }
197
198 for (i = 0; i < 128; i++) {
199 if (c->output_regs[i].used_in_src) {
200 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
201 reg++;
202 }
203 }
204
205 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
206 reg += 2;
207
208 /* Some opcodes need an internal temporary:
209 */
210 c->first_tmp = reg;
211 c->last_tmp = reg; /* for allocation purposes */
212
213 /* Each input reg holds data from two vertices. The
214 * urb_read_length is the number of registers read from *each*
215 * vertex urb, so is half the amount:
216 */
217 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
218
219 c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
220 c->prog_data.total_grf = reg;
221
222 if (INTEL_DEBUG & DEBUG_VS) {
223 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
224 _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
225 _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
226 }
227 }
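/* Worked example of the allocation arithmetic above, with assumed values
 * (two user clip planes, five enabled vertex inputs) chosen purely for
 * illustration:
 *
 *   clip planes:     reg += ((6 + 2 + 3) / 4) * 2 = 4, keeping the curbe
 *                    allocation aligned to a pair of registers;
 *   urb_read_length: (5 + 1) / 2 = 3, since each GRF carries the same
 *                    attribute for two vertices, so the per-vertex read
 *                    length is half the register count, rounded up.
 */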
228
229
230 /**
231 * If an instruction uses a temp reg both as a src and the dest, we
232 * sometimes need to allocate an intermediate temporary.
233 */
234 static void unalias1( struct brw_vs_compile *c,
235 struct brw_reg dst,
236 struct brw_reg arg0,
237 void (*func)( struct brw_vs_compile *,
238 struct brw_reg,
239 struct brw_reg ))
240 {
241 if (dst.file == arg0.file && dst.nr == arg0.nr) {
242 struct brw_compile *p = &c->func;
243 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
244 func(c, tmp, arg0);
245 brw_MOV(p, dst, tmp);
246 release_tmp(c, tmp);
247 }
248 else {
249 func(c, dst, arg0);
250 }
251 }
252
253 /**
254 * \sa unalias1
255 * Checks if a 2-operand instruction needs an intermediate temporary.
256 */
257 static void unalias2( struct brw_vs_compile *c,
258 struct brw_reg dst,
259 struct brw_reg arg0,
260 struct brw_reg arg1,
261 void (*func)( struct brw_vs_compile *,
262 struct brw_reg,
263 struct brw_reg,
264 struct brw_reg ))
265 {
266 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
267 (dst.file == arg1.file && dst.nr == arg1.nr)) {
268 struct brw_compile *p = &c->func;
269 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
270 func(c, tmp, arg0, arg1);
271 brw_MOV(p, dst, tmp);
272 release_tmp(c, tmp);
273 }
274 else {
275 func(c, dst, arg0, arg1);
276 }
277 }
278
279 /**
280 * \sa unalias2
281 * Checks if a 3-operand instruction needs an intermediate temporary.
282 */
283 static void unalias3( struct brw_vs_compile *c,
284 struct brw_reg dst,
285 struct brw_reg arg0,
286 struct brw_reg arg1,
287 struct brw_reg arg2,
288 void (*func)( struct brw_vs_compile *,
289 struct brw_reg,
290 struct brw_reg,
291 struct brw_reg,
292 struct brw_reg ))
293 {
294 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
295 (dst.file == arg1.file && dst.nr == arg1.nr) ||
296 (dst.file == arg2.file && dst.nr == arg2.nr)) {
297 struct brw_compile *p = &c->func;
298 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
299 func(c, tmp, arg0, arg1, arg2);
300 brw_MOV(p, dst, tmp);
301 release_tmp(c, tmp);
302 }
303 else {
304 func(c, dst, arg0, arg1, arg2);
305 }
306 }
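/* Illustrative sketch (not part of the driver) of the hazard the unalias
 * helpers avoid: a virtual opcode that expands to several hardware
 * instructions can clobber a source channel before a later instruction
 * reads it when dst aliases that source (e.g. "r0 = DST r0.xxxx r1").
 * The model_* functions are invented for illustration only.
 */
#if 0
#include <string.h>

static void model_two_step_op(float dst[4], const float a[4], const float b[4])
{
   dst[0] = 1.0f;              /* if dst aliases a, a[0] is overwritten here    */
   dst[1] = a[0] * b[1];       /* ... so this reads 1.0 instead of the old a[0] */
}

static void model_unaliased(float dst[4], const float a[4], const float b[4])
{
   float tmp[4];
   model_two_step_op(tmp, a, b);   /* emit into a temporary first           */
   memcpy(dst, tmp, sizeof tmp);   /* then one MOV back, as unalias*() does */
}
#endif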
307
308 static void emit_sop( struct brw_compile *p,
309 struct brw_reg dst,
310 struct brw_reg arg0,
311 struct brw_reg arg1,
312 GLuint cond)
313 {
314 brw_MOV(p, dst, brw_imm_f(0.0f));
315 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
316 brw_MOV(p, dst, brw_imm_f(1.0f));
317 brw_set_predicate_control_flag_value(p, 0xff);
318 }
319
320 static void emit_seq( struct brw_compile *p,
321 struct brw_reg dst,
322 struct brw_reg arg0,
323 struct brw_reg arg1 )
324 {
325 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
326 }
327
328 static void emit_sne( struct brw_compile *p,
329 struct brw_reg dst,
330 struct brw_reg arg0,
331 struct brw_reg arg1 )
332 {
333 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
334 }
335 static void emit_slt( struct brw_compile *p,
336 struct brw_reg dst,
337 struct brw_reg arg0,
338 struct brw_reg arg1 )
339 {
340 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
341 }
342
343 static void emit_sle( struct brw_compile *p,
344 struct brw_reg dst,
345 struct brw_reg arg0,
346 struct brw_reg arg1 )
347 {
348 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
349 }
350
351 static void emit_sgt( struct brw_compile *p,
352 struct brw_reg dst,
353 struct brw_reg arg0,
354 struct brw_reg arg1 )
355 {
356 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
357 }
358
359 static void emit_sge( struct brw_compile *p,
360 struct brw_reg dst,
361 struct brw_reg arg0,
362 struct brw_reg arg1 )
363 {
364 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
365 }
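/* Per-channel reference for the emit_sop() expansion used by SEQ..SGE
 * above (scalar sketch, illustration only; model_slt is an invented
 * name): the first MOV writes 0.0 unconditionally, the CMP loads the
 * flag register, and the second MOV is predicated so 1.0 lands only in
 * channels where the comparison held.
 */
#if 0
static float model_slt(float a, float b)
{
   float dst = 0.0f;           /* brw_MOV(dst, 0.0)                      */
   int flag = (a < b);         /* brw_CMP(null, BRW_CONDITIONAL_L, a, b) */
   if (flag)
      dst = 1.0f;              /* predicated brw_MOV(dst, 1.0)           */
   return dst;
}
#endif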
366
367 static void emit_max( struct brw_compile *p,
368 struct brw_reg dst,
369 struct brw_reg arg0,
370 struct brw_reg arg1 )
371 {
372 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
373 brw_SEL(p, dst, arg1, arg0);
374 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
375 }
376
377 static void emit_min( struct brw_compile *p,
378 struct brw_reg dst,
379 struct brw_reg arg0,
380 struct brw_reg arg1 )
381 {
382 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
383 brw_SEL(p, dst, arg0, arg1);
384 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
385 }
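/* Scalar model of emit_min()/emit_max() above (illustration only): the
 * CMP sets the flag on arg0 < arg1 and the predicated SEL then picks an
 * operand; emit_max() simply swaps the SEL arguments.
 */
#if 0
static float model_min(float a, float b) { return (a < b) ? a : b; }
static float model_max(float a, float b) { return (a < b) ? b : a; }
#endif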
386
387
388 static void emit_math1( struct brw_vs_compile *c,
389 GLuint function,
390 struct brw_reg dst,
391 struct brw_reg arg0,
392 GLuint precision)
393 {
394 /* There are various odd behaviours with SEND on the simulator. In
395 * addition there are documented issues with the fact that the GEN4
396 * processor doesn't do dependency control properly on SEND
397 * results. So, on balance, this kludge to get around failures
398 * with writemasked math results looks like it might be necessary
399 * whether that turns out to be a simulator bug or not:
400 */
401 struct brw_compile *p = &c->func;
402 struct brw_reg tmp = dst;
403 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
404 dst.file != BRW_GENERAL_REGISTER_FILE);
405
406 if (need_tmp)
407 tmp = get_tmp(c);
408
409 brw_math(p,
410 tmp,
411 function,
412 BRW_MATH_SATURATE_NONE,
413 2,
414 arg0,
415 BRW_MATH_DATA_SCALAR,
416 precision);
417
418 if (need_tmp) {
419 brw_MOV(p, dst, tmp);
420 release_tmp(c, tmp);
421 }
422 }
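/* The workaround pattern used by emit_math1()/emit_math2() above, in
 * miniature (illustration only; model_masked_copy is an invented name):
 * when the destination is writemasked or not a GRF, the math result
 * lands in a scratch GRF first and a plain MOV then applies the
 * writemask, which the SEND result path does not honour reliably.
 */
#if 0
static void model_masked_copy(float dst[4], unsigned writemask,
                              const float result[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      if (writemask & (1u << i))    /* the trailing MOV applies the mask */
         dst[i] = result[i];
}
#endif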
423
424
425 static void emit_math2( struct brw_vs_compile *c,
426 GLuint function,
427 struct brw_reg dst,
428 struct brw_reg arg0,
429 struct brw_reg arg1,
430 GLuint precision)
431 {
432 struct brw_compile *p = &c->func;
433 struct brw_reg tmp = dst;
434 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
435 dst.file != BRW_GENERAL_REGISTER_FILE);
436
437 if (need_tmp)
438 tmp = get_tmp(c);
439
440 brw_MOV(p, brw_message_reg(3), arg1);
441
442 brw_math(p,
443 tmp,
444 function,
445 BRW_MATH_SATURATE_NONE,
446 2,
447 arg0,
448 BRW_MATH_DATA_SCALAR,
449 precision);
450
451 if (need_tmp) {
452 brw_MOV(p, dst, tmp);
453 release_tmp(c, tmp);
454 }
455 }
456
457
458 static void emit_exp_noalias( struct brw_vs_compile *c,
459 struct brw_reg dst,
460 struct brw_reg arg0 )
461 {
462 struct brw_compile *p = &c->func;
463
464
465 if (dst.dw1.bits.writemask & WRITEMASK_X) {
466 struct brw_reg tmp = get_tmp(c);
467 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
468
469 /* tmp_d = floor(arg0.x) */
470 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
471
472 /* result[0] = 2.0 ^ tmp */
473
474 /* Adjust exponent for floating point:
475 * exp += 127
476 */
477 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
478
479 /* Install exponent and sign.
480 * Excess drops off the edge:
481 */
482 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
483 tmp_d, brw_imm_d(23));
484
485 release_tmp(c, tmp);
486 }
487
488 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
489 /* result[1] = arg0.x - floor(arg0.x) */
490 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
491 }
492
493 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
494 /* As with the LOG instruction, we might be better off just
495 * doing a Taylor expansion here, seeing as we have to do all
496 * the prep work.
497 *
498 * If mathbox partial precision is too low, consider also:
499 * result[3] = result[0] * EXP(result[1])
500 */
501 emit_math1(c,
502 BRW_MATH_FUNCTION_EXP,
503 brw_writemask(dst, WRITEMASK_Z),
504 brw_swizzle1(arg0, 0),
505 BRW_MATH_PRECISION_FULL);
506 }
507
508 if (dst.dw1.bits.writemask & WRITEMASK_W) {
509 /* result[3] = 1.0; */
510 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
511 }
512 }
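/* Reference semantics of the EXP opcode expansion above (standalone
 * sketch, illustration only; model_opcode_exp is an invented name):
 * x is 2^floor(src.x), built by installing floor(src.x)+127 in the
 * exponent field; y is the fractional part; z is 2^src.x from the
 * mathbox; w is 1.0.
 */
#if 0
#include <math.h>

static void model_opcode_exp(float result[4], float x)
{
   union { float f; int i; } u;

   u.i = ((int) floorf(x) + 127) << 23;   /* exponent install, as the SHL does */
   result[0] = u.f;                       /* 2^floor(x) */
   result[1] = x - floorf(x);             /* frac(x)    */
   result[2] = exp2f(x);                  /* full 2^x   */
   result[3] = 1.0f;
}
#endif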
513
514
515 static void emit_log_noalias( struct brw_vs_compile *c,
516 struct brw_reg dst,
517 struct brw_reg arg0 )
518 {
519 struct brw_compile *p = &c->func;
520 struct brw_reg tmp = dst;
521 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
522 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
523 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
524 dst.file != BRW_GENERAL_REGISTER_FILE);
525
526 if (need_tmp) {
527 tmp = get_tmp(c);
528 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
529 }
530
531 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
532 * according to spec:
533 *
534 * These almost look like they could be joined up, but it's not really
535 * practical:
536 *
537 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
538 * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
539 */
540 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
541 brw_AND(p,
542 brw_writemask(tmp_ud, WRITEMASK_X),
543 brw_swizzle1(arg0_ud, 0),
544 brw_imm_ud((1U<<31)-1));
545
546 brw_SHR(p,
547 brw_writemask(tmp_ud, WRITEMASK_X),
548 tmp_ud,
549 brw_imm_ud(23));
550
551 brw_ADD(p,
552 brw_writemask(tmp, WRITEMASK_X),
553 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
554 brw_imm_d(-127));
555 }
556
557 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
558 brw_AND(p,
559 brw_writemask(tmp_ud, WRITEMASK_Y),
560 brw_swizzle1(arg0_ud, 0),
561 brw_imm_ud((1<<23)-1));
562
563 brw_OR(p,
564 brw_writemask(tmp_ud, WRITEMASK_Y),
565 tmp_ud,
566 brw_imm_ud(127<<23));
567 }
568
569 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
570 /* result[2] = result[0] + LOG2(result[1]); */
571
572 /* Why bother? The above is just a hint at how to do this with a
573 * Taylor series. Maybe we *should* use a Taylor series, as by
574 * the time all the above has been done it's almost certainly
575 * quicker than calling the mathbox, even with low precision.
576 *
577 * Options are:
578 * - result[0] + mathbox.LOG2(result[1])
579 * - mathbox.LOG2(arg0.x)
580 * - result[0] + inline_taylor_approx(result[1])
581 */
582 emit_math1(c,
583 BRW_MATH_FUNCTION_LOG,
584 brw_writemask(tmp, WRITEMASK_Z),
585 brw_swizzle1(tmp, 1),
586 BRW_MATH_PRECISION_FULL);
587
588 brw_ADD(p,
589 brw_writemask(tmp, WRITEMASK_Z),
590 brw_swizzle1(tmp, 2),
591 brw_swizzle1(tmp, 0));
592 }
593
594 if (dst.dw1.bits.writemask & WRITEMASK_W) {
595 /* result[3] = 1.0; */
596 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
597 }
598
599 if (need_tmp) {
600 brw_MOV(p, dst, tmp);
601 release_tmp(c, tmp);
602 }
603 }
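/* Reference semantics of the LOG opcode expansion above (standalone
 * sketch, illustration only; model_opcode_log is an invented name):
 * the exponent and mantissa of |src.x| are peeled apart with the same
 * masks and shifts as the AND/SHR/OR sequence, then z is rebuilt as
 * exponent + log2(mantissa).
 */
#if 0
#include <math.h>

static void model_opcode_log(float result[4], float x)
{
   union { float f; unsigned u; } in, mant;

   in.f = x;
   in.u &= (1u << 31) - 1;                            /* |x|, as the AND does   */
   result[0] = (float) ((int) (in.u >> 23) - 127);    /* unbiased exponent      */
   mant.u = (in.u & ((1u << 23) - 1)) | (127u << 23); /* mantissa in [1.0, 2.0) */
   result[1] = mant.f;
   result[2] = result[0] + log2f(result[1]);          /* LOG2 via the mathbox   */
   result[3] = 1.0f;
}
#endif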
604
605
606 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
607 */
608 static void emit_dst_noalias( struct brw_vs_compile *c,
609 struct brw_reg dst,
610 struct brw_reg arg0,
611 struct brw_reg arg1)
612 {
613 struct brw_compile *p = &c->func;
614
615 /* There must be a better way to do this:
616 */
617 if (dst.dw1.bits.writemask & WRITEMASK_X)
618 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
619 if (dst.dw1.bits.writemask & WRITEMASK_Y)
620 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
621 if (dst.dw1.bits.writemask & WRITEMASK_Z)
622 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
623 if (dst.dw1.bits.writemask & WRITEMASK_W)
624 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
625 }
626
627
628 static void emit_xpd( struct brw_compile *p,
629 struct brw_reg dst,
630 struct brw_reg t,
631 struct brw_reg u)
632 {
633 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
634 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
635 }
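/* What the two swizzled instructions above compute (illustration only;
 * model_xpd is an invented name): the MUL accumulates t.yzx * u.zxy and
 * the MAC adds -t.zxy * u.yzx, which is the standard cross product.
 */
#if 0
static void model_xpd(float dst[3], const float t[3], const float u[3])
{
   dst[0] = t[1] * u[2] - t[2] * u[1];
   dst[1] = t[2] * u[0] - t[0] * u[2];
   dst[2] = t[0] * u[1] - t[1] * u[0];
}
#endif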
636
637
638 static void emit_lit_noalias( struct brw_vs_compile *c,
639 struct brw_reg dst,
640 struct brw_reg arg0 )
641 {
642 struct brw_compile *p = &c->func;
643 struct brw_instruction *if_insn;
644 struct brw_reg tmp = dst;
645 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
646
647 if (need_tmp)
648 tmp = get_tmp(c);
649
650 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
651 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
652
653 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
654 * to get all channels active inside the IF. In the clipping code
655 * we run with NoMask, so it's not an option and we can use
656 * BRW_EXECUTE_1 for all comparisons.
657 */
658 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
659 if_insn = brw_IF(p, BRW_EXECUTE_8);
660 {
661 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
662
663 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
664 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
665 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
666
667 emit_math2(c,
668 BRW_MATH_FUNCTION_POW,
669 brw_writemask(dst, WRITEMASK_Z),
670 brw_swizzle1(tmp, 2),
671 brw_swizzle1(arg0, 3),
672 BRW_MATH_PRECISION_PARTIAL);
673 }
674
675 brw_ENDIF(p, if_insn);
676
677 release_tmp(c, tmp);
678 }
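/* Rough reference for the LIT expansion above (scalar sketch,
 * illustration only, ignoring the exponent-clamping edge cases of the
 * spec; model_opcode_lit is an invented name): src.x is the diffuse dot
 * product, src.y the specular dot product, src.w the shininess exponent.
 */
#if 0
#include <math.h>

static void model_opcode_lit(float result[4], const float src[4])
{
   result[0] = 1.0f;
   result[1] = 0.0f;
   result[2] = 0.0f;
   result[3] = 1.0f;
   if (src[0] > 0.0f) {                               /* the outer IF above    */
      result[1] = src[0];
      result[2] = powf(src[1] > 0.0f ? src[1] : 0.0f, /* clamp done by the CMP */
                       src[3]);
   }
}
#endif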
679
680 static void emit_lrp_noalias(struct brw_vs_compile *c,
681 struct brw_reg dst,
682 struct brw_reg arg0,
683 struct brw_reg arg1,
684 struct brw_reg arg2)
685 {
686 struct brw_compile *p = &c->func;
687
688 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
689 brw_MUL(p, brw_null_reg(), dst, arg2);
690 brw_MAC(p, dst, arg0, arg1);
691 }
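/* Scalar model of the LRP expansion above (illustration only; model_lrp
 * is an invented name): ADD computes 1 - a, MUL stages (1 - a) * c in
 * the accumulator, and MAC finishes with + a * b, i.e. the usual
 * a*b + (1-a)*c blend.
 */
#if 0
static float model_lrp(float a, float b, float c)
{
   float dst = 1.0f - a;       /* ADD  dst, -a, 1.0      */
   float acc = dst * c;        /* MUL  null(acc), dst, c */
   return acc + a * b;         /* MAC  dst, a, b         */
}
#endif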
692
693 /** 3 or 4-component vector normalization */
694 static void emit_nrm( struct brw_vs_compile *c,
695 struct brw_reg dst,
696 struct brw_reg arg0,
697 int num_comps)
698 {
699 struct brw_compile *p = &c->func;
700 struct brw_reg tmp = get_tmp(c);
701
702 /* tmp = dot(arg0, arg0) */
703 if (num_comps == 3)
704 brw_DP3(p, tmp, arg0, arg0);
705 else
706 brw_DP4(p, tmp, arg0, arg0);
707
708 /* tmp = 1 / sqrt(tmp) */
709 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
710
711 /* dst = arg0 * tmp */
712 brw_MUL(p, dst, arg0, tmp);
713
714 release_tmp(c, tmp);
715 }
716
717
718 static struct brw_reg
719 get_constant(struct brw_vs_compile *c,
720 const struct prog_instruction *inst,
721 GLuint argIndex)
722 {
723 const struct prog_src_register *src = &inst->SrcReg[argIndex];
724 struct brw_compile *p = &c->func;
725 struct brw_reg const_reg;
726 struct brw_reg const2_reg;
727 const GLboolean relAddr = src->RelAddr;
728
729 assert(argIndex < 3);
730
731 if (c->current_const[argIndex].index != src->Index || relAddr) {
732 struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
733
734 c->current_const[argIndex].index = src->Index;
735
736 #if 0
737 printf(" fetch const[%d] for arg %d into reg %d\n",
738 src->Index, argIndex, c->current_const[argIndex].reg.nr);
739 #endif
740 /* need to fetch the constant now */
741 brw_dp_READ_4_vs(p,
742 c->current_const[argIndex].reg,/* writeback dest */
743 0, /* oword */
744 relAddr, /* relative indexing? */
745 addrReg, /* address register */
746 16 * src->Index, /* byte offset */
747 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
748 );
749
750 if (relAddr) {
751 /* second read */
752 const2_reg = get_tmp(c);
753
754 /* use upper half of address reg for second read */
755 addrReg = stride(addrReg, 0, 4, 0);
756 addrReg.subnr = 16;
757
758 brw_dp_READ_4_vs(p,
759 const2_reg, /* writeback dest */
760 1, /* oword */
761 relAddr, /* relative indexing? */
762 addrReg, /* address register */
763 16 * src->Index, /* byte offset */
764 SURF_INDEX_VERT_CONST_BUFFER
765 );
766 }
767 }
768
769 const_reg = c->current_const[argIndex].reg;
770
771 if (relAddr) {
772 /* merge the two Owords into the constant register */
773 /* const_reg[7..4] = const2_reg[7..4] */
774 brw_MOV(p,
775 suboffset(stride(const_reg, 0, 4, 1), 4),
776 suboffset(stride(const2_reg, 0, 4, 1), 4));
777 release_tmp(c, const2_reg);
778 }
779 else {
780 /* replicate lower four floats into upper half (to get XYZWXYZW) */
781 const_reg = stride(const_reg, 0, 4, 0);
782 const_reg.subnr = 0;
783 }
784
785 return const_reg;
786 }
787
788
789
790 /* TODO: relative addressing!
791 */
792 static struct brw_reg get_reg( struct brw_vs_compile *c,
793 gl_register_file file,
794 GLuint index )
795 {
796 switch (file) {
797 case PROGRAM_TEMPORARY:
798 case PROGRAM_INPUT:
799 case PROGRAM_OUTPUT:
800 assert(c->regs[file][index].nr != 0);
801 return c->regs[file][index];
802 case PROGRAM_STATE_VAR:
803 case PROGRAM_CONSTANT:
804 case PROGRAM_UNIFORM:
805 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
806 return c->regs[PROGRAM_STATE_VAR][index];
807 case PROGRAM_ADDRESS:
808 assert(index == 0);
809 return c->regs[file][index];
810
811 case PROGRAM_UNDEFINED: /* undef values */
812 return brw_null_reg();
813
814 case PROGRAM_LOCAL_PARAM:
815 case PROGRAM_ENV_PARAM:
816 case PROGRAM_WRITE_ONLY:
817 default:
818 assert(0);
819 return brw_null_reg();
820 }
821 }
822
823
824 /**
825 * Indirect addressing: get reg[[arg] + offset].
826 */
827 static struct brw_reg deref( struct brw_vs_compile *c,
828 struct brw_reg arg,
829 GLint offset)
830 {
831 struct brw_compile *p = &c->func;
832 struct brw_reg tmp = vec4(get_tmp(c));
833 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
834 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
835 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
836 struct brw_reg indirect = brw_vec4_indirect(0,0);
837
838 {
839 brw_push_insn_state(p);
840 brw_set_access_mode(p, BRW_ALIGN_1);
841
842 /* This is pretty clunky - load the address register twice and
843 * fetch each 4-dword value in turn. There must be a way to do
844 * this in a single pass, but I couldn't get it to work.
845 */
846 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
847 brw_MOV(p, tmp, indirect);
848
849 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
850 brw_MOV(p, suboffset(tmp, 4), indirect);
851
852 brw_pop_insn_state(p);
853 }
854
855 /* NOTE: tmp not released */
856 return vec8(tmp);
857 }
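/* How deref() turns a register-file index plus relative offset into the
 * byte offset loaded into the address register (illustration only;
 * model_deref_byte_offset is an invented name): a GRF is 32 bytes and a
 * vec4 element is 16 bytes, matching the byte_offset expression above.
 */
#if 0
static unsigned model_deref_byte_offset(unsigned base_reg_nr,
                                        unsigned base_subnr,
                                        int reladdr_offset)
{
   return base_reg_nr * 32        /* 32 bytes per GRF          */
        + base_subnr              /* sub-register byte offset  */
        + reladdr_offset * 16;    /* 16 bytes per vec4 element */
}
#endif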
858
859
860 /**
861 * Get brw reg corresponding to the instruction's [argIndex] src reg.
862 * TODO: relative addressing!
863 */
864 static struct brw_reg
865 get_src_reg( struct brw_vs_compile *c,
866 const struct prog_instruction *inst,
867 GLuint argIndex )
868 {
869 const GLuint file = inst->SrcReg[argIndex].File;
870 const GLint index = inst->SrcReg[argIndex].Index;
871 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
872
873 switch (file) {
874 case PROGRAM_TEMPORARY:
875 case PROGRAM_INPUT:
876 case PROGRAM_OUTPUT:
877 if (relAddr) {
878 return deref(c, c->regs[file][0], index);
879 }
880 else {
881 assert(c->regs[file][index].nr != 0);
882 return c->regs[file][index];
883 }
884
885 case PROGRAM_STATE_VAR:
886 case PROGRAM_CONSTANT:
887 case PROGRAM_UNIFORM:
888 if (c->vp->use_const_buffer) {
889 return get_constant(c, inst, argIndex);
890 }
891 else if (relAddr) {
892 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
893 }
894 else {
895 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
896 return c->regs[PROGRAM_STATE_VAR][index];
897 }
898 case PROGRAM_ADDRESS:
899 assert(index == 0);
900 return c->regs[file][index];
901
902 case PROGRAM_UNDEFINED:
903 /* this is a normal case since we loop over all three src args */
904 return brw_null_reg();
905
906 case PROGRAM_LOCAL_PARAM:
907 case PROGRAM_ENV_PARAM:
908 case PROGRAM_WRITE_ONLY:
909 default:
910 assert(0);
911 return brw_null_reg();
912 }
913 }
914
915
916 static void emit_arl( struct brw_vs_compile *c,
917 struct brw_reg dst,
918 struct brw_reg arg0 )
919 {
920 struct brw_compile *p = &c->func;
921 struct brw_reg tmp = dst;
922 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
923
924 if (need_tmp)
925 tmp = get_tmp(c);
926
927 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
928 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
929
930 if (need_tmp)
931 release_tmp(c, tmp);
932 }
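/* Scalar model of emit_arl() above (illustration only; model_arl is an
 * invented name): the address register ends up holding a byte offset,
 * so the rounded-down index is scaled by 16 (the size of one vec4 slot)
 * to match the "offset * 16" term used by deref().
 */
#if 0
#include <math.h>

static int model_arl(float x)
{
   return (int) floorf(x) * 16;   /* RNDD rounds toward -inf, then scale to bytes */
}
#endif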
933
934
935 /**
936 * Return the brw reg for the given instruction's src argument.
937 * Will return mangled results for SWZ op. The emit_swz() function
938 * ignores this result and recalculates taking extended swizzles into
939 * account.
940 */
941 static struct brw_reg get_arg( struct brw_vs_compile *c,
942 const struct prog_instruction *inst,
943 GLuint argIndex )
944 {
945 const struct prog_src_register *src = &inst->SrcReg[argIndex];
946 struct brw_reg reg;
947
948 if (src->File == PROGRAM_UNDEFINED)
949 return brw_null_reg();
950
951 reg = get_src_reg(c, inst, argIndex);
952
953 /* Convert 3-bit swizzle to 2-bit.
954 */
955 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
956 GET_SWZ(src->Swizzle, 1),
957 GET_SWZ(src->Swizzle, 2),
958 GET_SWZ(src->Swizzle, 3));
959
960 /* Note this is ok for non-swizzle instructions:
961 */
962 reg.negate = src->Negate ? 1 : 0;
963
964 return reg;
965 }
966
967
968 /**
969 * Get brw register for the given program dest register.
970 */
971 static struct brw_reg get_dst( struct brw_vs_compile *c,
972 struct prog_dst_register dst )
973 {
974 struct brw_reg reg;
975
976 switch (dst.File) {
977 case PROGRAM_TEMPORARY:
978 case PROGRAM_OUTPUT:
979 assert(c->regs[dst.File][dst.Index].nr != 0);
980 reg = c->regs[dst.File][dst.Index];
981 break;
982 case PROGRAM_ADDRESS:
983 assert(dst.Index == 0);
984 reg = c->regs[dst.File][dst.Index];
985 break;
986 case PROGRAM_UNDEFINED:
987 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
988 reg = brw_null_reg();
989 break;
990 default:
991 assert(0);
992 reg = brw_null_reg();
993 }
994
995 reg.dw1.bits.writemask = dst.WriteMask;
996
997 return reg;
998 }
999
1000
1001 static void emit_swz( struct brw_vs_compile *c,
1002 struct brw_reg dst,
1003 const struct prog_instruction *inst)
1004 {
1005 const GLuint argIndex = 0;
1006 const struct prog_src_register src = inst->SrcReg[argIndex];
1007 struct brw_compile *p = &c->func;
1008 GLuint zeros_mask = 0;
1009 GLuint ones_mask = 0;
1010 GLuint src_mask = 0;
1011 GLubyte src_swz[4];
1012 GLboolean need_tmp = (src.Negate &&
1013 dst.file != BRW_GENERAL_REGISTER_FILE);
1014 struct brw_reg tmp = dst;
1015 GLuint i;
1016
1017 if (need_tmp)
1018 tmp = get_tmp(c);
1019
1020 for (i = 0; i < 4; i++) {
1021 if (dst.dw1.bits.writemask & (1<<i)) {
1022 GLubyte s = GET_SWZ(src.Swizzle, i);
1023 switch (s) {
1024 case SWIZZLE_X:
1025 case SWIZZLE_Y:
1026 case SWIZZLE_Z:
1027 case SWIZZLE_W:
1028 src_mask |= 1<<i;
1029 src_swz[i] = s;
1030 break;
1031 case SWIZZLE_ZERO:
1032 zeros_mask |= 1<<i;
1033 break;
1034 case SWIZZLE_ONE:
1035 ones_mask |= 1<<i;
1036 break;
1037 }
1038 }
1039 }
1040
1041 /* Do src first, in case dst aliases src:
1042 */
1043 if (src_mask) {
1044 struct brw_reg arg0;
1045
1046 arg0 = get_src_reg(c, inst, argIndex);
1047
1048 arg0 = brw_swizzle(arg0,
1049 src_swz[0], src_swz[1],
1050 src_swz[2], src_swz[3]);
1051
1052 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1053 }
1054
1055 if (zeros_mask)
1056 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1057
1058 if (ones_mask)
1059 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1060
1061 if (src.Negate)
1062 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1063
1064 if (need_tmp) {
1065 brw_MOV(p, dst, tmp);
1066 release_tmp(c, tmp);
1067 }
1068 }
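/* Per-channel reference for the extended swizzle handled above (scalar
 * sketch, illustration only; model_swz_channel is an invented name and
 * the numeric selectors assume Mesa's SWIZZLE_ZERO/SWIZZLE_ONE
 * encodings): each destination channel selects a source channel, the
 * constant 0, or the constant 1, and may then be negated; the code
 * above batches those cases into the writemasked MOVs.
 */
#if 0
static float model_swz_channel(const float src[4], int sel, int negate_bit)
{
   float v;

   if (sel <= 3)              /* SWIZZLE_X .. SWIZZLE_W */
      v = src[sel];
   else if (sel == 4)         /* SWIZZLE_ZERO */
      v = 0.0f;
   else                       /* SWIZZLE_ONE */
      v = 1.0f;

   return negate_bit ? -v : v;
}
#endif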
1069
1070
1071 /**
1072 * Post-vertex-program processing. Send the results to the URB.
1073 */
1074 static void emit_vertex_write( struct brw_vs_compile *c)
1075 {
1076 struct brw_compile *p = &c->func;
1077 struct brw_reg m0 = brw_message_reg(0);
1078 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1079 struct brw_reg ndc;
1080 int eot;
1081
1082 if (c->key.copy_edgeflag) {
1083 brw_MOV(p,
1084 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1085 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1086 }
1087
1088 /* Build ndc coords */
1089 ndc = get_tmp(c);
1090 /* ndc = 1.0 / pos.w */
1091 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1092 /* ndc.xyz = pos * ndc */
1093 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1094
1095 /* Update the header for point size, user clipping flags, and -ve rhw
1096 * workaround.
1097 */
1098 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1099 c->key.nr_userclip || !BRW_IS_G4X(p->brw))
1100 {
1101 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1102 GLuint i;
1103
1104 brw_MOV(p, header1, brw_imm_ud(0));
1105
1106 brw_set_access_mode(p, BRW_ALIGN_16);
1107
1108 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1109 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1110 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1111 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1112 }
1113
1114 for (i = 0; i < c->key.nr_userclip; i++) {
1115 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1116 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1117 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1118 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1119 }
1120
1121 /* i965 clipping workaround:
1122 * 1) Test for -ve rhw
1123 * 2) If set,
1124 * set ndc = (0,0,0,0)
1125 * set ucp[6] = 1
1126 *
1127 * Later, clipping will detect ucp[6] and ensure the primitive is
1128 * clipped against all fixed planes.
1129 */
1130 if (!BRW_IS_G4X(p->brw)) {
1131 brw_CMP(p,
1132 vec8(brw_null_reg()),
1133 BRW_CONDITIONAL_L,
1134 brw_swizzle1(ndc, 3),
1135 brw_imm_f(0));
1136
1137 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1138 brw_MOV(p, ndc, brw_imm_f(0));
1139 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1140 }
1141
1142 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1143 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1144 brw_set_access_mode(p, BRW_ALIGN_16);
1145
1146 release_tmp(c, header1);
1147 }
1148 else {
1149 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1150 }
1151
1152 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1153 * of zeros followed by two sets of NDC coordinates:
1154 */
1155 brw_set_access_mode(p, BRW_ALIGN_1);
1156 brw_MOV(p, offset(m0, 2), ndc);
1157 brw_MOV(p, offset(m0, 3), pos);
1158
1159 eot = (c->first_overflow_output == 0);
1160
1161 brw_urb_WRITE(p,
1162 brw_null_reg(), /* dest */
1163 0, /* starting mrf reg nr */
1164 c->r0, /* src */
1165 0, /* allocate */
1166 1, /* used */
1167 MIN2(c->nr_outputs + 3, (BRW_MAX_MRF-1)), /* msg len */
1168 0, /* response len */
1169 eot, /* eot */
1170 1, /* writes complete */
1171 0, /* urb destination offset */
1172 BRW_URB_SWIZZLE_INTERLEAVE);
1173
1174 if (c->first_overflow_output > 0) {
1175 /* Not all of the vertex outputs/results fit into the MRF.
1176 * Move the overflowed attributes from the GRF to the MRF and
1177 * issue another brw_urb_WRITE().
1178 */
1179 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1180 * at mrf[4] atm...
1181 */
1182 GLuint i, mrf = 0;
1183 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1184 if (c->prog_data.outputs_written & (1 << i)) {
1185 /* move from GRF to MRF */
1186 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1187 mrf++;
1188 }
1189 }
1190
1191 brw_urb_WRITE(p,
1192 brw_null_reg(), /* dest */
1193 4, /* starting mrf reg nr */
1194 c->r0, /* src */
1195 0, /* allocate */
1196 1, /* used */
1197 mrf+1, /* msg len */
1198 0, /* response len */
1199 1, /* eot */
1200 1, /* writes complete */
1201 BRW_MAX_MRF-1, /* urb destination offset */
1202 BRW_URB_SWIZZLE_INTERLEAVE);
1203 }
1204 }
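/* Model of the header1 dword assembled in emit_vertex_write() above
 * (illustration only; model_vertex_header1 is an invented name and the
 * bit positions are read off the shifts and masks in the code): the
 * user clip flags occupy the low bits, bit 6 is the negative-rhw flag,
 * and the point size is packed above bit 8 by the multiply by 1<<11
 * followed by the 0x7ff<<8 mask.
 */
#if 0
static unsigned model_vertex_header1(float psiz, unsigned clip_flags,
                                     int negative_rhw)
{
   unsigned header = 0;

   header |= ((unsigned) (psiz * (1 << 11))) & (0x7ff << 8);
   header |= clip_flags;            /* one bit per user clip plane      */
   if (negative_rhw)
      header |= 1u << 6;            /* clip against all planes later on */
   return header;
}
#endif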
1205
1206
1207 /**
1208 * Called after code generation to resolve subroutine calls and the
1209 * END instruction.
1210 * \param end_inst points to brw code for END instruction
1211 * \param last_inst points to last instruction emitted before vertex write
1212 */
1213 static void
1214 post_vs_emit( struct brw_vs_compile *c,
1215 struct brw_instruction *end_inst,
1216 struct brw_instruction *last_inst )
1217 {
1218 GLint offset;
1219
1220 brw_resolve_cals(&c->func);
1221
1222 /* patch up the END code to jump past subroutines, etc */
1223 offset = last_inst - end_inst;
1224 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1225 }
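/* What the patch in post_vs_emit() encodes (illustration only;
 * model_end_jump_immediate is an invented name): the END instruction is
 * an ADD to the IP register, GEN4 instructions are 16 bytes each, and
 * the src1 immediate is a byte offset, hence the "offset * 16".
 */
#if 0
static int model_end_jump_immediate(int end_insn_index, int last_insn_index)
{
   return (last_insn_index - end_insn_index) * 16;   /* instructions -> bytes */
}
#endif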
1226
1227
1228 /* Emit the vertex program instructions here.
1229 */
1230 void brw_vs_emit(struct brw_vs_compile *c )
1231 {
1232 #define MAX_IF_DEPTH 32
1233 #define MAX_LOOP_DEPTH 32
1234 struct brw_compile *p = &c->func;
1235 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1236 GLuint insn, if_depth = 0, loop_depth = 0;
1237 GLuint end_offset = 0;
1238 struct brw_instruction *end_inst, *last_inst;
1239 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1240 const struct brw_indirect stack_index = brw_indirect(0, 0);
1241 GLuint index;
1242 GLuint file;
1243
1244 if (INTEL_DEBUG & DEBUG_VS) {
1245 _mesa_printf("vs-emit:\n");
1246 _mesa_print_program(&c->vp->program.Base);
1247 _mesa_printf("\n");
1248 }
1249
1250 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1251 brw_set_access_mode(p, BRW_ALIGN_16);
1252
1253 /* Message registers can't be read, so copy an output into a GRF register
1254 if it is used as a source register */
1255 for (insn = 0; insn < nr_insns; insn++) {
1256 GLuint i;
1257 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1258 for (i = 0; i < 3; i++) {
1259 struct prog_src_register *src = &inst->SrcReg[i];
1260 GLuint index = src->Index;
1261 GLuint file = src->File;
1262 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1263 c->output_regs[index].used_in_src = GL_TRUE;
1264 }
1265 }
1266
1267 /* Static register allocation
1268 */
1269 brw_vs_alloc_regs(c);
1270 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1271
1272 for (insn = 0; insn < nr_insns; insn++) {
1273
1274 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1275 struct brw_reg args[3], dst;
1276 GLuint i;
1277
1278 #if 0
1279 printf("%d: ", insn);
1280 _mesa_print_instruction(inst);
1281 #endif
1282
1283 /* Get argument regs. SWZ is special and does this itself.
1284 */
1285 if (inst->Opcode != OPCODE_SWZ)
1286 for (i = 0; i < 3; i++) {
1287 const struct prog_src_register *src = &inst->SrcReg[i];
1288 index = src->Index;
1289 file = src->File;
1290 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1291 args[i] = c->output_regs[index].reg;
1292 else
1293 args[i] = get_arg(c, inst, i);
1294 }
1295
1296 /* Get dest regs. Note that it is possible for a reg to be both
1297 * dst and arg, given the static allocation of registers. So
1298 * care needs to be taken emitting multi-operation instructions.
1299 */
1300 index = inst->DstReg.Index;
1301 file = inst->DstReg.File;
1302 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1303 dst = c->output_regs[index].reg;
1304 else
1305 dst = get_dst(c, inst->DstReg);
1306
1307 if (inst->SaturateMode != SATURATE_OFF) {
1308 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1309 inst->SaturateMode);
1310 }
1311
1312 switch (inst->Opcode) {
1313 case OPCODE_ABS:
1314 brw_MOV(p, dst, brw_abs(args[0]));
1315 break;
1316 case OPCODE_ADD:
1317 brw_ADD(p, dst, args[0], args[1]);
1318 break;
1319 case OPCODE_COS:
1320 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1321 break;
1322 case OPCODE_DP3:
1323 brw_DP3(p, dst, args[0], args[1]);
1324 break;
1325 case OPCODE_DP4:
1326 brw_DP4(p, dst, args[0], args[1]);
1327 break;
1328 case OPCODE_DPH:
1329 brw_DPH(p, dst, args[0], args[1]);
1330 break;
1331 case OPCODE_NRM3:
1332 emit_nrm(c, dst, args[0], 3);
1333 break;
1334 case OPCODE_NRM4:
1335 emit_nrm(c, dst, args[0], 4);
1336 break;
1337 case OPCODE_DST:
1338 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1339 break;
1340 case OPCODE_EXP:
1341 unalias1(c, dst, args[0], emit_exp_noalias);
1342 break;
1343 case OPCODE_EX2:
1344 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1345 break;
1346 case OPCODE_ARL:
1347 emit_arl(c, dst, args[0]);
1348 break;
1349 case OPCODE_FLR:
1350 brw_RNDD(p, dst, args[0]);
1351 break;
1352 case OPCODE_FRC:
1353 brw_FRC(p, dst, args[0]);
1354 break;
1355 case OPCODE_LOG:
1356 unalias1(c, dst, args[0], emit_log_noalias);
1357 break;
1358 case OPCODE_LG2:
1359 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1360 break;
1361 case OPCODE_LIT:
1362 unalias1(c, dst, args[0], emit_lit_noalias);
1363 break;
1364 case OPCODE_LRP:
1365 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1366 break;
1367 case OPCODE_MAD:
1368 brw_MOV(p, brw_acc_reg(), args[2]);
1369 brw_MAC(p, dst, args[0], args[1]);
1370 break;
1371 case OPCODE_MAX:
1372 emit_max(p, dst, args[0], args[1]);
1373 break;
1374 case OPCODE_MIN:
1375 emit_min(p, dst, args[0], args[1]);
1376 break;
1377 case OPCODE_MOV:
1378 brw_MOV(p, dst, args[0]);
1379 break;
1380 case OPCODE_MUL:
1381 brw_MUL(p, dst, args[0], args[1]);
1382 break;
1383 case OPCODE_POW:
1384 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1385 break;
1386 case OPCODE_RCP:
1387 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1388 break;
1389 case OPCODE_RSQ:
1390 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1391 break;
1392
1393 case OPCODE_SEQ:
1394 emit_seq(p, dst, args[0], args[1]);
1395 break;
1396 case OPCODE_SIN:
1397 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1398 break;
1399 case OPCODE_SNE:
1400 emit_sne(p, dst, args[0], args[1]);
1401 break;
1402 case OPCODE_SGE:
1403 emit_sge(p, dst, args[0], args[1]);
1404 break;
1405 case OPCODE_SGT:
1406 emit_sgt(p, dst, args[0], args[1]);
1407 break;
1408 case OPCODE_SLT:
1409 emit_slt(p, dst, args[0], args[1]);
1410 break;
1411 case OPCODE_SLE:
1412 emit_sle(p, dst, args[0], args[1]);
1413 break;
1414 case OPCODE_SUB:
1415 brw_ADD(p, dst, args[0], negate(args[1]));
1416 break;
1417 case OPCODE_SWZ:
1418 /* The args[0] value can't be used here as it won't have
1419 * correctly encoded the full swizzle:
1420 */
1421 emit_swz(c, dst, inst);
1422 break;
1423 case OPCODE_TRUNC:
1424 /* round toward zero */
1425 brw_RNDZ(p, dst, args[0]);
1426 break;
1427 case OPCODE_XPD:
1428 emit_xpd(p, dst, args[0], args[1]);
1429 break;
1430 case OPCODE_IF:
1431 assert(if_depth < MAX_IF_DEPTH);
1432 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1433 break;
1434 case OPCODE_ELSE:
1435 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1436 break;
1437 case OPCODE_ENDIF:
1438 assert(if_depth > 0);
1439 brw_ENDIF(p, if_inst[--if_depth]);
1440 break;
1441 #if 0
1442 case OPCODE_BGNLOOP:
1443 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1444 break;
1445 case OPCODE_BRK:
1446 brw_BREAK(p);
1447 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1448 break;
1449 case OPCODE_CONT:
1450 brw_CONT(p);
1451 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1452 break;
1453 case OPCODE_ENDLOOP:
1454 {
1455 struct brw_instruction *inst0, *inst1;
1456 loop_depth--;
1457 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1458 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1459 while (inst0 > loop_inst[loop_depth]) {
1460 inst0--;
1461 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1462 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1463 inst0->bits3.if_else.pop_count = 0;
1464 }
1465 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1466 inst0->bits3.if_else.jump_count = inst1 - inst0;
1467 inst0->bits3.if_else.pop_count = 0;
1468 }
1469 }
1470 }
1471 break;
1472 #else
1473 (void) loop_inst;
1474 (void) loop_depth;
1475 #endif
1476 case OPCODE_BRA:
1477 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1478 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1479 brw_set_predicate_control_flag_value(p, 0xff);
1480 break;
1481 case OPCODE_CAL:
1482 brw_set_access_mode(p, BRW_ALIGN_1);
1483 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1484 brw_set_access_mode(p, BRW_ALIGN_16);
1485 brw_ADD(p, get_addr_reg(stack_index),
1486 get_addr_reg(stack_index), brw_imm_d(4));
1487 brw_save_call(p, inst->Comment, p->nr_insn);
1488 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1489 break;
1490 case OPCODE_RET:
1491 brw_ADD(p, get_addr_reg(stack_index),
1492 get_addr_reg(stack_index), brw_imm_d(-4));
1493 brw_set_access_mode(p, BRW_ALIGN_1);
1494 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1495 brw_set_access_mode(p, BRW_ALIGN_16);
1496 break;
1497 case OPCODE_END:
1498 end_offset = p->nr_insn;
1499 /* this instruction will get patched later to jump past subroutine
1500 * code, etc.
1501 */
1502 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1503 break;
1504 case OPCODE_PRINT:
1505 /* no-op */
1506 break;
1507 case OPCODE_BGNSUB:
1508 brw_save_label(p, inst->Comment, p->nr_insn);
1509 break;
1510 case OPCODE_ENDSUB:
1511 /* no-op */
1512 break;
1513 default:
1514 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1515 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1516 _mesa_opcode_string(inst->Opcode) :
1517 "unknown");
1518 }
1519
1520 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1521 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1522 && c->output_regs[inst->DstReg.Index].used_in_src) {
1523 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1524 }
1525
1526 /* Result color clamping.
1527 *
1528 * When destination register is an output register and
1529 * it's primary/secondary front/back color, we have to clamp
1530 * the result to [0,1]. This is done by enabling the
1531 * saturation bit for the last instruction.
1532 *
1533 * We don't use brw_set_saturate() as it modifies
1534 * p->current->header.saturate, which affects all the subsequent
1535 * instructions. Instead, we directly modify the header
1536 * of the last (already stored) instruction.
1537 */
1538 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1539 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1540 || (inst->DstReg.Index == VERT_RESULT_COL1)
1541 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1542 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1543 p->store[p->nr_insn-1].header.saturate = 1;
1544 }
1545 }
1546
1547 release_tmps(c);
1548 }
1549
1550 end_inst = &p->store[end_offset];
1551 last_inst = &p->store[p->nr_insn];
1552
1553 /* The END instruction will be patched to jump to this code */
1554 emit_vertex_write(c);
1555
1556 post_vs_emit(c, end_inst, last_inst);
1557 }