src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "brw_context.h"
  34 #include "program.h"
  35 #include "program_instruction.h"
  36 #include "macros.h"
  37 #include "brw_vs.h"
  38
  39
  40
  41 /* Do things as simply as possible.  Allocate and populate all regs
  42  * ahead of time.
  43  */
  44 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  45 {
  46    GLuint i, reg = 0, mrf;
  47    GLuint nr_params;
  48
  49    /* r0 -- reserved as usual
  50     */
  51    c->r0 = brw_vec8_grf(reg, 0); reg++;
  52
  53    /* User clip planes from curbe:
  54     */
  55    if (c->key.nr_userclip) {
  56       for (i = 0; i < c->key.nr_userclip; i++) {
  57          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  58       }
  59
  60       /* Deal with curbe alignment:
  61        */
  62       reg += ((6+c->key.nr_userclip+3)/4)*2;
  63    }
  64
  65    /* Vertex program parameters from curbe:
  66     */
  67    nr_params = c->vp->program.Base.Parameters->NumParameters;
  68    for (i = 0; i < nr_params; i++) {
  69       c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
  70    }
  71    reg += (nr_params+1)/2;
  72
  73    c->prog_data.curb_read_length = reg - 1;
  74
  75
  76
  77    /* Allocate input regs:
  78     */
  79    c->nr_inputs = 0;
  80    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
  81       if (c->prog_data.inputs_read & (1<<i)) {
  82          c->nr_inputs++;
  83          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
  84          reg++;
  85       }
  86    }
  87
  88
  89    /* Allocate outputs: TODO: could organize the non-position outputs
  90     * to go straight into message regs.
  91     */
  92    c->nr_outputs = 0;
  93    c->first_output = reg;
  94    mrf = 4;
  95    for (i = 0; i < VERT_RESULT_MAX; i++) {
  96       if (c->prog_data.outputs_written & (1<<i)) {
  97          c->nr_outputs++;
  98          if (i == VERT_RESULT_HPOS) {
  99             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 100             reg++;
 101          }
 102          else if (i == VERT_RESULT_PSIZ) {
 103             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 104             reg++;
 105             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 106          }
 107          else {
 108             c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 109             mrf++;
 110          }
 111       }
 112    }
 113
 114    /* Allocate program temporaries:
 115     */
 116    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 117       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 118       reg++;
 119    }
 120
 121    /* Address reg(s).  Don't try to use the internal address reg until
 122     * deref time.
 123     */
 124    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 125       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 126                                              reg,
 127                                              0,
 128                                              BRW_REGISTER_TYPE_D,
 129                                              BRW_VERTICAL_STRIDE_8,
 130                                              BRW_WIDTH_8,
 131                                              BRW_HORIZONTAL_STRIDE_1,
 132                                              BRW_SWIZZLE_XXXX,
 133                                              WRITEMASK_X);
 134       reg++;
 135    }
 136
 137
 138    /* Some opcodes need an internal temporary:
 139     */
 140    c->first_tmp = reg;
 141    c->last_tmp = reg;           /* for allocation purposes */
 142
 143    /* Each input reg holds data from two vertices.  The
 144     * urb_read_length is the number of registers read from *each*
 145     * vertex urb, so is half the amount:
 146     */
 147    c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
 148
 149    c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
 150    c->prog_data.total_grf = reg;
 151 }
 152
 153
 154 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 155 {
 156    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
 157
 158    if (++c->last_tmp > c->prog_data.total_grf)
 159       c->prog_data.total_grf = c->last_tmp;
 160
 161    return tmp;
 162 }
 163
 164 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
 165 {
 166    if (tmp.nr == c->last_tmp-1)
 167       c->last_tmp--;
 168 }
 169
/* Drop every scratch register at once by rewinding the allocation
 * pointer to the first temporary.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
 174
 175
 176 static void unalias1( struct brw_vs_compile *c,
 177                       struct brw_reg dst,
 178                       struct brw_reg arg0,
 179                       void (*func)( struct brw_vs_compile *,
 180                                     struct brw_reg,
 181                                     struct brw_reg ))
 182 {
 183    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 184       struct brw_compile *p = &c->func;
 185       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 186       func(c, tmp, arg0);
 187       brw_MOV(p, dst, tmp);
 188    }
 189    else {
 190       func(c, dst, arg0);
 191    }
 192 }
 193
 194 static void unalias2( struct brw_vs_compile *c,
 195                       struct brw_reg dst,
 196                       struct brw_reg arg0,
 197                       struct brw_reg arg1,
 198                       void (*func)( struct brw_vs_compile *,
 199                                     struct brw_reg,
 200                                     struct brw_reg,
 201                                     struct brw_reg ))
 202 {
 203    if ((dst.file == arg0.file && dst.nr == arg0.nr) &&
 204        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 205       struct brw_compile *p = &c->func;
 206       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 207       func(c, tmp, arg0, arg1);
 208       brw_MOV(p, dst, tmp);
 209    }
 210    else {
 211       func(c, dst, arg0, arg1);
 212    }
 213 }
 214
 215
 216
 217
/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0
 *
 * Implemented as a GE compare followed by an unpredicated write of 1.0
 * and a predicated write of 0.0.
 */
static void emit_slt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   /* Could be done with an if/else/endif, but this method uses half
    * the instructions.  Note that we are careful to reference the
    * arguments before writing the dest.  That means we emit the
    * instructions in an odd order and have to play with the flag
    * values.
    */
   brw_push_insn_state(p);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);

   /* Write all values to 1:
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, dst, brw_imm_f(1.0));

   /* Where the test succeeded, overwrite with zero:
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
   brw_MOV(p, dst, brw_imm_f(0.0));
   brw_pop_insn_state(p);
}
 243
 244
/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0
 *
 * Same scheme as emit_slt() with the two constants swapped: the sources
 * are compared first, then dst is written unconditionally with 0 and
 * finally overwritten with 1.0 under predication.
 */
static void emit_sge( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_push_insn_state(p);
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);

   /* Write all values to zero:
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, dst, brw_imm_f(0));

   /* Where the test succeeded, overwrite with 1:
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_pop_insn_state(p);
}
 264
 265
/* MAX: dst = max(arg0, arg1)
 *
 * Compares arg0 < arg1 and then selects between the two sources;
 * arg1 is listed first so it is the one taken where the test passed.
 * NOTE(review): this relies on brw_CMP leaving predication switched on
 * for the SEL that follows -- confirm in brw_eu_emit.c.  Predication is
 * explicitly switched back off before returning.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
 275
/* MIN: dst = min(arg0, arg1)
 *
 * Compares arg0 < arg1 and then selects between the two sources;
 * arg0 is listed first so it is the one taken where the test passed.
 * NOTE(review): this relies on brw_CMP leaving predication switched on
 * for the SEL that follows -- confirm in brw_eu_emit.c.  Predication is
 * explicitly switched back off before returning.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
 285
 286
 287 static void emit_math1( struct brw_vs_compile *c,
 288                         GLuint function,
 289                         struct brw_reg dst,
 290                         struct brw_reg arg0,
 291                         GLuint precision)
 292 {
 293    /* There are various odd behaviours with SEND on the simulator.  In
 294     * addition there are documented issues with the fact that the GEN4
 295     * processor doesn't do dependency control properly on SEND
 296     * results.  So, on balance, this kludge to get around failures
 297     * with writemasked math results looks like it might be necessary
 298     * whether that turns out to be a simulator bug or not:
 299     */
 300    struct brw_compile *p = &c->func;
 301    struct brw_reg tmp = dst;
 302    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 303                          dst.file != BRW_GENERAL_REGISTER_FILE);
 304
 305    if (need_tmp)
 306       tmp = get_tmp(c);
 307
 308    brw_math(p,
 309             tmp,
 310             function,
 311             BRW_MATH_SATURATE_NONE,
 312             2,
 313             arg0,
 314             BRW_MATH_DATA_SCALAR,
 315             precision);
 316
 317    if (need_tmp) {
 318       brw_MOV(p, dst, tmp);
 319       release_tmp(c, tmp);
 320    }
 321 }
 322
 323 static void emit_math2( struct brw_vs_compile *c,
 324                         GLuint function,
 325                         struct brw_reg dst,
 326                         struct brw_reg arg0,
 327                         struct brw_reg arg1,
 328                         GLuint precision)
 329 {
 330    struct brw_compile *p = &c->func;
 331    struct brw_reg tmp = dst;
 332    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 333                          dst.file != BRW_GENERAL_REGISTER_FILE);
 334
 335    if (need_tmp)
 336       tmp = get_tmp(c);
 337
 338    brw_MOV(p, brw_message_reg(3), arg1);
 339
 340    brw_math(p,
 341             tmp,
 342             function,
 343             BRW_MATH_SATURATE_NONE,
 344             2,
 345             arg0,
 346             BRW_MATH_DATA_SCALAR,
 347             precision);
 348
 349    if (need_tmp) {
 350       brw_MOV(p, dst, tmp);
 351       release_tmp(c, tmp);
 352    }
 353 }
 354
 355
 356
/* EXP, for the case where dst does not alias arg0 (callers go through
 * unalias1()).  Only the channels enabled in dst's writemask are
 * computed:
 *
 *    dst.x = 2 ^ floor(arg0.x)      (built directly as a float bit pattern)
 *    dst.y = arg0.x - floor(arg0.x)
 *    dst.z = mathbox EXP of arg0.x, partial precision
 *    dst.w = 1.0
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[2] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_PARTIAL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
 412
 413
/* LOG, for the case where dst does not alias arg0 (callers go through
 * unalias1()).
 *
 *    x = unbiased exponent of |arg0.x|, as a float
 *    y = mantissa of |arg0.x| with the exponent forced to 127 (1.0 <= y < 2.0)
 *    z = x + mathbox LOG of y
 *    w = 1.0
 *
 * X and Y are also computed when only Z is requested, because Z is
 * derived from them.  Results are assembled in a scratch register
 * unless dst is a general register with all four channels enabled.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit ... */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      /* ... shift the biased exponent down to bit 0 ... */
      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* ... and remove the bias, writing the result as a float */
      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the 23 mantissa bits ... */
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      /* ... and install a biased exponent of 127 */
      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
 503
 504
 505
 506
/* DST (distance vector), for the case where dst does not alias either
 * source.  Only the channels enabled in dst's writemask are written:
 *
 *    dst.x = 1.0
 *    dst.y = arg0.y * arg1.y
 *    dst.z = arg0.z
 *    dst.w = arg1.w
 *
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 * (dst is written one channel at a time while the sources are still
 * being read, which is why callers go through unalias2()).
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0,
                              struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
 527
/* XPD: dst = t cross u, i.e. per channel
 *
 *    t.yzx * u.zxy - t.zxy * u.yzx
 *
 * The first product is written to the null register.
 * NOTE(review): presumably it is also left in the accumulator, to which
 * the MAC then adds the second (negated) product -- confirm against the
 * MUL/MAC accumulator semantics in the hardware docs.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
 536
 537
 538
/* LIT (lighting coefficients), for the case where dst does not alias
 * arg0 (callers go through unalias1()).
 *
 * dst is first set to (1, 0, 0, 1).  Then, inside an IF taken where
 * arg0.x > 0, dst.y is set to arg0.x and dst.z to POW(arg0.y, arg0.w),
 * computed by the mathbox at partial precision.  The copy of arg0.y
 * that feeds the POW follows a compare against zero.
 * NOTE(review): presumably that copy is predicated on arg0.y > 0 --
 * confirm that brw_CMP enables predication for the next instruction.
 *
 * The POW base is staged in 'tmp', which is dst itself when dst is a
 * general register and a scratch register otherwise.  The scratch
 * register is not released here.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Defaults, used where the IF below is not taken: */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Stage the POW base, then switch predication off again before
       * emitting the math instruction:
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
                 BRW_MATH_FUNCTION_POW,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(tmp, 2),
                 brw_swizzle1(arg0, 3),
                 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);
}
 578
 579
 580
 581
 582
 583 /* TODO: relative addressing!
 584  */
 585 static struct brw_reg get_reg( struct brw_vs_compile *c,
 586                                GLuint file,
 587                                GLuint index )
 588 {
 589
 590    switch (file) {
 591    case PROGRAM_TEMPORARY:
 592    case PROGRAM_INPUT:
 593    case PROGRAM_OUTPUT:
 594    case PROGRAM_STATE_VAR:
 595       assert(c->regs[file][index].nr != 0);
 596       return c->regs[file][index];
 597    case PROGRAM_ADDRESS:
 598       assert(index == 0);
 599       return c->regs[file][index];
 600
 601    case PROGRAM_UNDEFINED:                      /* undef values */
 602       return brw_null_reg();
 603
 604    case PROGRAM_LOCAL_PARAM:
 605    case PROGRAM_ENV_PARAM:
 606    case PROGRAM_WRITE_ONLY:
 607    default:
 608       assert(0);
 609       return brw_null_reg();
 610    }
 611 }
 612
 613
 614
/* Fetch a relatively-addressed vec4 into a scratch register.
 *
 * arg    - register at the base of the array being indexed
 * offset - constant index added on top of the address register value
 *
 * The constant part of the address is arg.nr * 32 + arg.subnr +
 * offset * 16.  NOTE(review): 32 and 16 look like the byte sizes of a
 * register and of a vec4 respectively -- confirm.  The variable part
 * comes from vertex program address register 0 (see emit_arl()).
 *
 * Returns the scratch register, widened to 8 channels.  It is not
 * released here.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      /* The indirect fetches are emitted in align1 mode; the previous
       * instruction state is restored afterwards.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      /* Same again for the other half of the register pair, using the
       * address value found 8 sub-elements further along:
       */
      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   return vec8(tmp);
}
 644
 645
 646 static void emit_arl( struct brw_vs_compile *c,
 647                       struct brw_reg dst,
 648                       struct brw_reg arg0 )
 649 {
 650    struct brw_compile *p = &c->func;
 651    struct brw_reg tmp = dst;
 652    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 653
 654    if (need_tmp)
 655       tmp = get_tmp(c);
 656
 657    brw_RNDD(p, tmp, arg0);
 658    brw_MUL(p, dst, tmp, brw_imm_d(16));
 659
 660    if (need_tmp)
 661       release_tmp(c, tmp);
 662 }
 663
 664
 665 /* Will return mangled results for SWZ op.  The emit_swz() function
 666  * ignores this result and recalculates taking extended swizzles into
 667  * account.
 668  */
 669 static struct brw_reg get_arg( struct brw_vs_compile *c,
 670                                struct prog_src_register src )
 671 {
 672    struct brw_reg reg;
 673
 674    if (src.File == PROGRAM_UNDEFINED)
 675       return brw_null_reg();
 676
 677    if (src.RelAddr)
 678       reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 679    else
 680       reg = get_reg(c, src.File, src.Index);
 681
 682    /* Convert 3-bit swizzle to 2-bit.
 683     */
 684    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src.Swizzle, 0),
 685                                        GET_SWZ(src.Swizzle, 1),
 686                                        GET_SWZ(src.Swizzle, 2),
 687                                        GET_SWZ(src.Swizzle, 3));
 688
 689    /* Note this is ok for non-swizzle instructions:
 690     */
 691    reg.negate = src.NegateBase ? 1 : 0;
 692
 693    return reg;
 694 }
 695
 696
/* Translate a Mesa destination operand into a hardware register: look
 * up the allocated register and install the instruction's writemask.
 */
static struct brw_reg get_dst( struct brw_vs_compile *c,
                               struct prog_dst_register dst )
{
   struct brw_reg reg = get_reg(c, dst.File, dst.Index);

   reg.dw1.bits.writemask = dst.WriteMask;

   return reg;
}
 706
 707
 708
 709
 710 static void emit_swz( struct brw_vs_compile *c,
 711                       struct brw_reg dst,
 712                       struct prog_src_register src )
 713 {
 714    struct brw_compile *p = &c->func;
 715    GLuint zeros_mask = 0;
 716    GLuint ones_mask = 0;
 717    GLuint src_mask = 0;
 718    GLubyte src_swz[4];
 719    GLboolean need_tmp = (src.NegateBase &&
 720                          dst.file != BRW_GENERAL_REGISTER_FILE);
 721    struct brw_reg tmp = dst;
 722    GLuint i;
 723
 724    if (need_tmp)
 725       tmp = get_tmp(c);
 726
 727    for (i = 0; i < 4; i++) {
 728       if (dst.dw1.bits.writemask & (1<<i)) {
 729          GLubyte s = GET_SWZ(src.Swizzle, i);
 730          switch (s) {
 731          case SWIZZLE_X:
 732          case SWIZZLE_Y:
 733          case SWIZZLE_Z:
 734          case SWIZZLE_W:
 735             src_mask |= 1<<i;
 736             src_swz[i] = s;
 737             break;
 738          case SWIZZLE_ZERO:
 739             zeros_mask |= 1<<i;
 740             break;
 741          case SWIZZLE_ONE:
 742             ones_mask |= 1<<i;
 743             break;
 744          }
 745       }
 746    }
 747
 748    /* Do src first, in case dst aliases src:
 749     */
 750    if (src_mask) {
 751       struct brw_reg arg0;
 752
 753       if (src.RelAddr)
 754          arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 755       else
 756          arg0 = get_reg(c, src.File, src.Index);
 757
 758       arg0 = brw_swizzle(arg0,
 759                          src_swz[0], src_swz[1],
 760                          src_swz[2], src_swz[3]);
 761
 762       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
 763    }
 764
 765    if (zeros_mask)
 766       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
 767
 768    if (ones_mask)
 769       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 770
 771    if (src.NegateBase)
 772       brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
 773
 774    if (need_tmp) {
 775       brw_MOV(p, dst, tmp);
 776       release_tmp(c, tmp);
 777    }
 778 }
 779
 780
 781
/* Post-vertex-program processing.  Send the results to the URB.
 *
 * Steps, in order:
 *   - optionally copy the edge flag input to the edge flag output
 *   - unless w is known to be one, compute ndc = pos.xyz * (1 / pos.w),
 *     with 1 / pos.w left in ndc.w
 *   - build a header dword holding the point size, one bit per user
 *     clip plane and the negative-rhw workaround bit, and move it to m1
 *     (m1 is simply zeroed when none of those apply)
 *   - move ndc to m2 and pos to m3
 *   - issue the URB write that ends the thread (eot = 1)
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }


   /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
    */
   if (!c->key.know_w_is_one) {
      ndc = get_tmp(c);
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }
   else {
      ndc = pos;
   }

   /* This includes the workaround for -ve rhw, so is no longer an
    * optional step:
    */
   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
       c->key.nr_userclip ||
       !c->key.know_w_is_one)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      /* Point size: scale by 1<<11 and keep the bits selected by
       * 0x7ff<<8 in the W channel of the header.
       */
      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }


      /* One header bit per user clip plane: the DP4 against the plane
       * carries a "less than" conditional modifier.
       * NOTE(review): presumably the OR that follows is predicated on
       * that result, so bit i is set only where the dot product is
       * negative -- confirm in brw_eu_emit.c.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }


      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (!c->key.know_w_is_one) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }


   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);
   brw_MOV(p, offset(m0, 3), pos);


   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,             /* starting mrf reg nr */
                 c->r0,         /* src */
                 0,             /* allocate */
                 1,             /* used */
                 c->nr_outputs + 3, /* msg len */
                 0,             /* response len */
                 1,             /* eot */
                 1,             /* writes complete */
                 0,             /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

}
 892
 893
 894
 895
 896 /* Emit the fragment program instructions here.
 897  */
 898 void brw_vs_emit( struct brw_vs_compile *c )
 899 {
 900    struct brw_compile *p = &c->func;
 901    GLuint nr_insns = c->vp->program.Base.NumInstructions;
 902    GLuint insn;
 903
 904
 905    if (INTEL_DEBUG & DEBUG_VS) {
 906       _mesa_printf("\n\n\nvs-emit:\n");
 907       _mesa_print_program(&c->vp->program.Base);
 908       _mesa_printf("\n");
 909    }
 910
 911    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 912    brw_set_access_mode(p, BRW_ALIGN_16);
 913
 914    /* Static register allocation
 915     */
 916    brw_vs_alloc_regs(c);
 917
 918    for (insn = 0; insn < nr_insns; insn++) {
 919
 920       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
 921       struct brw_reg args[3], dst;
 922       GLuint i;
 923
 924       /* Get argument regs.  SWZ is special and does this itself.
 925        */
 926       if (inst->Opcode != OPCODE_SWZ)
 927          for (i = 0; i < 3; i++)
 928             args[i] = get_arg(c, inst->SrcReg[i]);
 929
 930       /* Get dest regs.  Note that it is possible for a reg to be both
 931        * dst and arg, given the static allocation of registers.  So
 932        * care needs to be taken emitting multi-operation instructions.
 933        */
 934       dst = get_dst(c, inst->DstReg);
 935
 936
 937       switch (inst->Opcode) {
 938       case OPCODE_ABS:
 939          brw_MOV(p, dst, brw_abs(args[0]));
 940          break;
 941       case OPCODE_ADD:
 942          brw_ADD(p, dst, args[0], args[1]);
 943          break;
 944       case OPCODE_DP3:
 945          brw_DP3(p, dst, args[0], args[1]);
 946          break;
 947       case OPCODE_DP4:
 948          brw_DP4(p, dst, args[0], args[1]);
 949          break;
 950       case OPCODE_DPH:
 951          brw_DPH(p, dst, args[0], args[1]);
 952          break;
 953       case OPCODE_DST:
 954          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
 955          break;
 956       case OPCODE_EXP:
 957          unalias1(c, dst, args[0], emit_exp_noalias);
 958          break;
 959       case OPCODE_EX2:
 960          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 961          break;
 962       case OPCODE_ARL:
 963          emit_arl(c, dst, args[0]);
 964          break;
 965       case OPCODE_FLR:
 966          brw_RNDD(p, dst, args[0]);
 967          break;
 968       case OPCODE_FRC:
 969          brw_FRC(p, dst, args[0]);
 970          break;
 971       case OPCODE_LOG:
 972          unalias1(c, dst, args[0], emit_log_noalias);
 973          break;
 974       case OPCODE_LG2:
 975          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 976          break;
 977       case OPCODE_LIT:
 978          unalias1(c, dst, args[0], emit_lit_noalias);
 979          break;
 980       case OPCODE_MAD:
 981          brw_MOV(p, brw_acc_reg(), args[2]);
 982          brw_MAC(p, dst, args[0], args[1]);
 983          break;
 984       case OPCODE_MAX:
 985          emit_max(p, dst, args[0], args[1]);
 986          break;
 987       case OPCODE_MIN:
 988          emit_min(p, dst, args[0], args[1]);
 989          break;
 990       case OPCODE_MOV:
 991          brw_MOV(p, dst, args[0]);
 992          break;
 993       case OPCODE_MUL:
 994          brw_MUL(p, dst, args[0], args[1]);
 995          break;
 996       case OPCODE_POW:
 997          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
 998          break;
 999       case OPCODE_RCP:
1000          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1001          break;
1002       case OPCODE_RSQ:
1003          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1004          break;
1005       case OPCODE_SGE:
1006          emit_sge(p, dst, args[0], args[1]);
1007          break;
1008       case OPCODE_SLT:
1009          emit_slt(p, dst, args[0], args[1]);
1010          break;
1011       case OPCODE_SUB:
1012          brw_ADD(p, dst, args[0], negate(args[1]));
1013          break;
1014       case OPCODE_SWZ:
1015          /* The args[0] value can't be used here as it won't have
1016           * correctly encoded the full swizzle:
1017           */
1018          emit_swz(c, dst, inst->SrcReg[0] );
1019          break;
1020       case OPCODE_XPD:
1021          emit_xpd(p, dst, args[0], args[1]);
1022          break;
1023       case OPCODE_END:
1024       case OPCODE_PRINT:
1025          break;
1026       default:
1027          break;
1028       }
1029
1030       release_tmps(c);
1031    }
1032
1033    emit_vertex_write(c);
1034
1035 }
1036
1037
1038
1039
1040