src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "brw_context.h"
  34 #include "program.h"
  35 #include "program_instruction.h"
  36 #include "macros.h"
  37 #include "brw_vs.h"
  38
  39
  40
  41 /* Do things as simply as possible.  Allocate and populate all regs
  42  * ahead of time.
  43  */
  44 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  45 {
  46    GLuint i, reg = 0, mrf;
  47    GLuint nr_params;
  48
  49    /* r0 -- reserved as usual
  50     */
  51    c->r0 = brw_vec8_grf(reg, 0); reg++;
  52
  53    /* User clip planes from curbe:
  54     */
  55    if (c->key.nr_userclip) {
  56       for (i = 0; i < c->key.nr_userclip; i++) {
  57          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  58       }
  59
  60       /* Deal with curbe alignment:
  61        */
  62       reg += ((6+c->key.nr_userclip+3)/4)*2;
  63    }
  64
  65    /* Vertex program parameters from curbe:
  66     */
  67    nr_params = c->vp->program.Base.Parameters->NumParameters;
  68    for (i = 0; i < nr_params; i++) {
  69       c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
  70    }
  71    reg += (nr_params+1)/2;
  72
  73    c->prog_data.curb_read_length = reg - 1;
  74
  75
  76
  77    /* Allocate input regs:
  78     */
  79    c->nr_inputs = 0;
  80    for (i = 0; i < BRW_ATTRIB_MAX; i++) {
  81       if (c->prog_data.inputs_read & (1<<i)) {
  82          c->nr_inputs++;
  83          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
  84          reg++;
  85       }
  86    }
  87
  88
  89    /* Allocate outputs: TODO: could organize the non-position outputs
  90     * to go straight into message regs.
  91     */
  92    c->nr_outputs = 0;
  93    c->first_output = reg;
  94    mrf = 4;
  95    for (i = 0; i < VERT_RESULT_MAX; i++) {
  96       if (c->prog_data.outputs_written & (1<<i)) {
  97          c->nr_outputs++;
  98          if (i == VERT_RESULT_HPOS) {
  99             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 100             reg++;
 101          }
 102          else if (i == VERT_RESULT_PSIZ) {
 103             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 104             reg++;
 105             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 106          }
 107          else {
 108             c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 109             mrf++;
 110          }
 111       }
 112    }
 113
 114    /* Allocate program temporaries:
 115     */
 116    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 117       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 118       reg++;
 119    }
 120
 121    /* Address reg(s).  Don't try to use the internal address reg until
 122     * deref time.
 123     */
 124    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 125       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 126                                              reg,
 127                                              0,
 128                                              BRW_REGISTER_TYPE_D,
 129                                              BRW_VERTICAL_STRIDE_8,
 130                                              BRW_WIDTH_8,
 131                                              BRW_HORIZONTAL_STRIDE_1,
 132                                              BRW_SWIZZLE_XXXX,
 133                                              WRITEMASK_X);
 134       reg++;
 135    }
 136
 137
 138    /* Some opcodes need an internal temporary:
 139     */
 140    c->first_tmp = reg;
 141    c->last_tmp = reg;           /* for allocation purposes */
 142
 143    /* Each input reg holds data from two vertices.  The
 144     * urb_read_length is the number of registers read from *each*
 145     * vertex urb, so is half the amount:
 146     */
 147    c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
 148
 149    c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
 150    c->prog_data.total_grf = reg;
 151 }
 152
 153
 154 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 155 {
 156    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
 157
 158    if (++c->last_tmp > c->prog_data.total_grf)
 159       c->prog_data.total_grf = c->last_tmp;
 160
 161    return tmp;
 162 }
 163
 164 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
 165 {
 166    if (tmp.nr == c->last_tmp-1)
 167       c->last_tmp--;
 168 }
 169
 170 static void release_tmps( struct brw_vs_compile *c )
 171 {
 172    c->last_tmp = c->first_tmp;
 173 }
 174
 175
 176 static void unalias1( struct brw_vs_compile *c,
 177                       struct brw_reg dst,
 178                       struct brw_reg arg0,
 179                       void (*func)( struct brw_vs_compile *,
 180                                     struct brw_reg,
 181                                     struct brw_reg ))
 182 {
 183    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 184       struct brw_compile *p = &c->func;
 185       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 186       func(c, tmp, arg0);
 187       brw_MOV(p, dst, tmp);
 188    }
 189    else {
 190       func(c, dst, arg0);
 191    }
 192 }
 193
 194 static void unalias2( struct brw_vs_compile *c,
 195                       struct brw_reg dst,
 196                       struct brw_reg arg0,
 197                       struct brw_reg arg1,
 198                       void (*func)( struct brw_vs_compile *,
 199                                     struct brw_reg,
 200                                     struct brw_reg,
 201                                     struct brw_reg ))
 202 {
 203    if ((dst.file == arg0.file && dst.nr == arg0.nr) &&
 204        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 205       struct brw_compile *p = &c->func;
 206       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 207       func(c, tmp, arg0, arg1);
 208       brw_MOV(p, dst, tmp);
 209    }
 210    else {
 211       func(c, dst, arg0, arg1);
 212    }
 213 }
 214
 215
 216
 217
 218 static void emit_slt( struct brw_compile *p,
 219                       struct brw_reg dst,
 220                       struct brw_reg arg0,
 221                       struct brw_reg arg1 )
 222 {
 223    /* Could be done with an if/else/endif, but this method uses half
 224     * the instructions.  Note that we are careful to reference the
 225     * arguments before writing the dest.  That means we emit the
 226     * instructions in an odd order and have to play with the flag
 227     * values.
 228     */
 229    brw_push_insn_state(p);
 230    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 231
 232    /* Write all values to 1:
 233     */
 234    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 235    brw_MOV(p, dst, brw_imm_f(1.0));
 236
 237    /* Where the test succeeded, overwite with zero:
 238     */
 239    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 240    brw_MOV(p, dst, brw_imm_f(0.0));
 241    brw_pop_insn_state(p);
 242 }
 243
 244
 245 static void emit_sge( struct brw_compile *p,
 246                       struct brw_reg dst,
 247                       struct brw_reg arg0,
 248                       struct brw_reg arg1 )
 249 {
 250    brw_push_insn_state(p);
 251    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 252
 253    /* Write all values to zero:
 254     */
 255    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 256    brw_MOV(p, dst, brw_imm_f(0));
 257
 258    /* Where the test succeeded, overwite with 1:
 259     */
 260    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 261    brw_MOV(p, dst, brw_imm_f(1.0));
 262    brw_pop_insn_state(p);
 263 }
 264
 265
 266 static void emit_max( struct brw_compile *p,
 267                       struct brw_reg dst,
 268                       struct brw_reg arg0,
 269                       struct brw_reg arg1 )
 270 {
 271    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 272    brw_SEL(p, dst, arg1, arg0);
 273    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 274 }
 275
 276 static void emit_min( struct brw_compile *p,
 277                       struct brw_reg dst,
 278                       struct brw_reg arg0,
 279                       struct brw_reg arg1 )
 280 {
 281    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 282    brw_SEL(p, dst, arg0, arg1);
 283    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 284 }
 285
 286
 287 static void emit_math1( struct brw_vs_compile *c,
 288                         GLuint function,
 289                         struct brw_reg dst,
 290                         struct brw_reg arg0,
 291                         GLuint precision)
 292 {
 293    /* There are various odd behaviours with SEND on the simulator.  In
 294     * addition there are documented issues with the fact that the GEN4
 295     * processor doesn't do dependency control properly on SEND
 296     * results.  So, on balance, this kludge to get around failures
 297     * with writemasked math results looks like it might be necessary
 298     * whether that turns out to be a simulator bug or not:
 299     */
 300    struct brw_compile *p = &c->func;
 301    struct brw_reg tmp = dst;
 302    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 303                          dst.file != BRW_GENERAL_REGISTER_FILE);
 304
 305    if (need_tmp)
 306       tmp = get_tmp(c);
 307
 308    brw_math(p,
 309             tmp,
 310             function,
 311             BRW_MATH_SATURATE_NONE,
 312             2,
 313             arg0,
 314             BRW_MATH_DATA_SCALAR,
 315             precision);
 316
 317    if (need_tmp) {
 318       brw_MOV(p, dst, tmp);
 319       release_tmp(c, tmp);
 320    }
 321 }
 322
 323 static void emit_math2( struct brw_vs_compile *c,
 324                         GLuint function,
 325                         struct brw_reg dst,
 326                         struct brw_reg arg0,
 327                         struct brw_reg arg1,
 328                         GLuint precision)
 329 {
 330    struct brw_compile *p = &c->func;
 331    struct brw_reg tmp = dst;
 332    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 333                          dst.file != BRW_GENERAL_REGISTER_FILE);
 334
 335    if (need_tmp)
 336       tmp = get_tmp(c);
 337
 338    brw_MOV(p, brw_message_reg(3), arg1);
 339
 340    brw_math(p,
 341             tmp,
 342             function,
 343             BRW_MATH_SATURATE_NONE,
 344             2,
 345             arg0,
 346             BRW_MATH_DATA_SCALAR,
 347             precision);
 348
 349    if (need_tmp) {
 350       brw_MOV(p, dst, tmp);
 351       release_tmp(c, tmp);
 352    }
 353 }
 354
 355
 356
 357 static void emit_exp_noalias( struct brw_vs_compile *c,
 358                               struct brw_reg dst,
 359                               struct brw_reg arg0 )
 360 {
 361    struct brw_compile *p = &c->func;
 362
 363
 364    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 365       struct brw_reg tmp = get_tmp(c);
 366       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 367
 368       /* tmp_d = floor(arg0.x) */
 369       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 370
 371       /* result[0] = 2.0 ^ tmp */
 372
 373       /* Adjust exponent for floating point:
 374        * exp += 127
 375        */
 376       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 377
 378       /* Install exponent and sign.
 379        * Excess drops off the edge:
 380        */
 381       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 382               tmp_d, brw_imm_d(23));
 383
 384       release_tmp(c, tmp);
 385    }
 386
 387    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 388       /* result[1] = arg0.x - floor(arg0.x) */
 389       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 390    }
 391
 392    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 393       /* As with the LOG instruction, we might be better off just
 394        * doing a taylor expansion here, seeing as we have to do all
 395        * the prep work.
 396        *
 397        * If mathbox partial precision is too low, consider also:
 398        * result[3] = result[0] * EXP(result[1])
 399        */
 400       emit_math1(c,
 401                  BRW_MATH_FUNCTION_EXP,
 402                  brw_writemask(dst, WRITEMASK_Z),
 403                  brw_swizzle1(arg0, 0),
 404                  BRW_MATH_PRECISION_PARTIAL);
 405    }
 406
 407    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 408       /* result[3] = 1.0; */
 409       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 410    }
 411 }
 412
 413
 414 static void emit_log_noalias( struct brw_vs_compile *c,
 415                               struct brw_reg dst,
 416                               struct brw_reg arg0 )
 417 {
 418    struct brw_compile *p = &c->func;
 419    struct brw_reg dst_ud = retype(dst, BRW_REGISTER_TYPE_UD);
 420    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 421
 422    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 423     * according to spec:
 424     *
 425     * These almost look likey they could be joined up, but not really
 426     * practical:
 427     *
 428     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 429     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 430     */
 431    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 432       brw_AND(p,
 433               brw_writemask(dst_ud, WRITEMASK_X),
 434               brw_swizzle1(arg0_ud, 0),
 435               brw_imm_ud((1U<<31)-1));
 436
 437       brw_SHR(p,
 438               brw_writemask(dst_ud, WRITEMASK_X),
 439               dst_ud,
 440               brw_imm_ud(23));
 441
 442       brw_ADD(p,
 443               brw_writemask(dst, WRITEMASK_X),
 444               retype(dst_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 445               brw_imm_d(-127));
 446    }
 447
 448    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 449       brw_AND(p,
 450               brw_writemask(dst_ud, WRITEMASK_Y),
 451               brw_swizzle1(arg0_ud, 0),
 452               brw_imm_ud((1<<23)-1));
 453
 454       brw_OR(p,
 455              brw_writemask(dst_ud, WRITEMASK_Y),
 456              dst_ud,
 457              brw_imm_ud(127<<23));
 458    }
 459
 460    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 461       /* result[2] = result[0] + LOG2(result[1]); */
 462
 463       /* Why bother?  The above is just a hint how to do this with a
 464        * taylor series.  Maybe we *should* use a taylor series as by
 465        * the time all the above has been done it's almost certainly
 466        * quicker than calling the mathbox, even with low precision.
 467        *
 468        * Options are:
 469        *    - result[0] + mathbox.LOG2(result[1])
 470        *    - mathbox.LOG2(arg0.x)
 471        *    - result[0] + inline_taylor_approx(result[1])
 472        */
 473       emit_math1(c,
 474                  BRW_MATH_FUNCTION_LOG,
 475                  brw_writemask(dst, WRITEMASK_Z),
 476                  brw_swizzle1(dst, 1),
 477                  BRW_MATH_PRECISION_FULL);
 478
 479       brw_ADD(p,
 480               brw_writemask(dst, WRITEMASK_Z),
 481               brw_swizzle1(dst, 2),
 482               brw_swizzle1(dst, 0));
 483    }
 484
 485    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 486       /* result[3] = 1.0; */
 487       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 488    }
 489 }
 490
 491
 492
 493
 494 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 495  */
 496 static void emit_dst_noalias( struct brw_vs_compile *c,
 497                               struct brw_reg dst,
 498                               struct brw_reg arg0,
 499                               struct brw_reg arg1)
 500 {
 501    struct brw_compile *p = &c->func;
 502
 503    /* There must be a better way to do this:
 504     */
 505    if (dst.dw1.bits.writemask & WRITEMASK_X)
 506       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 507    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 508       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 509    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 510       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 511    if (dst.dw1.bits.writemask & WRITEMASK_W)
 512       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 513 }
 514
 515 static void emit_xpd( struct brw_compile *p,
 516                       struct brw_reg dst,
 517                       struct brw_reg t,
 518                       struct brw_reg u)
 519 {
 520    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 521    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 522 }
 523
 524
 525
 526 static void emit_lit_noalias( struct brw_vs_compile *c,
 527                               struct brw_reg dst,
 528                               struct brw_reg arg0 )
 529 {
 530    struct brw_compile *p = &c->func;
 531    struct brw_instruction *if_insn;
 532    struct brw_reg tmp = dst;
 533    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 534
 535    if (need_tmp)
 536       tmp = get_tmp(c);
 537
 538    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 539    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 540
 541    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 542     * to get all channels active inside the IF.  In the clipping code
 543     * we run with NoMask, so it's not an option and we can use
 544     * BRW_EXECUTE_1 for all comparisions.
 545     */
 546    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 547    if_insn = brw_IF(p, BRW_EXECUTE_8);
 548    {
 549       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 550
 551       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 552       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 553       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 554
 555       emit_math2(c,
 556                  BRW_MATH_FUNCTION_POW,
 557                  brw_writemask(dst, WRITEMASK_Z),
 558                  brw_swizzle1(tmp, 2),
 559                  brw_swizzle1(arg0, 3),
 560                  BRW_MATH_PRECISION_PARTIAL);
 561    }
 562
 563    brw_ENDIF(p, if_insn);
 564 }
 565
 566
 567
 568
 569
 570 /* TODO: relative addressing!
 571  */
 572 static struct brw_reg get_reg( struct brw_vs_compile *c,
 573                                GLuint file,
 574                                GLuint index )
 575 {
 576
 577    switch (file) {
 578    case PROGRAM_TEMPORARY:
 579    case PROGRAM_INPUT:
 580    case PROGRAM_OUTPUT:
 581    case PROGRAM_STATE_VAR:
 582       assert(c->regs[file][index].nr != 0);
 583       return c->regs[file][index];
 584    case PROGRAM_ADDRESS:
 585       assert(index == 0);
 586       return c->regs[file][index];
 587
 588    case PROGRAM_UNDEFINED:                      /* undef values */
 589       return brw_null_reg();
 590
 591    case PROGRAM_LOCAL_PARAM:
 592    case PROGRAM_ENV_PARAM:
 593    case PROGRAM_WRITE_ONLY:
 594    default:
 595       assert(0);
 596       return brw_null_reg();
 597    }
 598 }
 599
 600
 601
 602 static struct brw_reg deref( struct brw_vs_compile *c,
 603                              struct brw_reg arg,
 604                              GLint offset)
 605 {
 606    struct brw_compile *p = &c->func;
 607    struct brw_reg tmp = vec4(get_tmp(c));
 608    struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
 609    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 610    struct brw_reg indirect = brw_vec4_indirect(0,0);
 611
 612    {
 613       brw_push_insn_state(p);
 614       brw_set_access_mode(p, BRW_ALIGN_1);
 615
 616       /* This is pretty clunky - load the address register twice and
 617        * fetch each 4-dword value in turn.  There must be a way to do
 618        * this in a single pass, but I couldn't get it to work.
 619        */
 620       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 621       brw_MOV(p, tmp, indirect);
 622
 623       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 624       brw_MOV(p, suboffset(tmp, 4), indirect);
 625
 626       brw_pop_insn_state(p);
 627    }
 628
 629    return vec8(tmp);
 630 }
 631
 632
 633 static void emit_arl( struct brw_vs_compile *c,
 634                       struct brw_reg dst,
 635                       struct brw_reg arg0 )
 636 {
 637    struct brw_compile *p = &c->func;
 638
 639    brw_RNDD(p, dst, arg0);
 640
 641    brw_MUL(p, dst, dst, brw_imm_d(16));
 642 }
 643
 644
 645 /* Will return mangled results for SWZ op.  The emit_swz() function
 646  * ignores this result and recalculates taking extended swizzles into
 647  * account.
 648  */
 649 static struct brw_reg get_arg( struct brw_vs_compile *c,
 650                                struct prog_src_register src )
 651 {
 652    struct brw_reg reg;
 653
 654    if (src.File == PROGRAM_UNDEFINED)
 655       return brw_null_reg();
 656
 657    if (src.RelAddr)
 658       reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 659    else
 660       reg = get_reg(c, src.File, src.Index);
 661
 662    /* Convert 3-bit swizzle to 2-bit.
 663     */
 664    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src.Swizzle, 0),
 665                                        GET_SWZ(src.Swizzle, 1),
 666                                        GET_SWZ(src.Swizzle, 2),
 667                                        GET_SWZ(src.Swizzle, 3));
 668
 669    /* Note this is ok for non-swizzle instructions:
 670     */
 671    reg.negate = src.NegateBase ? 1 : 0;
 672
 673    return reg;
 674 }
 675
 676
 677 static struct brw_reg get_dst( struct brw_vs_compile *c,
 678                                struct prog_dst_register dst )
 679 {
 680    struct brw_reg reg = get_reg(c, dst.File, dst.Index);
 681
 682    reg.dw1.bits.writemask = dst.WriteMask;
 683
 684    return reg;
 685 }
 686
 687
 688
 689
 690 static void emit_swz( struct brw_vs_compile *c,
 691                       struct brw_reg dst,
 692                       struct prog_src_register src )
 693 {
 694    struct brw_compile *p = &c->func;
 695    GLuint zeros_mask = 0;
 696    GLuint ones_mask = 0;
 697    GLuint src_mask = 0;
 698    GLubyte src_swz[4];
 699    GLboolean need_tmp = (src.NegateBase &&
 700                          dst.file != BRW_GENERAL_REGISTER_FILE);
 701    struct brw_reg tmp = dst;
 702    GLuint i;
 703
 704    if (need_tmp)
 705       tmp = get_tmp(c);
 706
 707    for (i = 0; i < 4; i++) {
 708       if (dst.dw1.bits.writemask & (1<<i)) {
 709          GLubyte s = GET_SWZ(src.Swizzle, i);
 710          switch (s) {
 711          case SWIZZLE_X:
 712          case SWIZZLE_Y:
 713          case SWIZZLE_Z:
 714          case SWIZZLE_W:
 715             src_mask |= 1<<i;
 716             src_swz[i] = s;
 717             break;
 718          case SWIZZLE_ZERO:
 719             zeros_mask |= 1<<i;
 720             break;
 721          case SWIZZLE_ONE:
 722             ones_mask |= 1<<i;
 723             break;
 724          }
 725       }
 726    }
 727
 728    /* Do src first, in case dst aliases src:
 729     */
 730    if (src_mask) {
 731       struct brw_reg arg0;
 732
 733       if (src.RelAddr)
 734          arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 735       else
 736          arg0 = get_reg(c, src.File, src.Index);
 737
 738       arg0 = brw_swizzle(arg0,
 739                          src_swz[0], src_swz[1],
 740                          src_swz[2], src_swz[3]);
 741
 742       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
 743    }
 744
 745    if (zeros_mask)
 746       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
 747
 748    if (ones_mask)
 749       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 750
 751    if (src.NegateBase)
 752       brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
 753
 754    if (need_tmp) {
 755       brw_MOV(p, dst, tmp);
 756       release_tmp(c, tmp);
 757    }
 758 }
 759
 760
 761
 762 /* Post-vertex-program processing.  Send the results to the URB.
 763  */
 764 static void emit_vertex_write( struct brw_vs_compile *c)
 765 {
 766    struct brw_compile *p = &c->func;
 767    struct brw_reg m0 = brw_message_reg(0);
 768    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
 769    struct brw_reg ndc;
 770
 771    if (c->key.copy_edgeflag) {
 772       brw_MOV(p,
 773               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
 774               get_reg(c, PROGRAM_INPUT, BRW_ATTRIB_EDGEFLAG));
 775    }
 776
 777
 778    /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
 779     */
 780    ndc = get_tmp(c);
 781    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
 782    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
 783
 784    /* This includes the workaround for -ve rhw, so is no longer an
 785     * optional step:
 786     */
 787    {
 788       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
 789       GLuint i;
 790
 791       brw_MOV(p, header1, brw_imm_ud(0));
 792
 793       brw_set_access_mode(p, BRW_ALIGN_16);
 794
 795       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
 796          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 797          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 798          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
 799       }
 800
 801
 802       for (i = 0; i < c->key.nr_userclip; i++) {
 803          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 804          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
 805          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
 806          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 807       }
 808
 809
 810       /* i965 clipping workaround:
 811        * 1) Test for -ve rhw
 812        * 2) If set,
 813        *      set ndc = (0,0,0,0)
 814        *      set ucp[6] = 1
 815        *
 816        * Later, clipping will detect ucp[6] and ensure the primitive is
 817        * clipped against all fixed planes.
 818        */
 819       brw_CMP(p,
 820               vec8(brw_null_reg()),
 821               BRW_CONDITIONAL_L,
 822               brw_swizzle1(ndc, 3),
 823               brw_imm_f(0));
 824
 825       brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
 826       brw_MOV(p, ndc, brw_imm_f(0));
 827       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 828
 829
 830
 831
 832
 833
 834       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
 835       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
 836       brw_set_access_mode(p, BRW_ALIGN_16);
 837
 838       release_tmp(c, header1);
 839    }
 840
 841
 842    /* Emit the (interleaved) headers for the two vertices - an 8-reg
 843     * of zeros followed by two sets of NDC coordinates:
 844     */
 845    brw_set_access_mode(p, BRW_ALIGN_1);
 846    brw_MOV(p, offset(m0, 2), ndc);
 847    brw_MOV(p, offset(m0, 3), pos);
 848
 849
 850    brw_urb_WRITE(p,
 851                  brw_null_reg(), /* dest */
 852                  0,             /* starting mrf reg nr */
 853                  c->r0,         /* src */
 854                  0,             /* allocate */
 855                  1,             /* used */
 856                  c->nr_outputs + 3, /* msg len */
 857                  0,             /* response len */
 858                  1,             /* eot */
 859                  1,             /* writes complete */
 860                  0,             /* urb destination offset */
 861                  BRW_URB_SWIZZLE_INTERLEAVE);
 862
 863 }
 864
 865
 866
 867
 868 /* Emit the fragment program instructions here.
 869  */
 870 void brw_vs_emit( struct brw_vs_compile *c )
 871 {
 872    struct brw_compile *p = &c->func;
 873    GLuint nr_insns = c->vp->program.Base.NumInstructions;
 874    GLuint insn;
 875
 876    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 877    brw_set_access_mode(p, BRW_ALIGN_16);
 878
 879    /* Static register allocation
 880     */
 881    brw_vs_alloc_regs(c);
 882
 883    for (insn = 0; insn < nr_insns; insn++) {
 884
 885       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
 886       struct brw_reg args[3], dst;
 887       GLuint i;
 888
 889       /* Get argument regs.  SWZ is special and does this itself.
 890        */
 891       if (inst->Opcode != OPCODE_SWZ)
 892          for (i = 0; i < 3; i++)
 893             args[i] = get_arg(c, inst->SrcReg[i]);
 894
 895       /* Get dest regs.  Note that it is possible for a reg to be both
 896        * dst and arg, given the static allocation of registers.  So
 897        * care needs to be taken emitting multi-operation instructions.
 898        */
 899       dst = get_dst(c, inst->DstReg);
 900
 901
 902       switch (inst->Opcode) {
 903       case OPCODE_ABS:
 904          brw_MOV(p, dst, brw_abs(args[0]));
 905          break;
 906       case OPCODE_ADD:
 907          brw_ADD(p, dst, args[0], args[1]);
 908          break;
 909       case OPCODE_DP3:
 910          brw_DP3(p, dst, args[0], args[1]);
 911          break;
 912       case OPCODE_DP4:
 913          brw_DP4(p, dst, args[0], args[1]);
 914          break;
 915       case OPCODE_DPH:
 916          brw_DPH(p, dst, args[0], args[1]);
 917          break;
 918       case OPCODE_DST:
 919          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
 920          break;
 921       case OPCODE_EXP:
 922          unalias1(c, dst, args[0], emit_exp_noalias);
 923          break;
 924       case OPCODE_EX2:
 925          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 926          break;
 927       case OPCODE_ARL:
 928          emit_arl(c, dst, args[0]);
 929          break;
 930       case OPCODE_FLR:
 931          brw_RNDD(p, dst, args[0]);
 932          break;
 933       case OPCODE_FRC:
 934          brw_FRC(p, dst, args[0]);
 935          break;
 936       case OPCODE_LOG:
 937          unalias1(c, dst, args[0], emit_log_noalias);
 938          break;
 939       case OPCODE_LG2:
 940          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 941          break;
 942       case OPCODE_LIT:
 943          unalias1(c, dst, args[0], emit_lit_noalias);
 944          break;
 945       case OPCODE_MAD:
 946          brw_MOV(p, brw_acc_reg(), args[2]);
 947          brw_MAC(p, dst, args[0], args[1]);
 948          break;
 949       case OPCODE_MAX:
 950          emit_max(p, dst, args[0], args[1]);
 951          break;
 952       case OPCODE_MIN:
 953          emit_min(p, dst, args[0], args[1]);
 954          break;
 955       case OPCODE_MOV:
 956          brw_MOV(p, dst, args[0]);
 957          break;
 958       case OPCODE_MUL:
 959          brw_MUL(p, dst, args[0], args[1]);
 960          break;
 961       case OPCODE_POW:
 962          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
 963          break;
 964       case OPCODE_RCP:
 965          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 966          break;
 967       case OPCODE_RSQ:
 968          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
 969          break;
 970       case OPCODE_SGE:
 971          emit_sge(p, dst, args[0], args[1]);
 972          break;
 973       case OPCODE_SLT:
 974          emit_slt(p, dst, args[0], args[1]);
 975          break;
 976       case OPCODE_SUB:
 977          brw_ADD(p, dst, args[0], negate(args[1]));
 978          break;
 979       case OPCODE_SWZ:
 980          /* The args[0] value can't be used here as it won't have
 981           * correctly encoded the full swizzle:
 982           */
 983          emit_swz(c, dst, inst->SrcReg[0] );
 984          break;
 985       case OPCODE_XPD:
 986          emit_xpd(p, dst, args[0], args[1]);
 987          break;
 988       case OPCODE_END:
 989       case OPCODE_PRINT:
 990          break;
 991       default:
 992          break;
 993       }
 994
 995       release_tmps(c);
 996    }
 997
 998    emit_vertex_write(c);
 999
1000 }
1001
1002
1003
1004
1005