src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "program.h"
  34 #include "macros.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41
  42 /* Do things as simply as possible.  Allocate and populate all regs
  43  * ahead of time.
  44  */
  45 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  46 {
  47    GLuint i, reg = 0, mrf;
  48    GLuint nr_params;
  49
  50    /* r0 -- reserved as usual
  51     */
  52    c->r0 = brw_vec8_grf(reg, 0); reg++;
  53
  54    /* User clip planes from curbe:
  55     */
  56    if (c->key.nr_userclip) {
  57       for (i = 0; i < c->key.nr_userclip; i++) {
  58          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  59       }
  60
  61       /* Deal with curbe alignment:
  62        */
  63       reg += ((6+c->key.nr_userclip+3)/4)*2;
  64    }
  65
  66    /* Vertex program parameters from curbe:
  67     */
  68    nr_params = c->vp->program.Base.Parameters->NumParameters;
  69    for (i = 0; i < nr_params; i++) {
  70       c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
  71    }
  72    reg += (nr_params+1)/2;
  73
  74    c->prog_data.curb_read_length = reg - 1;
  75
  76
  77
  78    /* Allocate input regs:
  79     */
  80    c->nr_inputs = 0;
  81    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
  82       if (c->prog_data.inputs_read & (1<<i)) {
  83          c->nr_inputs++;
  84          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
  85          reg++;
  86       }
  87    }
  88
  89
  90    /* Allocate outputs: TODO: could organize the non-position outputs
  91     * to go straight into message regs.
  92     */
  93    c->nr_outputs = 0;
  94    c->first_output = reg;
  95    mrf = 4;
  96    for (i = 0; i < VERT_RESULT_MAX; i++) {
  97       if (c->prog_data.outputs_written & (1<<i)) {
  98          c->nr_outputs++;
  99          if (i == VERT_RESULT_HPOS) {
 100             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 101             reg++;
 102          }
 103          else if (i == VERT_RESULT_PSIZ) {
 104             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 105             reg++;
 106             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 107          }
 108          else {
 109             c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 110             mrf++;
 111          }
 112       }
 113    }
 114
 115    /* Allocate program temporaries:
 116     */
 117    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 118       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 119       reg++;
 120    }
 121
 122    /* Address reg(s).  Don't try to use the internal address reg until
 123     * deref time.
 124     */
 125    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 126       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 127                                              reg,
 128                                              0,
 129                                              BRW_REGISTER_TYPE_D,
 130                                              BRW_VERTICAL_STRIDE_8,
 131                                              BRW_WIDTH_8,
 132                                              BRW_HORIZONTAL_STRIDE_1,
 133                                              BRW_SWIZZLE_XXXX,
 134                                              WRITEMASK_X);
 135       reg++;
 136    }
 137
 138
 139    /* Some opcodes need an internal temporary:
 140     */
 141    c->first_tmp = reg;
 142    c->last_tmp = reg;           /* for allocation purposes */
 143
 144    /* Each input reg holds data from two vertices.  The
 145     * urb_read_length is the number of registers read from *each*
 146     * vertex urb, so is half the amount:
 147     */
 148    c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
 149
 150    c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
 151    c->prog_data.total_grf = reg;
 152 }
 153
 154
 155 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 156 {
 157    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
 158
 159    if (++c->last_tmp > c->prog_data.total_grf)
 160       c->prog_data.total_grf = c->last_tmp;
 161
 162    return tmp;
 163 }
 164
 165 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
 166 {
 167    if (tmp.nr == c->last_tmp-1)
 168       c->last_tmp--;
 169 }
 170
 171 static void release_tmps( struct brw_vs_compile *c )
 172 {
 173    c->last_tmp = c->first_tmp;
 174 }
 175
 176
 177 static void unalias1( struct brw_vs_compile *c,
 178                       struct brw_reg dst,
 179                       struct brw_reg arg0,
 180                       void (*func)( struct brw_vs_compile *,
 181                                     struct brw_reg,
 182                                     struct brw_reg ))
 183 {
 184    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 185       struct brw_compile *p = &c->func;
 186       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 187       func(c, tmp, arg0);
 188       brw_MOV(p, dst, tmp);
 189    }
 190    else {
 191       func(c, dst, arg0);
 192    }
 193 }
 194
 195 static void unalias2( struct brw_vs_compile *c,
 196                       struct brw_reg dst,
 197                       struct brw_reg arg0,
 198                       struct brw_reg arg1,
 199                       void (*func)( struct brw_vs_compile *,
 200                                     struct brw_reg,
 201                                     struct brw_reg,
 202                                     struct brw_reg ))
 203 {
 204    if ((dst.file == arg0.file && dst.nr == arg0.nr) &&
 205        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 206       struct brw_compile *p = &c->func;
 207       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 208       func(c, tmp, arg0, arg1);
 209       brw_MOV(p, dst, tmp);
 210    }
 211    else {
 212       func(c, dst, arg0, arg1);
 213    }
 214 }
 215
 216
 217
 218
 219 static void emit_slt( struct brw_compile *p,
 220                       struct brw_reg dst,
 221                       struct brw_reg arg0,
 222                       struct brw_reg arg1 )
 223 {
 224    /* Could be done with an if/else/endif, but this method uses half
 225     * the instructions.  Note that we are careful to reference the
 226     * arguments before writing the dest.  That means we emit the
 227     * instructions in an odd order and have to play with the flag
 228     * values.
 229     */
 230    brw_push_insn_state(p);
 231    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 232
 233    /* Write all values to 1:
 234     */
 235    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 236    brw_MOV(p, dst, brw_imm_f(1.0));
 237
 238    /* Where the test succeeded, overwite with zero:
 239     */
 240    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 241    brw_MOV(p, dst, brw_imm_f(0.0));
 242    brw_pop_insn_state(p);
 243 }
 244
 245
 246 static void emit_sge( struct brw_compile *p,
 247                       struct brw_reg dst,
 248                       struct brw_reg arg0,
 249                       struct brw_reg arg1 )
 250 {
 251    brw_push_insn_state(p);
 252    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 253
 254    /* Write all values to zero:
 255     */
 256    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 257    brw_MOV(p, dst, brw_imm_f(0));
 258
 259    /* Where the test succeeded, overwite with 1:
 260     */
 261    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 262    brw_MOV(p, dst, brw_imm_f(1.0));
 263    brw_pop_insn_state(p);
 264 }
 265
 266
 267 static void emit_max( struct brw_compile *p,
 268                       struct brw_reg dst,
 269                       struct brw_reg arg0,
 270                       struct brw_reg arg1 )
 271 {
 272    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 273    brw_SEL(p, dst, arg1, arg0);
 274    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 275 }
 276
 277 static void emit_min( struct brw_compile *p,
 278                       struct brw_reg dst,
 279                       struct brw_reg arg0,
 280                       struct brw_reg arg1 )
 281 {
 282    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 283    brw_SEL(p, dst, arg0, arg1);
 284    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 285 }
 286
 287
 288 static void emit_math1( struct brw_vs_compile *c,
 289                         GLuint function,
 290                         struct brw_reg dst,
 291                         struct brw_reg arg0,
 292                         GLuint precision)
 293 {
 294    /* There are various odd behaviours with SEND on the simulator.  In
 295     * addition there are documented issues with the fact that the GEN4
 296     * processor doesn't do dependency control properly on SEND
 297     * results.  So, on balance, this kludge to get around failures
 298     * with writemasked math results looks like it might be necessary
 299     * whether that turns out to be a simulator bug or not:
 300     */
 301    struct brw_compile *p = &c->func;
 302    struct brw_reg tmp = dst;
 303    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 304                          dst.file != BRW_GENERAL_REGISTER_FILE);
 305
 306    if (need_tmp)
 307       tmp = get_tmp(c);
 308
 309    brw_math(p,
 310             tmp,
 311             function,
 312             BRW_MATH_SATURATE_NONE,
 313             2,
 314             arg0,
 315             BRW_MATH_DATA_SCALAR,
 316             precision);
 317
 318    if (need_tmp) {
 319       brw_MOV(p, dst, tmp);
 320       release_tmp(c, tmp);
 321    }
 322 }
 323
 324 static void emit_math2( struct brw_vs_compile *c,
 325                         GLuint function,
 326                         struct brw_reg dst,
 327                         struct brw_reg arg0,
 328                         struct brw_reg arg1,
 329                         GLuint precision)
 330 {
 331    struct brw_compile *p = &c->func;
 332    struct brw_reg tmp = dst;
 333    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 334                          dst.file != BRW_GENERAL_REGISTER_FILE);
 335
 336    if (need_tmp)
 337       tmp = get_tmp(c);
 338
 339    brw_MOV(p, brw_message_reg(3), arg1);
 340
 341    brw_math(p,
 342             tmp,
 343             function,
 344             BRW_MATH_SATURATE_NONE,
 345             2,
 346             arg0,
 347             BRW_MATH_DATA_SCALAR,
 348             precision);
 349
 350    if (need_tmp) {
 351       brw_MOV(p, dst, tmp);
 352       release_tmp(c, tmp);
 353    }
 354 }
 355
 356
 357
 358 static void emit_exp_noalias( struct brw_vs_compile *c,
 359                               struct brw_reg dst,
 360                               struct brw_reg arg0 )
 361 {
 362    struct brw_compile *p = &c->func;
 363
 364
 365    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 366       struct brw_reg tmp = get_tmp(c);
 367       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 368
 369       /* tmp_d = floor(arg0.x) */
 370       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 371
 372       /* result[0] = 2.0 ^ tmp */
 373
 374       /* Adjust exponent for floating point:
 375        * exp += 127
 376        */
 377       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 378
 379       /* Install exponent and sign.
 380        * Excess drops off the edge:
 381        */
 382       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 383               tmp_d, brw_imm_d(23));
 384
 385       release_tmp(c, tmp);
 386    }
 387
 388    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 389       /* result[1] = arg0.x - floor(arg0.x) */
 390       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 391    }
 392
 393    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 394       /* As with the LOG instruction, we might be better off just
 395        * doing a taylor expansion here, seeing as we have to do all
 396        * the prep work.
 397        *
 398        * If mathbox partial precision is too low, consider also:
 399        * result[3] = result[0] * EXP(result[1])
 400        */
 401       emit_math1(c,
 402                  BRW_MATH_FUNCTION_EXP,
 403                  brw_writemask(dst, WRITEMASK_Z),
 404                  brw_swizzle1(arg0, 0),
 405                  BRW_MATH_PRECISION_PARTIAL);
 406    }
 407
 408    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 409       /* result[3] = 1.0; */
 410       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 411    }
 412 }
 413
 414
 415 static void emit_log_noalias( struct brw_vs_compile *c,
 416                               struct brw_reg dst,
 417                               struct brw_reg arg0 )
 418 {
 419    struct brw_compile *p = &c->func;
 420    struct brw_reg tmp = dst;
 421    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 422    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 423    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 424                          dst.file != BRW_GENERAL_REGISTER_FILE);
 425
 426    if (need_tmp) {
 427       tmp = get_tmp(c);
 428       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 429    }
 430
 431    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 432     * according to spec:
 433     *
 434     * These almost look likey they could be joined up, but not really
 435     * practical:
 436     *
 437     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 438     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 439     */
 440    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 441       brw_AND(p,
 442               brw_writemask(tmp_ud, WRITEMASK_X),
 443               brw_swizzle1(arg0_ud, 0),
 444               brw_imm_ud((1U<<31)-1));
 445
 446       brw_SHR(p,
 447               brw_writemask(tmp_ud, WRITEMASK_X),
 448               tmp_ud,
 449               brw_imm_ud(23));
 450
 451       brw_ADD(p,
 452               brw_writemask(tmp, WRITEMASK_X),
 453               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 454               brw_imm_d(-127));
 455    }
 456
 457    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 458       brw_AND(p,
 459               brw_writemask(tmp_ud, WRITEMASK_Y),
 460               brw_swizzle1(arg0_ud, 0),
 461               brw_imm_ud((1<<23)-1));
 462
 463       brw_OR(p,
 464              brw_writemask(tmp_ud, WRITEMASK_Y),
 465              tmp_ud,
 466              brw_imm_ud(127<<23));
 467    }
 468
 469    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 470       /* result[2] = result[0] + LOG2(result[1]); */
 471
 472       /* Why bother?  The above is just a hint how to do this with a
 473        * taylor series.  Maybe we *should* use a taylor series as by
 474        * the time all the above has been done it's almost certainly
 475        * quicker than calling the mathbox, even with low precision.
 476        *
 477        * Options are:
 478        *    - result[0] + mathbox.LOG2(result[1])
 479        *    - mathbox.LOG2(arg0.x)
 480        *    - result[0] + inline_taylor_approx(result[1])
 481        */
 482       emit_math1(c,
 483                  BRW_MATH_FUNCTION_LOG,
 484                  brw_writemask(tmp, WRITEMASK_Z),
 485                  brw_swizzle1(tmp, 1),
 486                  BRW_MATH_PRECISION_FULL);
 487
 488       brw_ADD(p,
 489               brw_writemask(tmp, WRITEMASK_Z),
 490               brw_swizzle1(tmp, 2),
 491               brw_swizzle1(tmp, 0));
 492    }
 493
 494    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 495       /* result[3] = 1.0; */
 496       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 497    }
 498
 499    if (need_tmp) {
 500       brw_MOV(p, dst, tmp);
 501       release_tmp(c, tmp);
 502    }
 503 }
 504
 505
 506
 507
 508 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 509  */
 510 static void emit_dst_noalias( struct brw_vs_compile *c,
 511                               struct brw_reg dst,
 512                               struct brw_reg arg0,
 513                               struct brw_reg arg1)
 514 {
 515    struct brw_compile *p = &c->func;
 516
 517    /* There must be a better way to do this:
 518     */
 519    if (dst.dw1.bits.writemask & WRITEMASK_X)
 520       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 521    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 522       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 523    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 524       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 525    if (dst.dw1.bits.writemask & WRITEMASK_W)
 526       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 527 }
 528
 529 static void emit_xpd( struct brw_compile *p,
 530                       struct brw_reg dst,
 531                       struct brw_reg t,
 532                       struct brw_reg u)
 533 {
 534    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 535    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 536 }
 537
 538
 539
 540 static void emit_lit_noalias( struct brw_vs_compile *c,
 541                               struct brw_reg dst,
 542                               struct brw_reg arg0 )
 543 {
 544    struct brw_compile *p = &c->func;
 545    struct brw_instruction *if_insn;
 546    struct brw_reg tmp = dst;
 547    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 548
 549    if (need_tmp)
 550       tmp = get_tmp(c);
 551
 552    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 553    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 554
 555    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 556     * to get all channels active inside the IF.  In the clipping code
 557     * we run with NoMask, so it's not an option and we can use
 558     * BRW_EXECUTE_1 for all comparisions.
 559     */
 560    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 561    if_insn = brw_IF(p, BRW_EXECUTE_8);
 562    {
 563       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 564
 565       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 566       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 567       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 568
 569       emit_math2(c,
 570                  BRW_MATH_FUNCTION_POW,
 571                  brw_writemask(dst, WRITEMASK_Z),
 572                  brw_swizzle1(tmp, 2),
 573                  brw_swizzle1(arg0, 3),
 574                  BRW_MATH_PRECISION_PARTIAL);
 575    }
 576
 577    brw_ENDIF(p, if_insn);
 578 }
 579
 580
 581
 582
 583
 584 /* TODO: relative addressing!
 585  */
 586 static struct brw_reg get_reg( struct brw_vs_compile *c,
 587                                GLuint file,
 588                                GLuint index )
 589 {
 590
 591    switch (file) {
 592    case PROGRAM_TEMPORARY:
 593    case PROGRAM_INPUT:
 594    case PROGRAM_OUTPUT:
 595    case PROGRAM_STATE_VAR:
 596       assert(c->regs[file][index].nr != 0);
 597       return c->regs[file][index];
 598    case PROGRAM_ADDRESS:
 599       assert(index == 0);
 600       return c->regs[file][index];
 601
 602    case PROGRAM_UNDEFINED:                      /* undef values */
 603       return brw_null_reg();
 604
 605    case PROGRAM_LOCAL_PARAM:
 606    case PROGRAM_ENV_PARAM:
 607    case PROGRAM_WRITE_ONLY:
 608    default:
 609       assert(0);
 610       return brw_null_reg();
 611    }
 612 }
 613
 614
 615
 616 static struct brw_reg deref( struct brw_vs_compile *c,
 617                              struct brw_reg arg,
 618                              GLint offset)
 619 {
 620    struct brw_compile *p = &c->func;
 621    struct brw_reg tmp = vec4(get_tmp(c));
 622    struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
 623    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 624    struct brw_reg indirect = brw_vec4_indirect(0,0);
 625
 626    {
 627       brw_push_insn_state(p);
 628       brw_set_access_mode(p, BRW_ALIGN_1);
 629
 630       /* This is pretty clunky - load the address register twice and
 631        * fetch each 4-dword value in turn.  There must be a way to do
 632        * this in a single pass, but I couldn't get it to work.
 633        */
 634       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 635       brw_MOV(p, tmp, indirect);
 636
 637       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 638       brw_MOV(p, suboffset(tmp, 4), indirect);
 639
 640       brw_pop_insn_state(p);
 641    }
 642
 643    return vec8(tmp);
 644 }
 645
 646
 647 static void emit_arl( struct brw_vs_compile *c,
 648                       struct brw_reg dst,
 649                       struct brw_reg arg0 )
 650 {
 651    struct brw_compile *p = &c->func;
 652    struct brw_reg tmp = dst;
 653    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 654
 655    if (need_tmp)
 656       tmp = get_tmp(c);
 657
 658    brw_RNDD(p, tmp, arg0);
 659    brw_MUL(p, dst, tmp, brw_imm_d(16));
 660
 661    if (need_tmp)
 662       release_tmp(c, tmp);
 663 }
 664
 665
 666 /* Will return mangled results for SWZ op.  The emit_swz() function
 667  * ignores this result and recalculates taking extended swizzles into
 668  * account.
 669  */
 670 static struct brw_reg get_arg( struct brw_vs_compile *c,
 671                                struct prog_src_register src )
 672 {
 673    struct brw_reg reg;
 674
 675    if (src.File == PROGRAM_UNDEFINED)
 676       return brw_null_reg();
 677
 678    if (src.RelAddr)
 679       reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 680    else
 681       reg = get_reg(c, src.File, src.Index);
 682
 683    /* Convert 3-bit swizzle to 2-bit.
 684     */
 685    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src.Swizzle, 0),
 686                                        GET_SWZ(src.Swizzle, 1),
 687                                        GET_SWZ(src.Swizzle, 2),
 688                                        GET_SWZ(src.Swizzle, 3));
 689
 690    /* Note this is ok for non-swizzle instructions:
 691     */
 692    reg.negate = src.NegateBase ? 1 : 0;
 693
 694    return reg;
 695 }
 696
 697
 698 static struct brw_reg get_dst( struct brw_vs_compile *c,
 699                                struct prog_dst_register dst )
 700 {
 701    struct brw_reg reg = get_reg(c, dst.File, dst.Index);
 702
 703    reg.dw1.bits.writemask = dst.WriteMask;
 704
 705    return reg;
 706 }
 707
 708
 709
 710
 711 static void emit_swz( struct brw_vs_compile *c,
 712                       struct brw_reg dst,
 713                       struct prog_src_register src )
 714 {
 715    struct brw_compile *p = &c->func;
 716    GLuint zeros_mask = 0;
 717    GLuint ones_mask = 0;
 718    GLuint src_mask = 0;
 719    GLubyte src_swz[4];
 720    GLboolean need_tmp = (src.NegateBase &&
 721                          dst.file != BRW_GENERAL_REGISTER_FILE);
 722    struct brw_reg tmp = dst;
 723    GLuint i;
 724
 725    if (need_tmp)
 726       tmp = get_tmp(c);
 727
 728    for (i = 0; i < 4; i++) {
 729       if (dst.dw1.bits.writemask & (1<<i)) {
 730          GLubyte s = GET_SWZ(src.Swizzle, i);
 731          switch (s) {
 732          case SWIZZLE_X:
 733          case SWIZZLE_Y:
 734          case SWIZZLE_Z:
 735          case SWIZZLE_W:
 736             src_mask |= 1<<i;
 737             src_swz[i] = s;
 738             break;
 739          case SWIZZLE_ZERO:
 740             zeros_mask |= 1<<i;
 741             break;
 742          case SWIZZLE_ONE:
 743             ones_mask |= 1<<i;
 744             break;
 745          }
 746       }
 747    }
 748
 749    /* Do src first, in case dst aliases src:
 750     */
 751    if (src_mask) {
 752       struct brw_reg arg0;
 753
 754       if (src.RelAddr)
 755          arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
 756       else
 757          arg0 = get_reg(c, src.File, src.Index);
 758
 759       arg0 = brw_swizzle(arg0,
 760                          src_swz[0], src_swz[1],
 761                          src_swz[2], src_swz[3]);
 762
 763       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
 764    }
 765
 766    if (zeros_mask)
 767       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
 768
 769    if (ones_mask)
 770       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 771
 772    if (src.NegateBase)
 773       brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
 774
 775    if (need_tmp) {
 776       brw_MOV(p, dst, tmp);
 777       release_tmp(c, tmp);
 778    }
 779 }
 780
 781
 782
 783 /* Post-vertex-program processing.  Send the results to the URB.
 784  */
 785 static void emit_vertex_write( struct brw_vs_compile *c)
 786 {
 787    struct brw_compile *p = &c->func;
 788    struct brw_reg m0 = brw_message_reg(0);
 789    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
 790    struct brw_reg ndc;
 791
 792    if (c->key.copy_edgeflag) {
 793       brw_MOV(p,
 794               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
 795               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
 796    }
 797
 798
 799    /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
 800     */
 801    if (!c->key.know_w_is_one) {
 802       ndc = get_tmp(c);
 803       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
 804       brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
 805    }
 806    else {
 807       ndc = pos;
 808    }
 809
 810    /* This includes the workaround for -ve rhw, so is no longer an
 811     * optional step:
 812     */
 813    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
 814        c->key.nr_userclip ||
 815        !c->key.know_w_is_one)
 816    {
 817       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
 818       GLuint i;
 819
 820       brw_MOV(p, header1, brw_imm_ud(0));
 821
 822       brw_set_access_mode(p, BRW_ALIGN_16);
 823
 824       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
 825          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 826          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 827          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
 828       }
 829
 830
 831       for (i = 0; i < c->key.nr_userclip; i++) {
 832          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 833          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
 834          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
 835          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 836       }
 837
 838
 839       /* i965 clipping workaround:
 840        * 1) Test for -ve rhw
 841        * 2) If set,
 842        *      set ndc = (0,0,0,0)
 843        *      set ucp[6] = 1
 844        *
 845        * Later, clipping will detect ucp[6] and ensure the primitive is
 846        * clipped against all fixed planes.
 847        */
 848       if (!c->key.know_w_is_one) {
 849          brw_CMP(p,
 850                  vec8(brw_null_reg()),
 851                  BRW_CONDITIONAL_L,
 852                  brw_swizzle1(ndc, 3),
 853                  brw_imm_f(0));
 854
 855          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
 856          brw_MOV(p, ndc, brw_imm_f(0));
 857          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 858       }
 859
 860       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
 861       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
 862       brw_set_access_mode(p, BRW_ALIGN_16);
 863
 864       release_tmp(c, header1);
 865    }
 866    else {
 867       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
 868    }
 869
 870
 871    /* Emit the (interleaved) headers for the two vertices - an 8-reg
 872     * of zeros followed by two sets of NDC coordinates:
 873     */
 874    brw_set_access_mode(p, BRW_ALIGN_1);
 875    brw_MOV(p, offset(m0, 2), ndc);
 876    brw_MOV(p, offset(m0, 3), pos);
 877
 878
 879    brw_urb_WRITE(p,
 880                  brw_null_reg(), /* dest */
 881                  0,             /* starting mrf reg nr */
 882                  c->r0,         /* src */
 883                  0,             /* allocate */
 884                  1,             /* used */
 885                  c->nr_outputs + 3, /* msg len */
 886                  0,             /* response len */
 887                  1,             /* eot */
 888                  1,             /* writes complete */
 889                  0,             /* urb destination offset */
 890                  BRW_URB_SWIZZLE_INTERLEAVE);
 891
 892 }
 893
 894
 895
 896
 897 /* Emit the fragment program instructions here.
 898  */
 899 void brw_vs_emit( struct brw_vs_compile *c )
 900 {
 901    struct brw_compile *p = &c->func;
 902    GLuint nr_insns = c->vp->program.Base.NumInstructions;
 903    GLuint insn;
 904
 905
 906    if (INTEL_DEBUG & DEBUG_VS) {
 907       _mesa_printf("\n\n\nvs-emit:\n");
 908       _mesa_print_program(&c->vp->program.Base);
 909       _mesa_printf("\n");
 910    }
 911
 912    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 913    brw_set_access_mode(p, BRW_ALIGN_16);
 914
 915    /* Static register allocation
 916     */
 917    brw_vs_alloc_regs(c);
 918
 919    for (insn = 0; insn < nr_insns; insn++) {
 920
 921       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
 922       struct brw_reg args[3], dst;
 923       GLuint i;
 924
 925       /* Get argument regs.  SWZ is special and does this itself.
 926        */
 927       if (inst->Opcode != OPCODE_SWZ)
 928          for (i = 0; i < 3; i++)
 929             args[i] = get_arg(c, inst->SrcReg[i]);
 930
 931       /* Get dest regs.  Note that it is possible for a reg to be both
 932        * dst and arg, given the static allocation of registers.  So
 933        * care needs to be taken emitting multi-operation instructions.
 934        */
 935       dst = get_dst(c, inst->DstReg);
 936
 937
 938       switch (inst->Opcode) {
 939       case OPCODE_ABS:
 940          brw_MOV(p, dst, brw_abs(args[0]));
 941          break;
 942       case OPCODE_ADD:
 943          brw_ADD(p, dst, args[0], args[1]);
 944          break;
 945       case OPCODE_DP3:
 946          brw_DP3(p, dst, args[0], args[1]);
 947          break;
 948       case OPCODE_DP4:
 949          brw_DP4(p, dst, args[0], args[1]);
 950          break;
 951       case OPCODE_DPH:
 952          brw_DPH(p, dst, args[0], args[1]);
 953          break;
 954       case OPCODE_DST:
 955          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
 956          break;
 957       case OPCODE_EXP:
 958          unalias1(c, dst, args[0], emit_exp_noalias);
 959          break;
 960       case OPCODE_EX2:
 961          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 962          break;
 963       case OPCODE_ARL:
 964          emit_arl(c, dst, args[0]);
 965          break;
 966       case OPCODE_FLR:
 967          brw_RNDD(p, dst, args[0]);
 968          break;
 969       case OPCODE_FRC:
 970          brw_FRC(p, dst, args[0]);
 971          break;
 972       case OPCODE_LOG:
 973          unalias1(c, dst, args[0], emit_log_noalias);
 974          break;
 975       case OPCODE_LG2:
 976          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
 977          break;
 978       case OPCODE_LIT:
 979          unalias1(c, dst, args[0], emit_lit_noalias);
 980          break;
 981       case OPCODE_MAD:
 982          brw_MOV(p, brw_acc_reg(), args[2]);
 983          brw_MAC(p, dst, args[0], args[1]);
 984          break;
 985       case OPCODE_MAX:
 986          emit_max(p, dst, args[0], args[1]);
 987          break;
 988       case OPCODE_MIN:
 989          emit_min(p, dst, args[0], args[1]);
 990          break;
 991       case OPCODE_MOV:
 992          brw_MOV(p, dst, args[0]);
 993          break;
 994       case OPCODE_MUL:
 995          brw_MUL(p, dst, args[0], args[1]);
 996          break;
 997       case OPCODE_POW:
 998          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
 999          break;
1000       case OPCODE_RCP:
1001          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1002          break;
1003       case OPCODE_RSQ:
1004          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1005          break;
1006       case OPCODE_SGE:
1007          emit_sge(p, dst, args[0], args[1]);
1008          break;
1009       case OPCODE_SLT:
1010          emit_slt(p, dst, args[0], args[1]);
1011          break;
1012       case OPCODE_SUB:
1013          brw_ADD(p, dst, args[0], negate(args[1]));
1014          break;
1015       case OPCODE_SWZ:
1016          /* The args[0] value can't be used here as it won't have
1017           * correctly encoded the full swizzle:
1018           */
1019          emit_swz(c, dst, inst->SrcReg[0] );
1020          break;
1021       case OPCODE_XPD:
1022          emit_xpd(p, dst, args[0], args[1]);
1023          break;
1024       case OPCODE_END:
1025       case OPCODE_PRINT:
1026          break;
1027       default:
1028          break;
1029       }
1030
1031       release_tmps(c);
1032    }
1033
1034    emit_vertex_write(c);
1035
1036 }
1037
1038
1039
1040
1041