src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "shader/program.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  42 {
  43    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  44
  45    if (++c->last_tmp > c->prog_data.total_grf)
  46       c->prog_data.total_grf = c->last_tmp;
  47
  48    return tmp;
  49 }
  50
  51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  52 {
  53    if (tmp.nr == c->last_tmp-1)
  54       c->last_tmp--;
  55 }
  56
  57 static void release_tmps( struct brw_vs_compile *c )
  58 {
  59    c->last_tmp = c->first_tmp;
  60 }
  61
  62
  63 /**
  64  * Preallocate GRF register before code emit.
  65  * Do things as simply as possible.  Allocate and populate all regs
  66  * ahead of time.
  67  */
  68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  69 {
  70    struct intel_context *intel = &c->func.brw->intel;
  71    GLuint i, reg = 0, mrf;
  72    int attributes_in_vue;
  73
  74    /* Determine whether to use a real constant buffer or use a block
  75     * of GRF registers for constants.  The later is faster but only
  76     * works if everything fits in the GRF.
  77     * XXX this heuristic/check may need some fine tuning...
  78     */
  79    if (c->vp->program.Base.Parameters->NumParameters +
  80        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
  81       c->vp->use_const_buffer = GL_TRUE;
  82    else
  83       c->vp->use_const_buffer = GL_FALSE;
  84
  85    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
  86
  87    /* r0 -- reserved as usual
  88     */
  89    c->r0 = brw_vec8_grf(reg, 0);
  90    reg++;
  91
  92    /* User clip planes from curbe:
  93     */
  94    if (c->key.nr_userclip) {
  95       for (i = 0; i < c->key.nr_userclip; i++) {
  96          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  97       }
  98
  99       /* Deal with curbe alignment:
 100        */
 101       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
 102    }
 103
 104    /* Vertex program parameters from curbe:
 105     */
 106    if (c->vp->use_const_buffer) {
 107       /* get constants from a real constant buffer */
 108       c->prog_data.curb_read_length = 0;
 109       c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
 110    }
 111    else {
 112       /* use a section of the GRF for constants */
 113       GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
 114       for (i = 0; i < nr_params; i++) {
 115          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 116       }
 117       reg += (nr_params + 1) / 2;
 118       c->prog_data.curb_read_length = reg - 1;
 119
 120       c->prog_data.nr_params = nr_params * 4;
 121    }
 122
 123    /* Allocate input regs:
 124     */
 125    c->nr_inputs = 0;
 126    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 127       if (c->prog_data.inputs_read & (1 << i)) {
 128          c->nr_inputs++;
 129          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 130          reg++;
 131       }
 132    }
 133    /* If there are no inputs, we'll still be reading one attribute's worth
 134     * because it's required -- see urb_read_length setting.
 135     */
 136    if (c->nr_inputs == 0)
 137       reg++;
 138
 139    /* Allocate outputs.  The non-position outputs go straight into message regs.
 140     */
 141    c->nr_outputs = 0;
 142    c->first_output = reg;
 143    c->first_overflow_output = 0;
 144
 145    if (intel->is_ironlake)
 146        mrf = 8;
 147    else
 148        mrf = 4;
 149
 150    for (i = 0; i < VERT_RESULT_MAX; i++) {
 151       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 152          c->nr_outputs++;
 153          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 154          if (i == VERT_RESULT_HPOS) {
 155             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 156             reg++;
 157          }
 158          else if (i == VERT_RESULT_PSIZ) {
 159             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 160             reg++;
 161             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 162          }
 163          else {
 164             if (mrf < 16) {
 165                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 166                mrf++;
 167             }
 168             else {
 169                /* too many vertex results to fit in MRF, use GRF for overflow */
 170                if (!c->first_overflow_output)
 171                   c->first_overflow_output = i;
 172                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 173                reg++;
 174             }
 175          }
 176       }
 177    }
 178
 179    /* Allocate program temporaries:
 180     */
 181    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 182       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 183       reg++;
 184    }
 185
 186    /* Address reg(s).  Don't try to use the internal address reg until
 187     * deref time.
 188     */
 189    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 190       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 191                                              reg,
 192                                              0,
 193                                              BRW_REGISTER_TYPE_D,
 194                                              BRW_VERTICAL_STRIDE_8,
 195                                              BRW_WIDTH_8,
 196                                              BRW_HORIZONTAL_STRIDE_1,
 197                                              BRW_SWIZZLE_XXXX,
 198                                              WRITEMASK_X);
 199       reg++;
 200    }
 201
 202    if (c->vp->use_const_buffer) {
 203       for (i = 0; i < 3; i++) {
 204          c->current_const[i].index = -1;
 205          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 206          reg++;
 207       }
 208    }
 209
 210    for (i = 0; i < 128; i++) {
 211       if (c->output_regs[i].used_in_src) {
 212          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 213          reg++;
 214       }
 215    }
 216
 217    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 218    reg += 2;
 219
 220    /* Some opcodes need an internal temporary:
 221     */
 222    c->first_tmp = reg;
 223    c->last_tmp = reg;           /* for allocation purposes */
 224
 225    /* Each input reg holds data from two vertices.  The
 226     * urb_read_length is the number of registers read from *each*
 227     * vertex urb, so is half the amount:
 228     */
 229    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 230    /* Setting this field to 0 leads to undefined behavior according to the
 231     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 232     * sitting in them, even if it's padding.
 233     */
 234    if (c->prog_data.urb_read_length == 0)
 235       c->prog_data.urb_read_length = 1;
 236
 237    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 238     * them to fit the biggest thing they need to.
 239     */
 240    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 241
 242    if (intel->is_ironlake)
 243        c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
 244    else
 245        c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 246
 247    c->prog_data.total_grf = reg;
 248
 249    if (INTEL_DEBUG & DEBUG_VS) {
 250       _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 251       _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 252       _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
 253    }
 254 }
 255
 256
 257 /**
 258  * If an instruction uses a temp reg both as a src and the dest, we
 259  * sometimes need to allocate an intermediate temporary.
 260  */
 261 static void unalias1( struct brw_vs_compile *c,
 262                       struct brw_reg dst,
 263                       struct brw_reg arg0,
 264                       void (*func)( struct brw_vs_compile *,
 265                                     struct brw_reg,
 266                                     struct brw_reg ))
 267 {
 268    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 269       struct brw_compile *p = &c->func;
 270       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 271       func(c, tmp, arg0);
 272       brw_MOV(p, dst, tmp);
 273       release_tmp(c, tmp);
 274    }
 275    else {
 276       func(c, dst, arg0);
 277    }
 278 }
 279
 280 /**
 281  * \sa unalias2
 282  * Checkes if 2-operand instruction needs an intermediate temporary.
 283  */
 284 static void unalias2( struct brw_vs_compile *c,
 285                       struct brw_reg dst,
 286                       struct brw_reg arg0,
 287                       struct brw_reg arg1,
 288                       void (*func)( struct brw_vs_compile *,
 289                                     struct brw_reg,
 290                                     struct brw_reg,
 291                                     struct brw_reg ))
 292 {
 293    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 294        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 295       struct brw_compile *p = &c->func;
 296       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 297       func(c, tmp, arg0, arg1);
 298       brw_MOV(p, dst, tmp);
 299       release_tmp(c, tmp);
 300    }
 301    else {
 302       func(c, dst, arg0, arg1);
 303    }
 304 }
 305
 306 /**
 307  * \sa unalias2
 308  * Checkes if 3-operand instruction needs an intermediate temporary.
 309  */
 310 static void unalias3( struct brw_vs_compile *c,
 311                       struct brw_reg dst,
 312                       struct brw_reg arg0,
 313                       struct brw_reg arg1,
 314                       struct brw_reg arg2,
 315                       void (*func)( struct brw_vs_compile *,
 316                                     struct brw_reg,
 317                                     struct brw_reg,
 318                                     struct brw_reg,
 319                                     struct brw_reg ))
 320 {
 321    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 322        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 323        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 324       struct brw_compile *p = &c->func;
 325       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 326       func(c, tmp, arg0, arg1, arg2);
 327       brw_MOV(p, dst, tmp);
 328       release_tmp(c, tmp);
 329    }
 330    else {
 331       func(c, dst, arg0, arg1, arg2);
 332    }
 333 }
 334
 335 static void emit_sop( struct brw_vs_compile *c,
 336                       struct brw_reg dst,
 337                       struct brw_reg arg0,
 338                       struct brw_reg arg1,
 339                       GLuint cond)
 340 {
 341    struct brw_compile *p = &c->func;
 342
 343    brw_MOV(p, dst, brw_imm_f(0.0f));
 344    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 345    brw_MOV(p, dst, brw_imm_f(1.0f));
 346    brw_set_predicate_control_flag_value(p, 0xff);
 347 }
 348
 349 static void emit_seq( struct brw_vs_compile *c,
 350                       struct brw_reg dst,
 351                       struct brw_reg arg0,
 352                       struct brw_reg arg1 )
 353 {
 354    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 355 }
 356
 357 static void emit_sne( struct brw_vs_compile *c,
 358                       struct brw_reg dst,
 359                       struct brw_reg arg0,
 360                       struct brw_reg arg1 )
 361 {
 362    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 363 }
 364 static void emit_slt( struct brw_vs_compile *c,
 365                       struct brw_reg dst,
 366                       struct brw_reg arg0,
 367                       struct brw_reg arg1 )
 368 {
 369    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 370 }
 371
 372 static void emit_sle( struct brw_vs_compile *c,
 373                       struct brw_reg dst,
 374                       struct brw_reg arg0,
 375                       struct brw_reg arg1 )
 376 {
 377    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 378 }
 379
 380 static void emit_sgt( struct brw_vs_compile *c,
 381                       struct brw_reg dst,
 382                       struct brw_reg arg0,
 383                       struct brw_reg arg1 )
 384 {
 385    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 386 }
 387
 388 static void emit_sge( struct brw_vs_compile *c,
 389                       struct brw_reg dst,
 390                       struct brw_reg arg0,
 391                       struct brw_reg arg1 )
 392 {
 393   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 394 }
 395
 396 static void emit_cmp( struct brw_compile *p,
 397                       struct brw_reg dst,
 398                       struct brw_reg arg0,
 399                       struct brw_reg arg1,
 400                       struct brw_reg arg2 )
 401 {
 402    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 403    brw_SEL(p, dst, arg1, arg2);
 404    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 405 }
 406
 407 static void emit_max( struct brw_compile *p,
 408                       struct brw_reg dst,
 409                       struct brw_reg arg0,
 410                       struct brw_reg arg1 )
 411 {
 412    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 413    brw_SEL(p, dst, arg1, arg0);
 414    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 415 }
 416
 417 static void emit_min( struct brw_compile *p,
 418                       struct brw_reg dst,
 419                       struct brw_reg arg0,
 420                       struct brw_reg arg1 )
 421 {
 422    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 423    brw_SEL(p, dst, arg0, arg1);
 424    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 425 }
 426
 427
 428 static void emit_math1( struct brw_vs_compile *c,
 429                         GLuint function,
 430                         struct brw_reg dst,
 431                         struct brw_reg arg0,
 432                         GLuint precision)
 433 {
 434    /* There are various odd behaviours with SEND on the simulator.  In
 435     * addition there are documented issues with the fact that the GEN4
 436     * processor doesn't do dependency control properly on SEND
 437     * results.  So, on balance, this kludge to get around failures
 438     * with writemasked math results looks like it might be necessary
 439     * whether that turns out to be a simulator bug or not:
 440     */
 441    struct brw_compile *p = &c->func;
 442    struct brw_reg tmp = dst;
 443    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 444                          dst.file != BRW_GENERAL_REGISTER_FILE);
 445
 446    if (need_tmp)
 447       tmp = get_tmp(c);
 448
 449    brw_math(p,
 450             tmp,
 451             function,
 452             BRW_MATH_SATURATE_NONE,
 453             2,
 454             arg0,
 455             BRW_MATH_DATA_SCALAR,
 456             precision);
 457
 458    if (need_tmp) {
 459       brw_MOV(p, dst, tmp);
 460       release_tmp(c, tmp);
 461    }
 462 }
 463
 464
 465 static void emit_math2( struct brw_vs_compile *c,
 466                         GLuint function,
 467                         struct brw_reg dst,
 468                         struct brw_reg arg0,
 469                         struct brw_reg arg1,
 470                         GLuint precision)
 471 {
 472    struct brw_compile *p = &c->func;
 473    struct brw_reg tmp = dst;
 474    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 475                          dst.file != BRW_GENERAL_REGISTER_FILE);
 476
 477    if (need_tmp)
 478       tmp = get_tmp(c);
 479
 480    brw_MOV(p, brw_message_reg(3), arg1);
 481
 482    brw_math(p,
 483             tmp,
 484             function,
 485             BRW_MATH_SATURATE_NONE,
 486             2,
 487             arg0,
 488             BRW_MATH_DATA_SCALAR,
 489             precision);
 490
 491    if (need_tmp) {
 492       brw_MOV(p, dst, tmp);
 493       release_tmp(c, tmp);
 494    }
 495 }
 496
 497
 498 static void emit_exp_noalias( struct brw_vs_compile *c,
 499                               struct brw_reg dst,
 500                               struct brw_reg arg0 )
 501 {
 502    struct brw_compile *p = &c->func;
 503
 504
 505    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 506       struct brw_reg tmp = get_tmp(c);
 507       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 508
 509       /* tmp_d = floor(arg0.x) */
 510       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 511
 512       /* result[0] = 2.0 ^ tmp */
 513
 514       /* Adjust exponent for floating point:
 515        * exp += 127
 516        */
 517       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 518
 519       /* Install exponent and sign.
 520        * Excess drops off the edge:
 521        */
 522       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 523               tmp_d, brw_imm_d(23));
 524
 525       release_tmp(c, tmp);
 526    }
 527
 528    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 529       /* result[1] = arg0.x - floor(arg0.x) */
 530       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 531    }
 532
 533    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 534       /* As with the LOG instruction, we might be better off just
 535        * doing a taylor expansion here, seeing as we have to do all
 536        * the prep work.
 537        *
 538        * If mathbox partial precision is too low, consider also:
 539        * result[3] = result[0] * EXP(result[1])
 540        */
 541       emit_math1(c,
 542                  BRW_MATH_FUNCTION_EXP,
 543                  brw_writemask(dst, WRITEMASK_Z),
 544                  brw_swizzle1(arg0, 0),
 545                  BRW_MATH_PRECISION_FULL);
 546    }
 547
 548    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 549       /* result[3] = 1.0; */
 550       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 551    }
 552 }
 553
 554
 555 static void emit_log_noalias( struct brw_vs_compile *c,
 556                               struct brw_reg dst,
 557                               struct brw_reg arg0 )
 558 {
 559    struct brw_compile *p = &c->func;
 560    struct brw_reg tmp = dst;
 561    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 562    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 563    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 564                          dst.file != BRW_GENERAL_REGISTER_FILE);
 565
 566    if (need_tmp) {
 567       tmp = get_tmp(c);
 568       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 569    }
 570
 571    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 572     * according to spec:
 573     *
 574     * These almost look likey they could be joined up, but not really
 575     * practical:
 576     *
 577     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 578     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 579     */
 580    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 581       brw_AND(p,
 582               brw_writemask(tmp_ud, WRITEMASK_X),
 583               brw_swizzle1(arg0_ud, 0),
 584               brw_imm_ud((1U<<31)-1));
 585
 586       brw_SHR(p,
 587               brw_writemask(tmp_ud, WRITEMASK_X),
 588               tmp_ud,
 589               brw_imm_ud(23));
 590
 591       brw_ADD(p,
 592               brw_writemask(tmp, WRITEMASK_X),
 593               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 594               brw_imm_d(-127));
 595    }
 596
 597    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 598       brw_AND(p,
 599               brw_writemask(tmp_ud, WRITEMASK_Y),
 600               brw_swizzle1(arg0_ud, 0),
 601               brw_imm_ud((1<<23)-1));
 602
 603       brw_OR(p,
 604              brw_writemask(tmp_ud, WRITEMASK_Y),
 605              tmp_ud,
 606              brw_imm_ud(127<<23));
 607    }
 608
 609    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 610       /* result[2] = result[0] + LOG2(result[1]); */
 611
 612       /* Why bother?  The above is just a hint how to do this with a
 613        * taylor series.  Maybe we *should* use a taylor series as by
 614        * the time all the above has been done it's almost certainly
 615        * quicker than calling the mathbox, even with low precision.
 616        *
 617        * Options are:
 618        *    - result[0] + mathbox.LOG2(result[1])
 619        *    - mathbox.LOG2(arg0.x)
 620        *    - result[0] + inline_taylor_approx(result[1])
 621        */
 622       emit_math1(c,
 623                  BRW_MATH_FUNCTION_LOG,
 624                  brw_writemask(tmp, WRITEMASK_Z),
 625                  brw_swizzle1(tmp, 1),
 626                  BRW_MATH_PRECISION_FULL);
 627
 628       brw_ADD(p,
 629               brw_writemask(tmp, WRITEMASK_Z),
 630               brw_swizzle1(tmp, 2),
 631               brw_swizzle1(tmp, 0));
 632    }
 633
 634    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 635       /* result[3] = 1.0; */
 636       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 637    }
 638
 639    if (need_tmp) {
 640       brw_MOV(p, dst, tmp);
 641       release_tmp(c, tmp);
 642    }
 643 }
 644
 645
 646 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 647  */
 648 static void emit_dst_noalias( struct brw_vs_compile *c,
 649                               struct brw_reg dst,
 650                               struct brw_reg arg0,
 651                               struct brw_reg arg1)
 652 {
 653    struct brw_compile *p = &c->func;
 654
 655    /* There must be a better way to do this:
 656     */
 657    if (dst.dw1.bits.writemask & WRITEMASK_X)
 658       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 659    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 660       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 661    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 662       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 663    if (dst.dw1.bits.writemask & WRITEMASK_W)
 664       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 665 }
 666
 667
 668 static void emit_xpd( struct brw_compile *p,
 669                       struct brw_reg dst,
 670                       struct brw_reg t,
 671                       struct brw_reg u)
 672 {
 673    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 674    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 675 }
 676
 677
 678 static void emit_lit_noalias( struct brw_vs_compile *c,
 679                               struct brw_reg dst,
 680                               struct brw_reg arg0 )
 681 {
 682    struct brw_compile *p = &c->func;
 683    struct brw_instruction *if_insn;
 684    struct brw_reg tmp = dst;
 685    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 686
 687    if (need_tmp)
 688       tmp = get_tmp(c);
 689
 690    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 691    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 692
 693    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 694     * to get all channels active inside the IF.  In the clipping code
 695     * we run with NoMask, so it's not an option and we can use
 696     * BRW_EXECUTE_1 for all comparisions.
 697     */
 698    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 699    if_insn = brw_IF(p, BRW_EXECUTE_8);
 700    {
 701       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 702
 703       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 704       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 705       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 706
 707       emit_math2(c,
 708                  BRW_MATH_FUNCTION_POW,
 709                  brw_writemask(dst, WRITEMASK_Z),
 710                  brw_swizzle1(tmp, 2),
 711                  brw_swizzle1(arg0, 3),
 712                  BRW_MATH_PRECISION_PARTIAL);
 713    }
 714
 715    brw_ENDIF(p, if_insn);
 716
 717    release_tmp(c, tmp);
 718 }
 719
 720 static void emit_lrp_noalias(struct brw_vs_compile *c,
 721                              struct brw_reg dst,
 722                              struct brw_reg arg0,
 723                              struct brw_reg arg1,
 724                              struct brw_reg arg2)
 725 {
 726    struct brw_compile *p = &c->func;
 727
 728    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 729    brw_MUL(p, brw_null_reg(), dst, arg2);
 730    brw_MAC(p, dst, arg0, arg1);
 731 }
 732
 733 /** 3 or 4-component vector normalization */
 734 static void emit_nrm( struct brw_vs_compile *c,
 735                       struct brw_reg dst,
 736                       struct brw_reg arg0,
 737                       int num_comps)
 738 {
 739    struct brw_compile *p = &c->func;
 740    struct brw_reg tmp = get_tmp(c);
 741
 742    /* tmp = dot(arg0, arg0) */
 743    if (num_comps == 3)
 744       brw_DP3(p, tmp, arg0, arg0);
 745    else
 746       brw_DP4(p, tmp, arg0, arg0);
 747
 748    /* tmp = 1 / sqrt(tmp) */
 749    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 750
 751    /* dst = arg0 * tmp */
 752    brw_MUL(p, dst, arg0, tmp);
 753
 754    release_tmp(c, tmp);
 755 }
 756
 757
 758 static struct brw_reg
 759 get_constant(struct brw_vs_compile *c,
 760              const struct prog_instruction *inst,
 761              GLuint argIndex)
 762 {
 763    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 764    struct brw_compile *p = &c->func;
 765    struct brw_reg const_reg;
 766    struct brw_reg const2_reg;
 767    const GLboolean relAddr = src->RelAddr;
 768
 769    assert(argIndex < 3);
 770
 771    if (c->current_const[argIndex].index != src->Index || relAddr) {
 772       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 773
 774       c->current_const[argIndex].index = src->Index;
 775
 776 #if 0
 777       printf("  fetch const[%d] for arg %d into reg %d\n",
 778              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 779 #endif
 780       /* need to fetch the constant now */
 781       brw_dp_READ_4_vs(p,
 782                        c->current_const[argIndex].reg,/* writeback dest */
 783                        0,                             /* oword */
 784                        relAddr,                       /* relative indexing? */
 785                        addrReg,                       /* address register */
 786                        16 * src->Index,               /* byte offset */
 787                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 788                        );
 789
 790       if (relAddr) {
 791          /* second read */
 792          const2_reg = get_tmp(c);
 793
 794          /* use upper half of address reg for second read */
 795          addrReg = stride(addrReg, 0, 4, 0);
 796          addrReg.subnr = 16;
 797
 798          brw_dp_READ_4_vs(p,
 799                           const2_reg,              /* writeback dest */
 800                           1,                       /* oword */
 801                           relAddr,                 /* relative indexing? */
 802                           addrReg,                 /* address register */
 803                           16 * src->Index,         /* byte offset */
 804                           SURF_INDEX_VERT_CONST_BUFFER
 805                           );
 806       }
 807    }
 808
 809    const_reg = c->current_const[argIndex].reg;
 810
 811    if (relAddr) {
 812       /* merge the two Owords into the constant register */
 813       /* const_reg[7..4] = const2_reg[7..4] */
 814       brw_MOV(p,
 815               suboffset(stride(const_reg, 0, 4, 1), 4),
 816               suboffset(stride(const2_reg, 0, 4, 1), 4));
 817       release_tmp(c, const2_reg);
 818    }
 819    else {
 820       /* replicate lower four floats into upper half (to get XYZWXYZW) */
 821       const_reg = stride(const_reg, 0, 4, 0);
 822       const_reg.subnr = 0;
 823    }
 824
 825    return const_reg;
 826 }
 827
 828
 829
 830 /* TODO: relative addressing!
 831  */
 832 static struct brw_reg get_reg( struct brw_vs_compile *c,
 833                                gl_register_file file,
 834                                GLuint index )
 835 {
 836    switch (file) {
 837    case PROGRAM_TEMPORARY:
 838    case PROGRAM_INPUT:
 839    case PROGRAM_OUTPUT:
 840       assert(c->regs[file][index].nr != 0);
 841       return c->regs[file][index];
 842    case PROGRAM_STATE_VAR:
 843    case PROGRAM_CONSTANT:
 844    case PROGRAM_UNIFORM:
 845       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 846       return c->regs[PROGRAM_STATE_VAR][index];
 847    case PROGRAM_ADDRESS:
 848       assert(index == 0);
 849       return c->regs[file][index];
 850
 851    case PROGRAM_UNDEFINED:                      /* undef values */
 852       return brw_null_reg();
 853
 854    case PROGRAM_LOCAL_PARAM:
 855    case PROGRAM_ENV_PARAM:
 856    case PROGRAM_WRITE_ONLY:
 857    default:
 858       assert(0);
 859       return brw_null_reg();
 860    }
 861 }
 862
 863
 864 /**
 865  * Indirect addressing:  get reg[[arg] + offset].
 866  */
 867 static struct brw_reg deref( struct brw_vs_compile *c,
 868                              struct brw_reg arg,
 869                              GLint offset)
 870 {
 871    struct brw_compile *p = &c->func;
 872    struct brw_reg tmp = vec4(get_tmp(c));
 873    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 874    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 875    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 876    struct brw_reg indirect = brw_vec4_indirect(0,0);
 877
 878    {
 879       brw_push_insn_state(p);
 880       brw_set_access_mode(p, BRW_ALIGN_1);
 881
 882       /* This is pretty clunky - load the address register twice and
 883        * fetch each 4-dword value in turn.  There must be a way to do
 884        * this in a single pass, but I couldn't get it to work.
 885        */
 886       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 887       brw_MOV(p, tmp, indirect);
 888
 889       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 890       brw_MOV(p, suboffset(tmp, 4), indirect);
 891
 892       brw_pop_insn_state(p);
 893    }
 894
 895    /* NOTE: tmp not released */
 896    return vec8(tmp);
 897 }
 898
 899
 900 /**
 901  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 902  * TODO: relative addressing!
 903  */
 904 static struct brw_reg
 905 get_src_reg( struct brw_vs_compile *c,
 906              const struct prog_instruction *inst,
 907              GLuint argIndex )
 908 {
 909    const GLuint file = inst->SrcReg[argIndex].File;
 910    const GLint index = inst->SrcReg[argIndex].Index;
 911    const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
 912
 913    switch (file) {
 914    case PROGRAM_TEMPORARY:
 915    case PROGRAM_INPUT:
 916    case PROGRAM_OUTPUT:
 917       if (relAddr) {
 918          return deref(c, c->regs[file][0], index);
 919       }
 920       else {
 921          assert(c->regs[file][index].nr != 0);
 922          return c->regs[file][index];
 923       }
 924
 925    case PROGRAM_STATE_VAR:
 926    case PROGRAM_CONSTANT:
 927    case PROGRAM_UNIFORM:
 928    case PROGRAM_ENV_PARAM:
 929    case PROGRAM_LOCAL_PARAM:
 930       if (c->vp->use_const_buffer) {
 931          return get_constant(c, inst, argIndex);
 932       }
 933       else if (relAddr) {
 934          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
 935       }
 936       else {
 937          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 938          return c->regs[PROGRAM_STATE_VAR][index];
 939       }
 940    case PROGRAM_ADDRESS:
 941       assert(index == 0);
 942       return c->regs[file][index];
 943
 944    case PROGRAM_UNDEFINED:
 945       /* this is a normal case since we loop over all three src args */
 946       return brw_null_reg();
 947
 948    case PROGRAM_WRITE_ONLY:
 949    default:
 950       assert(0);
 951       return brw_null_reg();
 952    }
 953 }
 954
 955
 956 static void emit_arl( struct brw_vs_compile *c,
 957                       struct brw_reg dst,
 958                       struct brw_reg arg0 )
 959 {
 960    struct brw_compile *p = &c->func;
 961    struct brw_reg tmp = dst;
 962    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 963
 964    if (need_tmp)
 965       tmp = get_tmp(c);
 966
 967    brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
 968    brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 969
 970    if (need_tmp)
 971       release_tmp(c, tmp);
 972 }
 973
 974
 975 /**
 976  * Return the brw reg for the given instruction's src argument.
 977  * Will return mangled results for SWZ op.  The emit_swz() function
 978  * ignores this result and recalculates taking extended swizzles into
 979  * account.
 980  */
 981 static struct brw_reg get_arg( struct brw_vs_compile *c,
 982                                const struct prog_instruction *inst,
 983                                GLuint argIndex )
 984 {
 985    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 986    struct brw_reg reg;
 987
 988    if (src->File == PROGRAM_UNDEFINED)
 989       return brw_null_reg();
 990
 991    reg = get_src_reg(c, inst, argIndex);
 992
 993    /* Convert 3-bit swizzle to 2-bit.
 994     */
 995    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
 996                                        GET_SWZ(src->Swizzle, 1),
 997                                        GET_SWZ(src->Swizzle, 2),
 998                                        GET_SWZ(src->Swizzle, 3));
 999
1000    /* Note this is ok for non-swizzle instructions:
1001     */
1002    reg.negate = src->Negate ? 1 : 0;
1003
1004    return reg;
1005 }
1006
1007
1008 /**
1009  * Get brw register for the given program dest register.
1010  */
1011 static struct brw_reg get_dst( struct brw_vs_compile *c,
1012                                struct prog_dst_register dst )
1013 {
1014    struct brw_reg reg;
1015
1016    switch (dst.File) {
1017    case PROGRAM_TEMPORARY:
1018    case PROGRAM_OUTPUT:
1019       assert(c->regs[dst.File][dst.Index].nr != 0);
1020       reg = c->regs[dst.File][dst.Index];
1021       break;
1022    case PROGRAM_ADDRESS:
1023       assert(dst.Index == 0);
1024       reg = c->regs[dst.File][dst.Index];
1025       break;
1026    case PROGRAM_UNDEFINED:
1027       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1028       reg = brw_null_reg();
1029       break;
1030    default:
1031       assert(0);
1032       reg = brw_null_reg();
1033    }
1034
1035    reg.dw1.bits.writemask = dst.WriteMask;
1036
1037    return reg;
1038 }
1039
1040
1041 static void emit_swz( struct brw_vs_compile *c,
1042                       struct brw_reg dst,
1043                       const struct prog_instruction *inst)
1044 {
1045    const GLuint argIndex = 0;
1046    const struct prog_src_register src = inst->SrcReg[argIndex];
1047    struct brw_compile *p = &c->func;
1048    GLuint zeros_mask = 0;
1049    GLuint ones_mask = 0;
1050    GLuint src_mask = 0;
1051    GLubyte src_swz[4];
1052    GLboolean need_tmp = (src.Negate &&
1053                          dst.file != BRW_GENERAL_REGISTER_FILE);
1054    struct brw_reg tmp = dst;
1055    GLuint i;
1056
1057    if (need_tmp)
1058       tmp = get_tmp(c);
1059
1060    for (i = 0; i < 4; i++) {
1061       if (dst.dw1.bits.writemask & (1<<i)) {
1062          GLubyte s = GET_SWZ(src.Swizzle, i);
1063          switch (s) {
1064          case SWIZZLE_X:
1065          case SWIZZLE_Y:
1066          case SWIZZLE_Z:
1067          case SWIZZLE_W:
1068             src_mask |= 1<<i;
1069             src_swz[i] = s;
1070             break;
1071          case SWIZZLE_ZERO:
1072             zeros_mask |= 1<<i;
1073             break;
1074          case SWIZZLE_ONE:
1075             ones_mask |= 1<<i;
1076             break;
1077          }
1078       }
1079    }
1080
1081    /* Do src first, in case dst aliases src:
1082     */
1083    if (src_mask) {
1084       struct brw_reg arg0;
1085
1086       arg0 = get_src_reg(c, inst, argIndex);
1087
1088       arg0 = brw_swizzle(arg0,
1089                          src_swz[0], src_swz[1],
1090                          src_swz[2], src_swz[3]);
1091
1092       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1093    }
1094
1095    if (zeros_mask)
1096       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1097
1098    if (ones_mask)
1099       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1100
1101    if (src.Negate)
1102       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1103
1104    if (need_tmp) {
1105       brw_MOV(p, dst, tmp);
1106       release_tmp(c, tmp);
1107    }
1108 }
1109
1110
1111 /**
1112  * Post-vertex-program processing.  Send the results to the URB.
1113  */
1114 static void emit_vertex_write( struct brw_vs_compile *c)
1115 {
1116    struct brw_compile *p = &c->func;
1117    struct brw_context *brw = p->brw;
1118    struct intel_context *intel = &brw->intel;
1119    struct brw_reg m0 = brw_message_reg(0);
1120    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1121    struct brw_reg ndc;
1122    int eot;
1123    GLuint len_vertext_header = 2;
1124
1125    if (c->key.copy_edgeflag) {
1126       brw_MOV(p,
1127               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1128               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1129    }
1130
1131    /* Build ndc coords */
1132    ndc = get_tmp(c);
1133    /* ndc = 1.0 / pos.w */
1134    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1135    /* ndc.xyz = pos * ndc */
1136    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1137
1138    /* Update the header for point size, user clipping flags, and -ve rhw
1139     * workaround.
1140     */
1141    if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1142        c->key.nr_userclip || brw->has_negative_rhw_bug)
1143    {
1144       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1145       GLuint i;
1146
1147       brw_MOV(p, header1, brw_imm_ud(0));
1148
1149       brw_set_access_mode(p, BRW_ALIGN_16);
1150
1151       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1152          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1153          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1154          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1155       }
1156
1157       for (i = 0; i < c->key.nr_userclip; i++) {
1158          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1159          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1160          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1161          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1162       }
1163
1164       /* i965 clipping workaround:
1165        * 1) Test for -ve rhw
1166        * 2) If set,
1167        *      set ndc = (0,0,0,0)
1168        *      set ucp[6] = 1
1169        *
1170        * Later, clipping will detect ucp[6] and ensure the primitive is
1171        * clipped against all fixed planes.
1172        */
1173       if (brw->has_negative_rhw_bug) {
1174          brw_CMP(p,
1175                  vec8(brw_null_reg()),
1176                  BRW_CONDITIONAL_L,
1177                  brw_swizzle1(ndc, 3),
1178                  brw_imm_f(0));
1179
1180          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1181          brw_MOV(p, ndc, brw_imm_f(0));
1182          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1183       }
1184
1185       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1186       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1187       brw_set_access_mode(p, BRW_ALIGN_16);
1188
1189       release_tmp(c, header1);
1190    }
1191    else {
1192       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1193    }
1194
1195    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1196     * of zeros followed by two sets of NDC coordinates:
1197     */
1198    brw_set_access_mode(p, BRW_ALIGN_1);
1199    brw_MOV(p, offset(m0, 2), ndc);
1200
1201    if (intel->is_ironlake) {
1202        /* There are 20 DWs (D0-D19) in VUE vertex header on Ironlake */
1203        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1204        /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1205         * Seems it is useless for us.
1206         * m6 is used for aligning, so that the remainder of vertex element is
1207         * reg-aligned.
1208         */
1209        brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1210        len_vertext_header = 6;
1211    } else {
1212        brw_MOV(p, offset(m0, 3), pos);
1213        len_vertext_header = 2;
1214    }
1215
1216    eot = (c->first_overflow_output == 0);
1217
1218    brw_urb_WRITE(p,
1219                  brw_null_reg(), /* dest */
1220                  0,             /* starting mrf reg nr */
1221                  c->r0,         /* src */
1222                  0,             /* allocate */
1223                  1,             /* used */
1224                  MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1225                  0,             /* response len */
1226                  eot,           /* eot */
1227                  eot,           /* writes complete */
1228                  0,             /* urb destination offset */
1229                  BRW_URB_SWIZZLE_INTERLEAVE);
1230
1231    if (c->first_overflow_output > 0) {
1232       /* Not all of the vertex outputs/results fit into the MRF.
1233        * Move the overflowed attributes from the GRF to the MRF and
1234        * issue another brw_urb_WRITE().
1235        */
1236       /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
1237        * at mrf[4] atm...
1238        */
1239       GLuint i, mrf = 0;
1240       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1241          if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1242             /* move from GRF to MRF */
1243             brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1244             mrf++;
1245          }
1246       }
1247
1248       brw_urb_WRITE(p,
1249                     brw_null_reg(), /* dest */
1250                     4,              /* starting mrf reg nr */
1251                     c->r0,          /* src */
1252                     0,              /* allocate */
1253                     1,              /* used */
1254                     mrf+1,          /* msg len */
1255                     0,              /* response len */
1256                     1,              /* eot */
1257                     1,              /* writes complete */
1258                     BRW_MAX_MRF-1,  /* urb destination offset */
1259                     BRW_URB_SWIZZLE_INTERLEAVE);
1260    }
1261 }
1262
1263
1264 /**
1265  * Called after code generation to resolve subroutine calls and the
1266  * END instruction.
1267  * \param end_inst  points to brw code for END instruction
1268  * \param last_inst  points to last instruction emitted before vertex write
1269  */
1270 static void
1271 post_vs_emit( struct brw_vs_compile *c,
1272               struct brw_instruction *end_inst,
1273               struct brw_instruction *last_inst )
1274 {
1275    GLint offset;
1276
1277    brw_resolve_cals(&c->func);
1278
1279    /* patch up the END code to jump past subroutines, etc */
1280    offset = last_inst - end_inst;
1281    if (offset > 1) {
1282       brw_set_src1(end_inst, brw_imm_d(offset * 16));
1283    } else {
1284       end_inst->header.opcode = BRW_OPCODE_NOP;
1285    }
1286 }
1287
1288 static GLboolean
1289 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1290 {
1291    struct brw_compile *p = &c->func;
1292    struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1293
1294    if (p->nr_insn == 0)
1295       return GL_FALSE;
1296
1297    if (val.address_mode != BRW_ADDRESS_DIRECT)
1298       return GL_FALSE;
1299
1300    switch (prev_insn->header.opcode) {
1301    case BRW_OPCODE_MOV:
1302    case BRW_OPCODE_MAC:
1303    case BRW_OPCODE_MUL:
1304       if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1305           prev_insn->header.execution_size == val.width &&
1306           prev_insn->bits1.da1.dest_reg_file == val.file &&
1307           prev_insn->bits1.da1.dest_reg_type == val.type &&
1308           prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1309           prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1310           prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1311           prev_insn->bits1.da16.dest_writemask == 0xf)
1312          return GL_TRUE;
1313       else
1314          return GL_FALSE;
1315    default:
1316       return GL_FALSE;
1317    }
1318 }
1319
1320 static uint32_t
1321 get_predicate(const struct prog_instruction *inst)
1322 {
1323    if (inst->DstReg.CondMask == COND_TR)
1324       return BRW_PREDICATE_NONE;
1325
1326    /* All of GLSL only produces predicates for COND_NE and one channel per
1327     * vector.  Fail badly if someone starts doing something else, as it might
1328     * mean infinite looping or something.
1329     *
1330     * We'd like to support all the condition codes, but our hardware doesn't
1331     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1332     * those, the instruction may update the condition codes or not, then any
1333     * later instruction may use one of those condition codes.  For gen4, the
1334     * instruction may update the flags register based on one of the condition
1335     * codes output by the instruction, and then further instructions may
1336     * predicate on that.  We can probably support this, but it won't
1337     * necessarily be easy.
1338     */
1339    assert(inst->DstReg.CondMask == COND_NE);
1340
1341    switch (inst->DstReg.CondSwizzle) {
1342    case SWIZZLE_XXXX:
1343       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1344    case SWIZZLE_YYYY:
1345       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1346    case SWIZZLE_ZZZZ:
1347       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1348    case SWIZZLE_WWWW:
1349       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1350    default:
1351       _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1352                     inst->DstReg.CondMask);
1353       return BRW_PREDICATE_NORMAL;
1354    }
1355 }
1356
1357 /* Emit the vertex program instructions here.
1358  */
1359 void brw_vs_emit(struct brw_vs_compile *c )
1360 {
1361 #define MAX_IF_DEPTH 32
1362 #define MAX_LOOP_DEPTH 32
1363    struct brw_compile *p = &c->func;
1364    struct brw_context *brw = p->brw;
1365    struct intel_context *intel = &brw->intel;
1366    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1367    GLuint insn, if_depth = 0, loop_depth = 0;
1368    GLuint end_offset = 0;
1369    struct brw_instruction *end_inst, *last_inst;
1370    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1371    const struct brw_indirect stack_index = brw_indirect(0, 0);
1372    GLuint index;
1373    GLuint file;
1374
1375    if (INTEL_DEBUG & DEBUG_VS) {
1376       _mesa_printf("vs-mesa:\n");
1377       _mesa_print_program(&c->vp->program.Base);
1378       _mesa_printf("\n");
1379    }
1380
1381    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1382    brw_set_access_mode(p, BRW_ALIGN_16);
1383
1384    /* Message registers can't be read, so copy the output into GRF register
1385       if they are used in source registers */
1386    for (insn = 0; insn < nr_insns; insn++) {
1387        GLuint i;
1388        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1389        for (i = 0; i < 3; i++) {
1390            struct prog_src_register *src = &inst->SrcReg[i];
1391            GLuint index = src->Index;
1392            GLuint file = src->File;
1393            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1394                c->output_regs[index].used_in_src = GL_TRUE;
1395        }
1396    }
1397
1398    /* Static register allocation
1399     */
1400    brw_vs_alloc_regs(c);
1401    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1402
1403    for (insn = 0; insn < nr_insns; insn++) {
1404
1405       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1406       struct brw_reg args[3], dst;
1407       GLuint i;
1408
1409 #if 0
1410       printf("%d: ", insn);
1411       _mesa_print_instruction(inst);
1412 #endif
1413
1414       /* Get argument regs.  SWZ is special and does this itself.
1415        */
1416       if (inst->Opcode != OPCODE_SWZ)
1417           for (i = 0; i < 3; i++) {
1418               const struct prog_src_register *src = &inst->SrcReg[i];
1419               index = src->Index;
1420               file = src->File;
1421               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1422                   args[i] = c->output_regs[index].reg;
1423               else
1424                   args[i] = get_arg(c, inst, i);
1425           }
1426
1427       /* Get dest regs.  Note that it is possible for a reg to be both
1428        * dst and arg, given the static allocation of registers.  So
1429        * care needs to be taken emitting multi-operation instructions.
1430        */
1431       index = inst->DstReg.Index;
1432       file = inst->DstReg.File;
1433       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1434           dst = c->output_regs[index].reg;
1435       else
1436           dst = get_dst(c, inst->DstReg);
1437
1438       if (inst->SaturateMode != SATURATE_OFF) {
1439          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1440                        inst->SaturateMode);
1441       }
1442
1443       switch (inst->Opcode) {
1444       case OPCODE_ABS:
1445          brw_MOV(p, dst, brw_abs(args[0]));
1446          break;
1447       case OPCODE_ADD:
1448          brw_ADD(p, dst, args[0], args[1]);
1449          break;
1450       case OPCODE_COS:
1451          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1452          break;
1453       case OPCODE_DP3:
1454          brw_DP3(p, dst, args[0], args[1]);
1455          break;
1456       case OPCODE_DP4:
1457          brw_DP4(p, dst, args[0], args[1]);
1458          break;
1459       case OPCODE_DPH:
1460          brw_DPH(p, dst, args[0], args[1]);
1461          break;
1462       case OPCODE_NRM3:
1463          emit_nrm(c, dst, args[0], 3);
1464          break;
1465       case OPCODE_NRM4:
1466          emit_nrm(c, dst, args[0], 4);
1467          break;
1468       case OPCODE_DST:
1469          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1470          break;
1471       case OPCODE_EXP:
1472          unalias1(c, dst, args[0], emit_exp_noalias);
1473          break;
1474       case OPCODE_EX2:
1475          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1476          break;
1477       case OPCODE_ARL:
1478          emit_arl(c, dst, args[0]);
1479          break;
1480       case OPCODE_FLR:
1481          brw_RNDD(p, dst, args[0]);
1482          break;
1483       case OPCODE_FRC:
1484          brw_FRC(p, dst, args[0]);
1485          break;
1486       case OPCODE_LOG:
1487          unalias1(c, dst, args[0], emit_log_noalias);
1488          break;
1489       case OPCODE_LG2:
1490          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1491          break;
1492       case OPCODE_LIT:
1493          unalias1(c, dst, args[0], emit_lit_noalias);
1494          break;
1495       case OPCODE_LRP:
1496          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1497          break;
1498       case OPCODE_MAD:
1499          if (!accumulator_contains(c, args[2]))
1500             brw_MOV(p, brw_acc_reg(), args[2]);
1501          brw_MAC(p, dst, args[0], args[1]);
1502          break;
1503       case OPCODE_CMP:
1504          emit_cmp(p, dst, args[0], args[1], args[2]);
1505          break;
1506       case OPCODE_MAX:
1507          emit_max(p, dst, args[0], args[1]);
1508          break;
1509       case OPCODE_MIN:
1510          emit_min(p, dst, args[0], args[1]);
1511          break;
1512       case OPCODE_MOV:
1513          brw_MOV(p, dst, args[0]);
1514          break;
1515       case OPCODE_MUL:
1516          brw_MUL(p, dst, args[0], args[1]);
1517          break;
1518       case OPCODE_POW:
1519          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1520          break;
1521       case OPCODE_RCP:
1522          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1523          break;
1524       case OPCODE_RSQ:
1525          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1526          break;
1527
1528       case OPCODE_SEQ:
1529          unalias2(c, dst, args[0], args[1], emit_seq);
1530          break;
1531       case OPCODE_SIN:
1532          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1533          break;
1534       case OPCODE_SNE:
1535          unalias2(c, dst, args[0], args[1], emit_sne);
1536          break;
1537       case OPCODE_SGE:
1538          unalias2(c, dst, args[0], args[1], emit_sge);
1539          break;
1540       case OPCODE_SGT:
1541          unalias2(c, dst, args[0], args[1], emit_sgt);
1542          break;
1543       case OPCODE_SLT:
1544          unalias2(c, dst, args[0], args[1], emit_slt);
1545          break;
1546       case OPCODE_SLE:
1547          unalias2(c, dst, args[0], args[1], emit_sle);
1548          break;
1549       case OPCODE_SUB:
1550          brw_ADD(p, dst, args[0], negate(args[1]));
1551          break;
1552       case OPCODE_SWZ:
1553          /* The args[0] value can't be used here as it won't have
1554           * correctly encoded the full swizzle:
1555           */
1556          emit_swz(c, dst, inst);
1557          break;
1558       case OPCODE_TRUNC:
1559          /* round toward zero */
1560          brw_RNDZ(p, dst, args[0]);
1561          break;
1562       case OPCODE_XPD:
1563          emit_xpd(p, dst, args[0], args[1]);
1564          break;
1565       case OPCODE_IF:
1566          assert(if_depth < MAX_IF_DEPTH);
1567          if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1568          /* Note that brw_IF smashes the predicate_control field. */
1569          if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1570          if_depth++;
1571          break;
1572       case OPCODE_ELSE:
1573          if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1574          break;
1575       case OPCODE_ENDIF:
1576          assert(if_depth > 0);
1577          brw_ENDIF(p, if_inst[--if_depth]);
1578          break;
1579       case OPCODE_BGNLOOP:
1580          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1581          break;
1582       case OPCODE_BRK:
1583          brw_set_predicate_control(p, get_predicate(inst));
1584          brw_BREAK(p);
1585          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1586          break;
1587       case OPCODE_CONT:
1588          brw_set_predicate_control(p, get_predicate(inst));
1589          brw_CONT(p);
1590          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1591          break;
1592       case OPCODE_ENDLOOP:
1593          {
1594             struct brw_instruction *inst0, *inst1;
1595             GLuint br = 1;
1596
1597             loop_depth--;
1598
1599             if (intel->is_ironlake)
1600                br = 2;
1601
1602             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1603             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1604             while (inst0 > loop_inst[loop_depth]) {
1605                inst0--;
1606                if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1607                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1608                   inst0->bits3.if_else.pop_count = 0;
1609                }
1610                else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1611                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1612                   inst0->bits3.if_else.pop_count = 0;
1613                }
1614             }
1615          }
1616          break;
1617       case OPCODE_BRA:
1618          brw_set_predicate_control(p, get_predicate(inst));
1619          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1620          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1621          break;
1622       case OPCODE_CAL:
1623          brw_set_access_mode(p, BRW_ALIGN_1);
1624          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1625          brw_set_access_mode(p, BRW_ALIGN_16);
1626          brw_ADD(p, get_addr_reg(stack_index),
1627                          get_addr_reg(stack_index), brw_imm_d(4));
1628          brw_save_call(p, inst->Comment, p->nr_insn);
1629          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1630          break;
1631       case OPCODE_RET:
1632          brw_ADD(p, get_addr_reg(stack_index),
1633                          get_addr_reg(stack_index), brw_imm_d(-4));
1634          brw_set_access_mode(p, BRW_ALIGN_1);
1635          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1636          brw_set_access_mode(p, BRW_ALIGN_16);
1637          break;
1638       case OPCODE_END:
1639          end_offset = p->nr_insn;
1640          /* this instruction will get patched later to jump past subroutine
1641           * code, etc.
1642           */
1643          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1644          break;
1645       case OPCODE_PRINT:
1646          /* no-op */
1647          break;
1648       case OPCODE_BGNSUB:
1649          brw_save_label(p, inst->Comment, p->nr_insn);
1650          break;
1651       case OPCODE_ENDSUB:
1652          /* no-op */
1653          break;
1654       default:
1655          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1656                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1657                                     _mesa_opcode_string(inst->Opcode) :
1658                                     "unknown");
1659       }
1660
1661       /* Set the predication update on the last instruction of the native
1662        * instruction sequence.
1663        *
1664        * This would be problematic if it was set on a math instruction,
1665        * but that shouldn't be the case with the current GLSL compiler.
1666        */
1667       if (inst->CondUpdate) {
1668          struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1669
1670          assert(hw_insn->header.destreg__conditionalmod == 0);
1671          hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1672       }
1673
1674       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1675           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1676           && c->output_regs[inst->DstReg.Index].used_in_src) {
1677          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1678       }
1679
1680       /* Result color clamping.
1681        *
1682        * When destination register is an output register and
1683        * it's primary/secondary front/back color, we have to clamp
1684        * the result to [0,1]. This is done by enabling the
1685        * saturation bit for the last instruction.
1686        *
1687        * We don't use brw_set_saturate() as it modifies
1688        * p->current->header.saturate, which affects all the subsequent
1689        * instructions. Instead, we directly modify the header
1690        * of the last (already stored) instruction.
1691        */
1692       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1693          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1694              || (inst->DstReg.Index == VERT_RESULT_COL1)
1695              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1696              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1697             p->store[p->nr_insn-1].header.saturate = 1;
1698          }
1699       }
1700
1701       release_tmps(c);
1702    }
1703
1704    end_inst = &p->store[end_offset];
1705    last_inst = &p->store[p->nr_insn];
1706
1707    /* The END instruction will be patched to jump to this code */
1708    emit_vertex_write(c);
1709
1710    post_vs_emit(c, end_inst, last_inst);
1711
1712    if (INTEL_DEBUG & DEBUG_VS) {
1713       int i;
1714
1715       _mesa_printf("vs-native:\n");
1716       for (i = 0; i < p->nr_insn; i++)
1717          brw_disasm(stderr, &p->store[i]);
1718       _mesa_printf("\n");
1719    }
1720 }