src/gallium/drivers/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32 #include "pipe/p_shader_tokens.h"
  33
  34 #include "util/u_memory.h"
  35 #include "util/u_math.h"
  36
  37 #include "tgsi/tgsi_parse.h"
  38 #include "tgsi/tgsi_dump.h"
  39 #include "tgsi/tgsi_info.h"
  40
  41 #include "brw_context.h"
  42 #include "brw_vs.h"
  43 #include "brw_debug.h"
  44 #include "brw_disasm.h"
  45
  46 /* Choose one of the 4 vec4's which can be packed into each 16-wide reg.
  47  */
  48 static INLINE struct brw_reg brw_vec4_grf_repeat( GLuint reg, GLuint slot )
  49 {
  50    int nr = reg + slot/2;
  51    int subnr = (slot%2) * 4;
  52
  53    return stride(brw_vec4_grf(nr, subnr), 0, 4, 1);
  54 }
  55
  56
  57 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  58 {
  59    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  60
  61    if (++c->last_tmp > c->prog_data.total_grf)
  62       c->prog_data.total_grf = c->last_tmp;
  63
  64    return tmp;
  65 }
  66
  67 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  68 {
  69    if (tmp.nr == c->last_tmp-1)
  70       c->last_tmp--;
  71 }
  72
  73 static void release_tmps( struct brw_vs_compile *c )
  74 {
  75    c->last_tmp = c->first_tmp;
  76 }
  77
  78
  79 static boolean is_position_output( struct brw_vs_compile *c,
  80                                    unsigned vs_output )
  81 {
  82    const struct brw_vertex_shader *vs = c->vp;
  83    unsigned semantic = vs->info.output_semantic_name[vs_output];
  84    unsigned index = vs->info.output_semantic_index[vs_output];
  85
  86    return (semantic == TGSI_SEMANTIC_POSITION &&
  87            index == 0);
  88 }
  89
  90
  91 static boolean find_output_slot( struct brw_vs_compile *c,
  92                                   unsigned vs_output,
  93                                   unsigned *fs_input_slot )
  94 {
  95    const struct brw_vertex_shader *vs = c->vp;
  96    unsigned semantic = vs->info.output_semantic_name[vs_output];
  97    unsigned index = vs->info.output_semantic_index[vs_output];
  98    unsigned i;
  99
 100    for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
 101       if (c->key.fs_signature.input[i].semantic == semantic &&
 102           c->key.fs_signature.input[i].semantic_index == index) {
 103          *fs_input_slot = i;
 104          return TRUE;
 105       }
 106    }
 107
 108    return FALSE;
 109 }
 110
 111
/**
 * Preallocate GRF registers before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Registers are handed out in this order, each step bumping the running
 * 'reg' counter: r0, the curbe block (user clip planes, immediates and,
 * when not using a constant buffer, the constants), shader inputs,
 * overflow space for outputs, the position output, program temporaries,
 * address registers, the three constant-fetch registers (constant-buffer
 * mode only) and the flow-control stack.  Whatever 'reg' is at the end
 * becomes first_tmp/last_tmp and prog_data.total_grf.
 *
 * Also fills in prog_data.nr_params, curb_read_length, urb_read_length
 * and urb_entry_size, and decides c->vp->use_const_buffer.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, subreg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    *
    * NOTE(review): the constant 21 is presumably headroom for the other
    * registers allocated below -- TODO confirm where it comes from.
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1 +
       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else {
      /* XXX: immediates can go elsewhere if necessary:
       *
       * NOTE(review): in this branch the sum including the constants is
       * already <= BRW_MAX_GRF, so this assert looks implied by the
       * condition above (as long as the constant count is not negative).
       * It may have been intended for the constant-buffer branch --
       * confirm.
       */
      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
             c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 <= BRW_MAX_GRF);

      c->vp->use_const_buffer = GL_FALSE;
   }

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    *
    * From here until "All regs allocated", 'subreg' counts vec4 slots
    * relative to 'reg'; two slots fit in each register.
    */
   if (c->key.nr_userclip) {
      /* Skip over fixed planes:  Or never read them into vs unit?
       */
      subreg += 6;

      for (i = 0; i < c->key.nr_userclip; i++, subreg++) {
         c->userplane[i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       * (presumably align() rounds subreg up to an even slot count)
       */
      subreg = align(subreg, 2);
      /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
   }


   /* Immediates: always in the curbe.
    *
    * XXX: Can try to encode some immediates as brw immediates
    * XXX: Make sure ureg sets minimal immediate size and respect it
    * here.
    */
   for (i = 0; i < c->vp->info.immediate_count; i++, subreg++) {
      c->regs[TGSI_FILE_IMMEDIATE][i] =
         stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
   }
   /* Four params per immediate vec4. */
   c->prog_data.nr_params = c->vp->info.immediate_count * 4;


   /* Vertex constant buffer.
    *
    * Constants from the buffer can be either cached in the curbe or
    * loaded as needed from the actual constant buffer.
    */
   if (!c->vp->use_const_buffer) {
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;

      for (i = 0; i < nr_params; i++, subreg++) {
         c->regs[TGSI_FILE_CONSTANT][i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      c->prog_data.nr_params += nr_params * 4;
   }

   /* All regs allocated
    *
    * Convert the slot count to whole registers (rounding up).  r0 is the
    * only register allocated before the curbe block, hence reg - 1.
    */
   reg += (subreg + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;


   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;



   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = c->prog_data.nr_outputs;

   /* First message register used for non-position outputs. */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;


   /* Reserve GRF space for fragment shader inputs beyond BRW_MAX_MRF.
    * overflow_grf_start/overflow_count are only written in this case.
    */
   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
      c->overflow_grf_start = reg;
      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
      reg += c->overflow_count;
   }

   /* XXX: need to access vertex output semantics here:
    */
   for (i = 0; i < c->nr_outputs; i++) {
      unsigned slot;

      /* XXX: Put output position in slot zero always.  Clipper, etc,
       * need access to this reg.
       */
      if (is_position_output(c, i)) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
         reg++;
      }
      else if (find_output_slot(c, i, &slot)) {

         if (0 /* is_psize_output(c, i) */ ) {
            /* c->psize_out.grf = reg; */
            /* c->psize_out.mrf = i; */
         }

         /* The first (16-4) outputs can go straight into the message regs.
          */
         if (slot + mrf < BRW_MAX_MRF) {
            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
         }
         else {
            /* NOTE(review): this branch is taken once slot + mrf reaches
             * BRW_MAX_MRF, but the index below subtracts BRW_MAX_MRF
             * without mrf, and overflow_grf_start is only set above when
             * nr_inputs > BRW_MAX_MRF.  For slots in
             * [BRW_MAX_MRF - mrf, BRW_MAX_MRF) this lands below
             * overflow_grf_start -- verify against the code that
             * consumes the overflow registers.
             */
            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
         }
      }
      else {
         /* Output is not position and has no matching fragment shader
          * input: writes to it go to the null register.
          */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
                                             reg,
                                             0,
                                             BRW_REGISTER_TYPE_D,
                                             BRW_VERTICAL_STRIDE_8,
                                             BRW_WIDTH_8,
                                             BRW_HORIZONTAL_STRIDE_1,
                                             BRW_SWIZZLE_XXXX,
                                             BRW_WRITEMASK_X);
      reg++;
   }

   /* In constant-buffer mode, reserve one fetch register per source
    * argument slot and mark each as holding nothing (index -1).
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

#if 0
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
#endif

   /* Flow control takes two registers for the stack. */
   if (c->vp->has_flow_control) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;           /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* Add a fixed number of extra attribute slots (6 on igdng, 2
    * otherwise) and round up to a multiple of four attributes.
    */
   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
 351
 352
 353 /**
 354  * If an instruction uses a temp reg both as a src and the dest, we
 355  * sometimes need to allocate an intermediate temporary.
 356  */
 357 static void unalias1( struct brw_vs_compile *c,
 358                       struct brw_reg dst,
 359                       struct brw_reg arg0,
 360                       void (*func)( struct brw_vs_compile *,
 361                                     struct brw_reg,
 362                                     struct brw_reg ))
 363 {
 364    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 365       struct brw_compile *p = &c->func;
 366       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 367       func(c, tmp, arg0);
 368       brw_MOV(p, dst, tmp);
 369       release_tmp(c, tmp);
 370    }
 371    else {
 372       func(c, dst, arg0);
 373    }
 374 }
 375
 376 /**
 377  * \sa unalias2
 378  * Checkes if 2-operand instruction needs an intermediate temporary.
 379  */
 380 static void unalias2( struct brw_vs_compile *c,
 381                       struct brw_reg dst,
 382                       struct brw_reg arg0,
 383                       struct brw_reg arg1,
 384                       void (*func)( struct brw_vs_compile *,
 385                                     struct brw_reg,
 386                                     struct brw_reg,
 387                                     struct brw_reg ))
 388 {
 389    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 390        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 391       struct brw_compile *p = &c->func;
 392       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 393       func(c, tmp, arg0, arg1);
 394       brw_MOV(p, dst, tmp);
 395       release_tmp(c, tmp);
 396    }
 397    else {
 398       func(c, dst, arg0, arg1);
 399    }
 400 }
 401
 402 /**
 403  * \sa unalias2
 404  * Checkes if 3-operand instruction needs an intermediate temporary.
 405  */
 406 static void unalias3( struct brw_vs_compile *c,
 407                       struct brw_reg dst,
 408                       struct brw_reg arg0,
 409                       struct brw_reg arg1,
 410                       struct brw_reg arg2,
 411                       void (*func)( struct brw_vs_compile *,
 412                                     struct brw_reg,
 413                                     struct brw_reg,
 414                                     struct brw_reg,
 415                                     struct brw_reg ))
 416 {
 417    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 418        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 419        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 420       struct brw_compile *p = &c->func;
 421       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 422       func(c, tmp, arg0, arg1, arg2);
 423       brw_MOV(p, dst, tmp);
 424       release_tmp(c, tmp);
 425    }
 426    else {
 427       func(c, dst, arg0, arg1, arg2);
 428    }
 429 }
 430
 431 static void emit_sop( struct brw_compile *p,
 432                       struct brw_reg dst,
 433                       struct brw_reg arg0,
 434                       struct brw_reg arg1,
 435                       GLuint cond)
 436 {
 437    brw_MOV(p, dst, brw_imm_f(0.0f));
 438    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 439    brw_MOV(p, dst, brw_imm_f(1.0f));
 440    brw_set_predicate_control_flag_value(p, 0xff);
 441 }
 442
 443 static void emit_seq( struct brw_compile *p,
 444                       struct brw_reg dst,
 445                       struct brw_reg arg0,
 446                       struct brw_reg arg1 )
 447 {
 448    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 449 }
 450
 451 static void emit_sne( struct brw_compile *p,
 452                       struct brw_reg dst,
 453                       struct brw_reg arg0,
 454                       struct brw_reg arg1 )
 455 {
 456    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 457 }
 458 static void emit_slt( struct brw_compile *p,
 459                       struct brw_reg dst,
 460                       struct brw_reg arg0,
 461                       struct brw_reg arg1 )
 462 {
 463    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
 464 }
 465
 466 static void emit_sle( struct brw_compile *p,
 467                       struct brw_reg dst,
 468                       struct brw_reg arg0,
 469                       struct brw_reg arg1 )
 470 {
 471    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 472 }
 473
 474 static void emit_sgt( struct brw_compile *p,
 475                       struct brw_reg dst,
 476                       struct brw_reg arg0,
 477                       struct brw_reg arg1 )
 478 {
 479    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
 480 }
 481
 482 static void emit_sge( struct brw_compile *p,
 483                       struct brw_reg dst,
 484                       struct brw_reg arg0,
 485                       struct brw_reg arg1 )
 486 {
 487   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 488 }
 489
 490 static void emit_max( struct brw_compile *p,
 491                       struct brw_reg dst,
 492                       struct brw_reg arg0,
 493                       struct brw_reg arg1 )
 494 {
 495    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 496    brw_SEL(p, dst, arg1, arg0);
 497    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 498 }
 499
 500 static void emit_min( struct brw_compile *p,
 501                       struct brw_reg dst,
 502                       struct brw_reg arg0,
 503                       struct brw_reg arg1 )
 504 {
 505    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 506    brw_SEL(p, dst, arg0, arg1);
 507    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 508 }
 509
 510
 511 static void emit_math1( struct brw_vs_compile *c,
 512                         GLuint function,
 513                         struct brw_reg dst,
 514                         struct brw_reg arg0,
 515                         GLuint precision)
 516 {
 517    /* There are various odd behaviours with SEND on the simulator.  In
 518     * addition there are documented issues with the fact that the GEN4
 519     * processor doesn't do dependency control properly on SEND
 520     * results.  So, on balance, this kludge to get around failures
 521     * with writemasked math results looks like it might be necessary
 522     * whether that turns out to be a simulator bug or not:
 523     */
 524    struct brw_compile *p = &c->func;
 525    struct brw_reg tmp = dst;
 526    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 527                          dst.file != BRW_GENERAL_REGISTER_FILE);
 528
 529    if (need_tmp)
 530       tmp = get_tmp(c);
 531
 532    brw_math(p,
 533             tmp,
 534             function,
 535             BRW_MATH_SATURATE_NONE,
 536             2,
 537             arg0,
 538             BRW_MATH_DATA_SCALAR,
 539             precision);
 540
 541    if (need_tmp) {
 542       brw_MOV(p, dst, tmp);
 543       release_tmp(c, tmp);
 544    }
 545 }
 546
 547
 548 static void emit_math2( struct brw_vs_compile *c,
 549                         GLuint function,
 550                         struct brw_reg dst,
 551                         struct brw_reg arg0,
 552                         struct brw_reg arg1,
 553                         GLuint precision)
 554 {
 555    struct brw_compile *p = &c->func;
 556    struct brw_reg tmp = dst;
 557    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 558                          dst.file != BRW_GENERAL_REGISTER_FILE);
 559
 560    if (need_tmp)
 561       tmp = get_tmp(c);
 562
 563    brw_MOV(p, brw_message_reg(3), arg1);
 564
 565    brw_math(p,
 566             tmp,
 567             function,
 568             BRW_MATH_SATURATE_NONE,
 569             2,
 570             arg0,
 571             BRW_MATH_DATA_SCALAR,
 572             precision);
 573
 574    if (need_tmp) {
 575       brw_MOV(p, dst, tmp);
 576       release_tmp(c, tmp);
 577    }
 578 }
 579
 580
 581 static void emit_exp_noalias( struct brw_vs_compile *c,
 582                               struct brw_reg dst,
 583                               struct brw_reg arg0 )
 584 {
 585    struct brw_compile *p = &c->func;
 586
 587
 588    if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
 589       struct brw_reg tmp = get_tmp(c);
 590       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 591
 592       /* tmp_d = floor(arg0.x) */
 593       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 594
 595       /* result[0] = 2.0 ^ tmp */
 596
 597       /* Adjust exponent for floating point:
 598        * exp += 127
 599        */
 600       brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
 601
 602       /* Install exponent and sign.
 603        * Excess drops off the edge:
 604        */
 605       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
 606               tmp_d, brw_imm_d(23));
 607
 608       release_tmp(c, tmp);
 609    }
 610
 611    if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
 612       /* result[1] = arg0.x - floor(arg0.x) */
 613       brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
 614    }
 615
 616    if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
 617       /* As with the LOG instruction, we might be better off just
 618        * doing a taylor expansion here, seeing as we have to do all
 619        * the prep work.
 620        *
 621        * If mathbox partial precision is too low, consider also:
 622        * result[3] = result[0] * EXP(result[1])
 623        */
 624       emit_math1(c,
 625                  BRW_MATH_FUNCTION_EXP,
 626                  brw_writemask(dst, BRW_WRITEMASK_Z),
 627                  brw_swizzle1(arg0, 0),
 628                  BRW_MATH_PRECISION_FULL);
 629    }
 630
 631    if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
 632       /* result[3] = 1.0; */
 633       brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
 634    }
 635 }
 636
 637
 638 static void emit_log_noalias( struct brw_vs_compile *c,
 639                               struct brw_reg dst,
 640                               struct brw_reg arg0 )
 641 {
 642    struct brw_compile *p = &c->func;
 643    struct brw_reg tmp = dst;
 644    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 645    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 646    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 647                          dst.file != BRW_GENERAL_REGISTER_FILE);
 648
 649    if (need_tmp) {
 650       tmp = get_tmp(c);
 651       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 652    }
 653
 654    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 655     * according to spec:
 656     *
 657     * These almost look likey they could be joined up, but not really
 658     * practical:
 659     *
 660     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 661     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 662     */
 663    if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
 664       brw_AND(p,
 665               brw_writemask(tmp_ud, BRW_WRITEMASK_X),
 666               brw_swizzle1(arg0_ud, 0),
 667               brw_imm_ud((1U<<31)-1));
 668
 669       brw_SHR(p,
 670               brw_writemask(tmp_ud, BRW_WRITEMASK_X),
 671               tmp_ud,
 672               brw_imm_ud(23));
 673
 674       brw_ADD(p,
 675               brw_writemask(tmp, BRW_WRITEMASK_X),
 676               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 677               brw_imm_d(-127));
 678    }
 679
 680    if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
 681       brw_AND(p,
 682               brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
 683               brw_swizzle1(arg0_ud, 0),
 684               brw_imm_ud((1<<23)-1));
 685
 686       brw_OR(p,
 687              brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
 688              tmp_ud,
 689              brw_imm_ud(127<<23));
 690    }
 691
 692    if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
 693       /* result[2] = result[0] + LOG2(result[1]); */
 694
 695       /* Why bother?  The above is just a hint how to do this with a
 696        * taylor series.  Maybe we *should* use a taylor series as by
 697        * the time all the above has been done it's almost certainly
 698        * quicker than calling the mathbox, even with low precision.
 699        *
 700        * Options are:
 701        *    - result[0] + mathbox.LOG2(result[1])
 702        *    - mathbox.LOG2(arg0.x)
 703        *    - result[0] + inline_taylor_approx(result[1])
 704        */
 705       emit_math1(c,
 706                  BRW_MATH_FUNCTION_LOG,
 707                  brw_writemask(tmp, BRW_WRITEMASK_Z),
 708                  brw_swizzle1(tmp, 1),
 709                  BRW_MATH_PRECISION_FULL);
 710
 711       brw_ADD(p,
 712               brw_writemask(tmp, BRW_WRITEMASK_Z),
 713               brw_swizzle1(tmp, 2),
 714               brw_swizzle1(tmp, 0));
 715    }
 716
 717    if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
 718       /* result[3] = 1.0; */
 719       brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
 720    }
 721
 722    if (need_tmp) {
 723       brw_MOV(p, dst, tmp);
 724       release_tmp(c, tmp);
 725    }
 726 }
 727
 728
 729 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 730  */
 731 static void emit_dst_noalias( struct brw_vs_compile *c,
 732                               struct brw_reg dst,
 733                               struct brw_reg arg0,
 734                               struct brw_reg arg1)
 735 {
 736    struct brw_compile *p = &c->func;
 737
 738    /* There must be a better way to do this:
 739     */
 740    if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
 741       brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
 742    if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
 743       brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
 744    if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
 745       brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
 746    if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
 747       brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
 748 }
 749
 750
 751 static void emit_xpd( struct brw_compile *p,
 752                       struct brw_reg dst,
 753                       struct brw_reg t,
 754                       struct brw_reg u)
 755 {
 756    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 757    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 758 }
 759
 760
 761 static void emit_lit_noalias( struct brw_vs_compile *c,
 762                               struct brw_reg dst,
 763                               struct brw_reg arg0 )
 764 {
 765    struct brw_compile *p = &c->func;
 766    struct brw_instruction *if_insn;
 767    struct brw_reg tmp = dst;
 768    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 769
 770    if (need_tmp)
 771       tmp = get_tmp(c);
 772
 773    brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
 774    brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));
 775
 776    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 777     * to get all channels active inside the IF.  In the clipping code
 778     * we run with NoMask, so it's not an option and we can use
 779     * BRW_EXECUTE_1 for all comparisions.
 780     */
 781    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 782    if_insn = brw_IF(p, BRW_EXECUTE_8);
 783    {
 784       brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
 785
 786       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 787       brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
 788       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 789
 790       emit_math2(c,
 791                  BRW_MATH_FUNCTION_POW,
 792                  brw_writemask(dst, BRW_WRITEMASK_Z),
 793                  brw_swizzle1(tmp, 2),
 794                  brw_swizzle1(arg0, 3),
 795                  BRW_MATH_PRECISION_PARTIAL);
 796    }
 797
 798    brw_ENDIF(p, if_insn);
 799
 800    release_tmp(c, tmp);
 801 }
 802
 803 static void emit_lrp_noalias(struct brw_vs_compile *c,
 804                              struct brw_reg dst,
 805                              struct brw_reg arg0,
 806                              struct brw_reg arg1,
 807                              struct brw_reg arg2)
 808 {
 809    struct brw_compile *p = &c->func;
 810
 811    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 812    brw_MUL(p, brw_null_reg(), dst, arg2);
 813    brw_MAC(p, dst, arg0, arg1);
 814 }
 815
 816 /** 3 or 4-component vector normalization */
 817 static void emit_nrm( struct brw_vs_compile *c,
 818                       struct brw_reg dst,
 819                       struct brw_reg arg0,
 820                       int num_comps)
 821 {
 822    struct brw_compile *p = &c->func;
 823    struct brw_reg tmp = get_tmp(c);
 824
 825    /* tmp = dot(arg0, arg0) */
 826    if (num_comps == 3)
 827       brw_DP3(p, tmp, arg0, arg0);
 828    else
 829       brw_DP4(p, tmp, arg0, arg0);
 830
 831    /* tmp = 1 / sqrt(tmp) */
 832    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 833
 834    /* dst = arg0 * tmp */
 835    brw_MUL(p, dst, arg0, tmp);
 836
 837    release_tmp(c, tmp);
 838 }
 839
 840
 841 static struct brw_reg
 842 get_constant(struct brw_vs_compile *c,
 843              GLuint argIndex,
 844              GLuint index,
 845              GLboolean relAddr)
 846 {
 847    struct brw_compile *p = &c->func;
 848    struct brw_reg const_reg;
 849    struct brw_reg const2_reg;
 850
 851    assert(argIndex < 3);
 852
 853    if (c->current_const[argIndex].index != index || relAddr) {
 854       struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
 855
 856       c->current_const[argIndex].index = index;
 857
 858 #if 0
 859       printf("  fetch const[%d] for arg %d into reg %d\n",
 860              src.Index, argIndex, c->current_const[argIndex].reg.nr);
 861 #endif
 862       /* need to fetch the constant now */
 863       brw_dp_READ_4_vs(p,
 864                        c->current_const[argIndex].reg,/* writeback dest */
 865                        0,                             /* oword */
 866                        relAddr,                       /* relative indexing? */
 867                        addrReg,                       /* address register */
 868                        16 * index,               /* byte offset */
 869                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 870                        );
 871
 872       if (relAddr) {
 873          /* second read */
 874          const2_reg = get_tmp(c);
 875
 876          /* use upper half of address reg for second read */
 877          addrReg = stride(addrReg, 0, 4, 0);
 878          addrReg.subnr = 16;
 879
 880          brw_dp_READ_4_vs(p,
 881                           const2_reg,              /* writeback dest */
 882                           1,                       /* oword */
 883                           relAddr,                 /* relative indexing? */
 884                           addrReg,                 /* address register */
 885                           16 * index,         /* byte offset */
 886                           SURF_INDEX_VERT_CONST_BUFFER
 887                           );
 888       }
 889    }
 890
 891    const_reg = c->current_const[argIndex].reg;
 892
 893    if (relAddr) {
 894       /* merge the two Owords into the constant register */
 895       /* const_reg[7..4] = const2_reg[7..4] */
 896       brw_MOV(p,
 897               suboffset(stride(const_reg, 0, 4, 1), 4),
 898               suboffset(stride(const2_reg, 0, 4, 1), 4));
 899       release_tmp(c, const2_reg);
 900    }
 901    else {
 902       /* replicate lower four floats into upper half (to get XYZWXYZW) */
 903       const_reg = stride(const_reg, 0, 4, 0);
 904       const_reg.subnr = 0;
 905    }
 906
 907    return const_reg;
 908 }
 909
 910
 911 #if 0
 912
 913 /* TODO: relative addressing!
 914  */
 915 static struct brw_reg get_reg( struct brw_vs_compile *c,
 916                                enum tgsi_file_type file,
 917                                GLuint index )
 918 {
 919    switch (file) {
 920    case TGSI_FILE_TEMPORARY:
 921    case TGSI_FILE_INPUT:
 922    case TGSI_FILE_OUTPUT:
 923    case TGSI_FILE_CONSTANT:
 924       assert(c->regs[file][index].nr != 0);
 925       return c->regs[file][index];
 926
 927    case TGSI_FILE_ADDRESS:
 928       assert(index == 0);
 929       return c->regs[file][index];
 930
 931    case TGSI_FILE_NULL:                 /* undef values */
 932       return brw_null_reg();
 933
 934    default:
 935       assert(0);
 936       return brw_null_reg();
 937    }
 938 }
 939
 940 #endif
 941
 942
 943 /**
 944  * Indirect addressing:  get reg[[arg] + offset].
 945  */
 946 static struct brw_reg deref( struct brw_vs_compile *c,
 947                              struct brw_reg arg,
 948                              GLint offset)
 949 {
 950    struct brw_compile *p = &c->func;
 951    struct brw_reg tmp = vec4(get_tmp(c));
 952    struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
 953    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 954    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 955    struct brw_reg indirect = brw_vec4_indirect(0,0);
 956
 957    {
 958       brw_push_insn_state(p);
 959       brw_set_access_mode(p, BRW_ALIGN_1);
 960
 961       /* This is pretty clunky - load the address register twice and
 962        * fetch each 4-dword value in turn.  There must be a way to do
 963        * this in a single pass, but I couldn't get it to work.
 964        */
 965       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 966       brw_MOV(p, tmp, indirect);
 967
 968       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 969       brw_MOV(p, suboffset(tmp, 4), indirect);
 970
 971       brw_pop_insn_state(p);
 972    }
 973
 974    /* NOTE: tmp not released */
 975    return vec8(tmp);
 976 }
 977
 978
 979 /**
 980  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 981  * TODO: relative addressing!
 982  */
 983 static struct brw_reg
 984 get_src_reg( struct brw_vs_compile *c,
 985              GLuint argIndex,
 986              GLuint file,
 987              GLint index,
 988              GLboolean relAddr )
 989 {
 990
 991    switch (file) {
 992    case TGSI_FILE_TEMPORARY:
 993    case TGSI_FILE_INPUT:
 994    case TGSI_FILE_OUTPUT:
 995       if (relAddr) {
 996          return deref(c, c->regs[file][0], index);
 997       }
 998       else {
 999          assert(c->regs[file][index].nr != 0);
1000          return c->regs[file][index];
1001       }
1002
1003    case TGSI_FILE_IMMEDIATE:
1004       return c->regs[file][index];
1005
1006    case TGSI_FILE_CONSTANT:
1007       if (c->vp->use_const_buffer) {
1008          return get_constant(c, argIndex, index, relAddr);
1009       }
1010       else if (relAddr) {
1011          return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
1012       }
1013       else {
1014          assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
1015          return c->regs[TGSI_FILE_CONSTANT][index];
1016       }
1017    case TGSI_FILE_ADDRESS:
1018       assert(index == 0);
1019       return c->regs[file][index];
1020
1021    case TGSI_FILE_NULL:
1022       /* this is a normal case since we loop over all three src args */
1023       return brw_null_reg();
1024
1025    default:
1026       assert(0);
1027       return brw_null_reg();
1028    }
1029 }
1030
1031
1032 static void emit_arl( struct brw_vs_compile *c,
1033                       struct brw_reg dst,
1034                       struct brw_reg arg0 )
1035 {
1036    struct brw_compile *p = &c->func;
1037    struct brw_reg tmp = dst;
1038    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1039
1040    if (need_tmp)
1041       tmp = get_tmp(c);
1042
1043    brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
1044    brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
1045
1046    if (need_tmp)
1047       release_tmp(c, tmp);
1048 }
1049
1050
1051 /**
1052  * Return the brw reg for the given instruction's src argument.
1053  */
1054 static struct brw_reg get_arg( struct brw_vs_compile *c,
1055                                const struct tgsi_full_src_register *src,
1056                                GLuint argIndex )
1057 {
1058    struct brw_reg reg;
1059
1060    if (src->Register.File == TGSI_FILE_NULL)
1061       return brw_null_reg();
1062
1063    reg = get_src_reg(c, argIndex,
1064                      src->Register.File,
1065                      src->Register.Index,
1066                      src->Register.Indirect);
1067
1068    /* Convert 3-bit swizzle to 2-bit.
1069     */
1070    reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->Register.SwizzleX,
1071                                        src->Register.SwizzleY,
1072                                        src->Register.SwizzleZ,
1073                                        src->Register.SwizzleW);
1074
1075    reg.negate = src->Register.Negate ? 1 : 0;
1076
1077    /* XXX: abs, absneg
1078     */
1079
1080    return reg;
1081 }
1082
1083
1084 /**
1085  * Get brw register for the given program dest register.
1086  */
1087 static struct brw_reg get_dst( struct brw_vs_compile *c,
1088                                unsigned file,
1089                                unsigned index,
1090                                unsigned writemask )
1091 {
1092    struct brw_reg reg;
1093
1094    switch (file) {
1095    case TGSI_FILE_TEMPORARY:
1096    case TGSI_FILE_OUTPUT:
1097       assert(c->regs[file][index].nr != 0);
1098       reg = c->regs[file][index];
1099       break;
1100    case TGSI_FILE_ADDRESS:
1101       assert(index == 0);
1102       reg = c->regs[file][index];
1103       break;
1104    case TGSI_FILE_NULL:
1105       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1106       reg = brw_null_reg();
1107       break;
1108    default:
1109       assert(0);
1110       reg = brw_null_reg();
1111    }
1112
1113    reg.dw1.bits.writemask = writemask;
1114
1115    return reg;
1116 }
1117
1118
1119
1120
1121 /**
1122  * Post-vertex-program processing.  Send the results to the URB.
1123  */
1124 static void emit_vertex_write( struct brw_vs_compile *c)
1125 {
1126    struct brw_compile *p = &c->func;
1127    struct brw_reg m0 = brw_message_reg(0);
1128    struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
1129    struct brw_reg ndc;
1130    int eot;
1131    int i;
1132    GLuint len_vertext_header = 2;
1133
1134    /* Build ndc coords */
1135    ndc = get_tmp(c);
1136    /* ndc = 1.0 / pos.w */
1137    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1138    /* ndc.xyz = pos * ndc */
1139    brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
1140
1141    /* Update the header for point size, user clipping flags, and -ve rhw
1142     * workaround.
1143     */
1144    if (c->prog_data.writes_psiz ||
1145        c->key.nr_userclip ||
1146        c->chipset.is_965)
1147    {
1148       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1149       GLuint i;
1150
1151       brw_MOV(p, header1, brw_imm_ud(0));
1152
1153       brw_set_access_mode(p, BRW_ALIGN_16);
1154
1155       if (c->prog_data.writes_psiz) {
1156          struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
1157          brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1158          brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1159       }
1160
1161       for (i = 0; i < c->key.nr_userclip; i++) {
1162          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1163          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1164          brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
1165          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1166       }
1167
1168       /* i965 clipping workaround:
1169        * 1) Test for -ve rhw
1170        * 2) If set,
1171        *      set ndc = (0,0,0,0)
1172        *      set ucp[6] = 1
1173        *
1174        * Later, clipping will detect ucp[6] and ensure the primitive is
1175        * clipped against all fixed planes.
1176        */
1177       if (c->chipset.is_965) {
1178          brw_CMP(p,
1179                  vec8(brw_null_reg()),
1180                  BRW_CONDITIONAL_L,
1181                  brw_swizzle1(ndc, 3),
1182                  brw_imm_f(0));
1183
1184          brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
1185          brw_MOV(p, ndc, brw_imm_f(0));
1186          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1187       }
1188
1189       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1190       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1191       brw_set_access_mode(p, BRW_ALIGN_16);
1192
1193       release_tmp(c, header1);
1194    }
1195    else {
1196       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1197    }
1198
1199    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1200     * of zeros followed by two sets of NDC coordinates:
1201     */
1202    brw_set_access_mode(p, BRW_ALIGN_1);
1203    brw_MOV(p, offset(m0, 2), ndc);
1204
1205    if (c->chipset.is_igdng) {
1206        /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1207        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1208        /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1209         * Seems it is useless for us.
1210         * m6 is used for aligning, so that the remainder of vertex element is
1211         * reg-aligned.
1212         */
1213        brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1214        len_vertext_header = 6;
1215    } else {
1216        brw_MOV(p, offset(m0, 3), pos);
1217        len_vertext_header = 2;
1218    }
1219
1220    eot = (c->overflow_count == 0);
1221
1222    brw_urb_WRITE(p,
1223                  brw_null_reg(), /* dest */
1224                  0,             /* starting mrf reg nr */
1225                  c->r0,         /* src */
1226                  0,             /* allocate */
1227                  1,             /* used */
1228                  MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1229                  0,             /* response len */
1230                  eot,           /* eot */
1231                  eot,           /* writes complete */
1232                  0,             /* urb destination offset */
1233                  BRW_URB_SWIZZLE_INTERLEAVE);
1234
1235    /* Not all of the vertex outputs/results fit into the MRF.
1236     * Move the overflowed attributes from the GRF to the MRF and
1237     * issue another brw_urb_WRITE().
1238     */
1239    for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
1240       unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
1241       GLuint j;
1242
1243       eot = (i + nr >= c->overflow_count);
1244
1245       /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
1246        * at mrf[4] atm...
1247        */
1248       for (j = 0; j < nr; j++) {
1249          brw_MOV(p, brw_message_reg(4+j),
1250                  brw_vec8_grf(c->overflow_grf_start + i + j, 0));
1251       }
1252
1253       brw_urb_WRITE(p,
1254                     brw_null_reg(), /* dest */
1255                     4,              /* starting mrf reg nr */
1256                     c->r0,          /* src */
1257                     0,              /* allocate */
1258                     1,              /* used */
1259                     nr+1,          /* msg len */
1260                     0,              /* response len */
1261                     eot,            /* eot */
1262                     eot,            /* writes complete */
1263                     i-1,            /* urb destination offset */
1264                     BRW_URB_SWIZZLE_INTERLEAVE);
1265    }
1266 }
1267
1268
1269 /**
1270  * Called after code generation to resolve subroutine calls and the
1271  * END instruction.
1272  * \param end_inst  points to brw code for END instruction
1273  * \param last_inst  points to last instruction emitted before vertex write
1274  */
1275 static void
1276 post_vs_emit( struct brw_vs_compile *c,
1277               struct brw_instruction *end_inst,
1278               struct brw_instruction *last_inst )
1279 {
1280    GLint offset;
1281
1282    brw_resolve_cals(&c->func);
1283
1284    /* patch up the END code to jump past subroutines, etc */
1285    offset = last_inst - end_inst;
1286    if (offset > 1) {
1287       brw_set_src1(end_inst, brw_imm_d(offset * 16));
1288    } else {
1289       end_inst->header.opcode = BRW_OPCODE_NOP;
1290    }
1291 }
1292
1293 static uint32_t
1294 get_predicate(const struct tgsi_full_instruction *inst)
1295 {
1296    /* XXX: disabling for now
1297     */
1298 #if 0
1299    if (inst->dst.CondMask == COND_TR)
1300       return BRW_PREDICATE_NONE;
1301
1302    /* All of GLSL only produces predicates for COND_NE and one channel per
1303     * vector.  Fail badly if someone starts doing something else, as it might
1304     * mean infinite looping or something.
1305     *
1306     * We'd like to support all the condition codes, but our hardware doesn't
1307     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1308     * those, the instruction may update the condition codes or not, then any
1309     * later instruction may use one of those condition codes.  For gen4, the
1310     * instruction may update the flags register based on one of the condition
1311     * codes output by the instruction, and then further instructions may
1312     * predicate on that.  We can probably support this, but it won't
1313     * necessarily be easy.
1314     */
1315 /*   assert(inst->dst.CondMask == COND_NE); */
1316
1317    switch (inst->dst.CondSwizzle) {
1318    case SWIZZLE_XXXX:
1319       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1320    case SWIZZLE_YYYY:
1321       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1322    case SWIZZLE_ZZZZ:
1323       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1324    case SWIZZLE_WWWW:
1325       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1326    default:
1327       debug_printf("Unexpected predicate: 0x%08x\n",
1328                     inst->dst.CondMask);
1329       return BRW_PREDICATE_NORMAL;
1330    }
1331 #else
1332    return BRW_PREDICATE_NORMAL;
1333 #endif
1334 }
1335
1336 static void emit_insn(struct brw_vs_compile *c,
1337                       const struct tgsi_full_instruction *inst)
1338 {
1339    unsigned opcode = inst->Instruction.Opcode;
1340    unsigned label = inst->Label.Label;
1341    struct brw_compile *p = &c->func;
1342    struct brw_reg args[3], dst;
1343    GLuint i;
1344
1345 #if 0
1346    printf("%d: ", insn);
1347    _mesa_print_instruction(inst);
1348 #endif
1349
1350    /* Get argument regs.
1351     */
1352    for (i = 0; i < 3; i++) {
1353       args[i] = get_arg(c, &inst->Src[i], i);
1354    }
1355
1356    /* Get dest regs.  Note that it is possible for a reg to be both
1357     * dst and arg, given the static allocation of registers.  So
1358     * care needs to be taken emitting multi-operation instructions.
1359     */
1360    dst = get_dst(c,
1361                  inst->Dst[0].Register.File,
1362                  inst->Dst[0].Register.Index,
1363                  inst->Dst[0].Register.WriteMask);
1364
1365    /* XXX: saturate
1366     */
1367    if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
1368       debug_printf("Unsupported saturate in vertex shader");
1369    }
1370
1371    switch (opcode) {
1372    case TGSI_OPCODE_ABS:
1373       brw_MOV(p, dst, brw_abs(args[0]));
1374       break;
1375    case TGSI_OPCODE_ADD:
1376       brw_ADD(p, dst, args[0], args[1]);
1377       break;
1378    case TGSI_OPCODE_COS:
1379       emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1380       break;
1381    case TGSI_OPCODE_DP3:
1382       brw_DP3(p, dst, args[0], args[1]);
1383       break;
1384    case TGSI_OPCODE_DP4:
1385       brw_DP4(p, dst, args[0], args[1]);
1386       break;
1387    case TGSI_OPCODE_DPH:
1388       brw_DPH(p, dst, args[0], args[1]);
1389       break;
1390    case TGSI_OPCODE_NRM:
1391       emit_nrm(c, dst, args[0], 3);
1392       break;
1393    case TGSI_OPCODE_NRM4:
1394       emit_nrm(c, dst, args[0], 4);
1395       break;
1396    case TGSI_OPCODE_DST:
1397       unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1398       break;
1399    case TGSI_OPCODE_EXP:
1400       unalias1(c, dst, args[0], emit_exp_noalias);
1401       break;
1402    case TGSI_OPCODE_EX2:
1403       emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1404       break;
1405    case TGSI_OPCODE_ARL:
1406       emit_arl(c, dst, args[0]);
1407       break;
1408    case TGSI_OPCODE_FLR:
1409       brw_RNDD(p, dst, args[0]);
1410       break;
1411    case TGSI_OPCODE_FRC:
1412       brw_FRC(p, dst, args[0]);
1413       break;
1414    case TGSI_OPCODE_LOG:
1415       unalias1(c, dst, args[0], emit_log_noalias);
1416       break;
1417    case TGSI_OPCODE_LG2:
1418       emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1419       break;
1420    case TGSI_OPCODE_LIT:
1421       unalias1(c, dst, args[0], emit_lit_noalias);
1422       break;
1423    case TGSI_OPCODE_LRP:
1424       unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1425       break;
1426    case TGSI_OPCODE_MAD:
1427       brw_MOV(p, brw_acc_reg(), args[2]);
1428       brw_MAC(p, dst, args[0], args[1]);
1429       break;
1430    case TGSI_OPCODE_MAX:
1431       emit_max(p, dst, args[0], args[1]);
1432       break;
1433    case TGSI_OPCODE_MIN:
1434       emit_min(p, dst, args[0], args[1]);
1435       break;
1436    case TGSI_OPCODE_MOV:
1437       brw_MOV(p, dst, args[0]);
1438       break;
1439    case TGSI_OPCODE_MUL:
1440       brw_MUL(p, dst, args[0], args[1]);
1441       break;
1442    case TGSI_OPCODE_POW:
1443       emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1444       break;
1445    case TGSI_OPCODE_RCP:
1446       emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1447       break;
1448    case TGSI_OPCODE_RSQ:
1449       emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst,
1450                  brw_swizzle(args[0], 0,0,0,0), BRW_MATH_PRECISION_FULL);
1451       break;
1452    case TGSI_OPCODE_SEQ:
1453       emit_seq(p, dst, args[0], args[1]);
1454       break;
1455    case TGSI_OPCODE_SIN:
1456       emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1457       break;
1458    case TGSI_OPCODE_SNE:
1459       emit_sne(p, dst, args[0], args[1]);
1460       break;
1461    case TGSI_OPCODE_SGE:
1462       emit_sge(p, dst, args[0], args[1]);
1463       break;
1464    case TGSI_OPCODE_SGT:
1465       emit_sgt(p, dst, args[0], args[1]);
1466       break;
1467    case TGSI_OPCODE_SLT:
1468       emit_slt(p, dst, args[0], args[1]);
1469       break;
1470    case TGSI_OPCODE_SLE:
1471       emit_sle(p, dst, args[0], args[1]);
1472       break;
1473    case TGSI_OPCODE_SUB:
1474       brw_ADD(p, dst, args[0], negate(args[1]));
1475       break;
1476    case TGSI_OPCODE_TRUNC:
1477       /* round toward zero */
1478       brw_RNDZ(p, dst, args[0]);
1479       break;
1480    case TGSI_OPCODE_XPD:
1481       emit_xpd(p, dst, args[0], args[1]);
1482       break;
1483    case TGSI_OPCODE_IF:
1484       assert(c->if_depth < MAX_IF_DEPTH);
1485       c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1486       /* Note that brw_IF smashes the predicate_control field. */
1487       c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1488       c->if_depth++;
1489       break;
1490    case TGSI_OPCODE_ELSE:
1491       c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1492       break;
1493    case TGSI_OPCODE_ENDIF:
1494       assert(c->if_depth > 0);
1495       brw_ENDIF(p, c->if_inst[--c->if_depth]);
1496       break;
1497    case TGSI_OPCODE_BGNLOOP:
1498       c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1499       break;
1500    case TGSI_OPCODE_BRK:
1501       brw_set_predicate_control(p, get_predicate(inst));
1502       brw_BREAK(p);
1503       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1504       break;
1505    case TGSI_OPCODE_CONT:
1506       brw_set_predicate_control(p, get_predicate(inst));
1507       brw_CONT(p);
1508       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1509       break;
1510    case TGSI_OPCODE_ENDLOOP:
1511    {
1512       struct brw_instruction *inst0, *inst1;
1513       GLuint br = 1;
1514
1515       c->loop_depth--;
1516
1517       if (c->chipset.is_igdng)
1518          br = 2;
1519
1520       inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1521       /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1522       while (inst0 > c->loop_inst[c->loop_depth]) {
1523          inst0--;
1524          if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1525             inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1526             inst0->bits3.if_else.pop_count = 0;
1527          }
1528          else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1529             inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1530             inst0->bits3.if_else.pop_count = 0;
1531          }
1532       }
1533    }
1534    break;
1535    case TGSI_OPCODE_BRA:
1536       brw_set_predicate_control(p, get_predicate(inst));
1537       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1538       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1539       break;
1540    case TGSI_OPCODE_CAL:
1541       brw_set_access_mode(p, BRW_ALIGN_1);
1542       brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1543       brw_set_access_mode(p, BRW_ALIGN_16);
1544       brw_ADD(p, get_addr_reg(c->stack_index),
1545               get_addr_reg(c->stack_index), brw_imm_d(4));
1546       brw_save_call(p, label, p->nr_insn);
1547       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1548       break;
1549    case TGSI_OPCODE_RET:
1550       brw_ADD(p, get_addr_reg(c->stack_index),
1551               get_addr_reg(c->stack_index), brw_imm_d(-4));
1552       brw_set_access_mode(p, BRW_ALIGN_1);
1553       brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1554       brw_set_access_mode(p, BRW_ALIGN_16);
1555       break;
1556    case TGSI_OPCODE_END:
1557       c->end_offset = p->nr_insn;
1558       /* this instruction will get patched later to jump past subroutine
1559        * code, etc.
1560        */
1561       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1562       break;
1563    case TGSI_OPCODE_BGNSUB:
1564       brw_save_label(p, p->nr_insn, p->nr_insn);
1565       break;
1566    case TGSI_OPCODE_ENDSUB:
1567       /* no-op */
1568       break;
1569    default:
1570       debug_printf("Unsupported opcode %i (%s) in vertex shader",
1571                    opcode,
1572                    tgsi_get_opcode_name(opcode));
1573    }
1574
1575    /* Set the predication update on the last instruction of the native
1576     * instruction sequence.
1577     *
1578     * This would be problematic if it was set on a math instruction,
1579     * but that shouldn't be the case with the current GLSL compiler.
1580     */
1581 #if 0
1582    /* XXX: disabled
1583     */
1584    if (inst->CondUpdate) {
1585       struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1586
1587       assert(hw_insn->header.destreg__conditionalmod == 0);
1588       hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1589    }
1590 #endif
1591
1592    release_tmps(c);
1593 }
1594
1595
1596 /* Emit the vertex program instructions here.
1597  */
1598 void brw_vs_emit(struct brw_vs_compile *c)
1599 {
1600    struct brw_compile *p = &c->func;
1601    const struct tgsi_token *tokens = c->vp->tokens;
1602    struct brw_instruction *end_inst, *last_inst;
1603    struct tgsi_parse_context parse;
1604    struct tgsi_full_instruction *inst;
1605
1606    if (BRW_DEBUG & DEBUG_VS)
1607       tgsi_dump(c->vp->tokens, 0);
1608
1609    c->stack_index = brw_indirect(0, 0);
1610
1611    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1612    brw_set_access_mode(p, BRW_ALIGN_16);
1613
1614
1615    /* Static register allocation
1616     */
1617    brw_vs_alloc_regs(c);
1618
1619    if (c->vp->has_flow_control) {
1620       brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1621    }
1622
1623    /* Instructions
1624     */
1625    tgsi_parse_init( &parse, tokens );
1626    while( !tgsi_parse_end_of_tokens( &parse ) ) {
1627       tgsi_parse_token( &parse );
1628
1629       switch( parse.FullToken.Token.Type ) {
1630       case TGSI_TOKEN_TYPE_DECLARATION:
1631       case TGSI_TOKEN_TYPE_IMMEDIATE:
1632          break;
1633
1634       case TGSI_TOKEN_TYPE_INSTRUCTION:
1635          inst = &parse.FullToken.FullInstruction;
1636          emit_insn( c, inst );
1637          break;
1638
1639       default:
1640          assert( 0 );
1641       }
1642    }
1643    tgsi_parse_free( &parse );
1644
1645    end_inst = &p->store[c->end_offset];
1646    last_inst = &p->store[p->nr_insn];
1647
1648    /* The END instruction will be patched to jump to this code */
1649    emit_vertex_write(c);
1650
1651    post_vs_emit(c, end_inst, last_inst);
1652
1653    if (BRW_DEBUG & DEBUG_VS) {
1654       debug_printf("vs-native:\n");
1655       brw_disasm(stderr, p->store, p->nr_insn);
1656    }
1657 }