Merge branch 'gallium-edgeflags'
[mesa.git] / src / gallium / drivers / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "pipe/p_shader_tokens.h"
33
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
36
37 #include "tgsi/tgsi_parse.h"
38 #include "tgsi/tgsi_dump.h"
39 #include "tgsi/tgsi_info.h"
40
41 #include "brw_context.h"
42 #include "brw_vs.h"
43 #include "brw_debug.h"
44 #include "brw_disasm.h"
45
46 /* Choose one of the 4 vec4's which can be packed into each 16-wide reg.
47 */
48 static INLINE struct brw_reg brw_vec4_grf_repeat( GLuint reg, GLuint slot )
49 {
50 int nr = reg + slot/2;
51 int subnr = (slot%2) * 4;
52
53 return stride(brw_vec4_grf(nr, subnr), 0, 4, 1);
54 }
55
56
57 static struct brw_reg get_tmp( struct brw_vs_compile *c )
58 {
59 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
60
61 if (++c->last_tmp > c->prog_data.total_grf)
62 c->prog_data.total_grf = c->last_tmp;
63
64 return tmp;
65 }
66
67 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
68 {
69 if (tmp.nr == c->last_tmp-1)
70 c->last_tmp--;
71 }
72
/* Free all scratch registers at once, returning the allocator to the
 * state left by brw_vs_alloc_regs().
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
77
78
79 static boolean is_position_output( struct brw_vs_compile *c,
80 unsigned vs_output )
81 {
82 struct brw_vertex_shader *vs = c->vp;
83
84 if (vs_output == c->prog_data.output_edgeflag) {
85 return FALSE;
86 }
87 else {
88 unsigned semantic = vs->info.output_semantic_name[vs_output];
89 unsigned index = vs->info.output_semantic_index[vs_output];
90
91 return (semantic == TGSI_SEMANTIC_POSITION &&
92 index == 0);
93 }
94 }
95
96
97 static boolean find_output_slot( struct brw_vs_compile *c,
98 unsigned vs_output,
99 unsigned *fs_input_slot )
100 {
101 struct brw_vertex_shader *vs = c->vp;
102
103 if (vs_output == c->prog_data.output_edgeflag) {
104 *fs_input_slot = c->key.fs_signature.nr_inputs;
105 return TRUE;
106 }
107 else {
108 unsigned semantic = vs->info.output_semantic_name[vs_output];
109 unsigned index = vs->info.output_semantic_index[vs_output];
110 unsigned i;
111
112 for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
113 if (c->key.fs_signature.input[i].semantic == semantic &&
114 c->key.fs_signature.input[i].semantic_index == index) {
115 *fs_input_slot = i;
116 return TRUE;
117 }
118 }
119 }
120
121 return FALSE;
122 }
123
124
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Resulting GRF layout, in order: r0 (fixed), curbe (clip planes,
 * immediates, optionally cached constants), vertex inputs, overflow
 * output space, the position output, program temporaries, address
 * reg(s), constant-buffer staging regs, flow-control stack, and
 * finally the scratch area handed out by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, subreg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1 +
       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else {
      /* XXX: immediates can go elsewhere if necessary:
       */
      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
             c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 <= BRW_MAX_GRF);

      c->vp->use_const_buffer = GL_FALSE;
   }

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Skip over fixed planes:  Or never read them into vs unit?
       */
      subreg += 6;

      for (i = 0; i < c->key.nr_userclip; i++, subreg++) {
         c->userplane[i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      subreg = align(subreg, 2);
      /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
   }


   /* Immediates: always in the curbe.
    *
    * XXX: Can try to encode some immediates as brw immediates
    * XXX: Make sure ureg sets minimal immediate size and respect it
    * here.
    */
   for (i = 0; i < c->vp->info.immediate_count; i++, subreg++) {
      c->regs[TGSI_FILE_IMMEDIATE][i] =
         stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
   }
   c->prog_data.nr_params = c->vp->info.immediate_count * 4;


   /* Vertex constant buffer.
    *
    * Constants from the buffer can be either cached in the curbe or
    * loaded as needed from the actual constant buffer.
    */
   if (!c->vp->use_const_buffer) {
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;

      for (i = 0; i < nr_params; i++, subreg++) {
         c->regs[TGSI_FILE_CONSTANT][i] =
            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
      }

      c->prog_data.nr_params += nr_params * 4;
   }

   /* All regs allocated
    */
   reg += (subreg + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;


   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;



   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = c->prog_data.nr_outputs;

   /* Base MRF for output slots; IGDNG reserves more header space. */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;


   /* Outputs that don't fit in the MRFs spill into GRFs here. */
   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
      c->overflow_grf_start = reg;
      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
      reg += c->overflow_count;
   }

   /* XXX: need to access vertex output semantics here:
    */
   for (i = 0; i < c->nr_outputs; i++) {
      unsigned slot;

      /* XXX: Put output position in slot zero always.  Clipper, etc,
       * need access to this reg.
       */
      if (is_position_output(c, i)) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
         reg++;
      }
      else if (find_output_slot(c, i, &slot)) {

         if (0 /* is_psize_output(c, i) */ ) {
            /* c->psize_out.grf = reg; */
            /* c->psize_out.mrf = i; */
         }

         /* The first (16-4) outputs can go straight into the message regs.
          */
         if (slot + mrf < BRW_MAX_MRF) {
            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
         }
         else {
            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
         }
      }
      else {
         /* Output not consumed by the fragment shader: discard writes. */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                              reg,
                                              0,
                                              BRW_REGISTER_TYPE_D,
                                              BRW_VERTICAL_STRIDE_8,
                                              BRW_WIDTH_8,
                                              BRW_HORIZONTAL_STRIDE_1,
                                              BRW_SWIZZLE_XXXX,
                                              BRW_WRITEMASK_X);
      reg++;
   }

   /* Staging registers used by get_constant() for each of the (up to
    * three) source arguments; index -1 marks the cache as empty.
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

#if 0
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
#endif

   if (c->vp->has_flow_control) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;              /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
364
365
366 /**
367 * If an instruction uses a temp reg both as a src and the dest, we
368 * sometimes need to allocate an intermediate temporary.
369 */
370 static void unalias1( struct brw_vs_compile *c,
371 struct brw_reg dst,
372 struct brw_reg arg0,
373 void (*func)( struct brw_vs_compile *,
374 struct brw_reg,
375 struct brw_reg ))
376 {
377 if (dst.file == arg0.file && dst.nr == arg0.nr) {
378 struct brw_compile *p = &c->func;
379 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
380 func(c, tmp, arg0);
381 brw_MOV(p, dst, tmp);
382 release_tmp(c, tmp);
383 }
384 else {
385 func(c, dst, arg0);
386 }
387 }
388
389 /**
390 * \sa unalias2
391 * Checkes if 2-operand instruction needs an intermediate temporary.
392 */
393 static void unalias2( struct brw_vs_compile *c,
394 struct brw_reg dst,
395 struct brw_reg arg0,
396 struct brw_reg arg1,
397 void (*func)( struct brw_vs_compile *,
398 struct brw_reg,
399 struct brw_reg,
400 struct brw_reg ))
401 {
402 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
403 (dst.file == arg1.file && dst.nr == arg1.nr)) {
404 struct brw_compile *p = &c->func;
405 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
406 func(c, tmp, arg0, arg1);
407 brw_MOV(p, dst, tmp);
408 release_tmp(c, tmp);
409 }
410 else {
411 func(c, dst, arg0, arg1);
412 }
413 }
414
415 /**
416 * \sa unalias2
417 * Checkes if 3-operand instruction needs an intermediate temporary.
418 */
419 static void unalias3( struct brw_vs_compile *c,
420 struct brw_reg dst,
421 struct brw_reg arg0,
422 struct brw_reg arg1,
423 struct brw_reg arg2,
424 void (*func)( struct brw_vs_compile *,
425 struct brw_reg,
426 struct brw_reg,
427 struct brw_reg,
428 struct brw_reg ))
429 {
430 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
431 (dst.file == arg1.file && dst.nr == arg1.nr) ||
432 (dst.file == arg2.file && dst.nr == arg2.nr)) {
433 struct brw_compile *p = &c->func;
434 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
435 func(c, tmp, arg0, arg1, arg2);
436 brw_MOV(p, dst, tmp);
437 release_tmp(c, tmp);
438 }
439 else {
440 func(c, dst, arg0, arg1, arg2);
441 }
442 }
443
/**
 * Common body of the TGSI "set on condition" opcodes:
 *   dst.chan = (arg0.chan <cond> arg1.chan) ? 1.0 : 0.0
 *
 * NOTE(review): this relies on the brw_CMP helper leaving the
 * following MOV predicated on the comparison result, so only passing
 * channels are overwritten with 1.0; the last call restores the flag
 * value for subsequent instructions -- confirm against the brw_eu
 * emit helpers before reordering anything here.
 */
static void emit_sop( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));            /* default: all channels 0.0 */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));            /* predicated: winners -> 1.0 */
   brw_set_predicate_control_flag_value(p, 0xff);
}
455
/** SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0, per channel. */
static void emit_seq( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
463
/** SNE: dst = (arg0 != arg1) ? 1.0 : 0.0, per channel. */
static void emit_sne( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/** SLT: dst = (arg0 < arg1) ? 1.0 : 0.0, per channel. */
static void emit_slt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
478
/** SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sle( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
486
/** SGT: dst = (arg0 > arg1) ? 1.0 : 0.0, per channel. */
static void emit_sgt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
494
/** SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sge( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
502
/** dst = componentwise max(arg0, arg1). */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   /* Flag channels where arg0 < arg1, select the larger operand per
    * channel, then drop the predication so later code is unaffected.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
512
/** dst = componentwise min(arg0, arg1). */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   /* Same shape as emit_max() with the SEL operands swapped: channels
    * where arg0 < arg1 keep arg0, the rest take arg1.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
522
523
/**
 * Emit a single-source extended-math operation (EXP, LOG, RSQ, ...).
 * \param function   one of the BRW_MATH_FUNCTION_* codes
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Route the result through a scratch GRF whenever dst is partially
    * write-masked or is not a GRF at all.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
559
560
/**
 * Emit a two-source extended-math operation (e.g. POW).  The second
 * operand is staged in message register m3 before the math SEND;
 * brw_math() handles the delivery of arg0.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* As in emit_math1(): use a scratch GRF when dst is write-masked
    * or not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
592
593
/**
 * Expand the TGSI EXP opcode ("noalias": dst must not overlap arg0):
 *   dst.x = 2 ^ floor(arg0.x)  (built by packing the FP exponent field)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2 ^ arg0.x         (full-precision mathbox call)
 *   dst.w = 1.0
 * Each channel is computed only if enabled in dst's writemask.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, BRW_WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
   }
}
649
650
/**
 * Expand the TGSI LOG opcode ("noalias": dst must not overlap arg0):
 *   dst.x = unbiased exponent of arg0.x
 *   dst.y = mantissa of arg0.x, re-packed as a float in [1, 2)
 *   dst.z = log2(arg0.x) = dst.x + LOG2(dst.y)  (mathbox)
 *   dst.w = 1.0
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Work in a scratch GRF when dst is write-masked or not a GRF,
    * since the steps below read back their own intermediate results.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent field down... */
      brw_AND(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ...and remove the bias to get the true exponent. */
      brw_ADD(p,
	      brw_writemask(tmp, BRW_WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
      /* Keep the mantissa bits and splice in a zero exponent (bias
       * 127), producing a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, BRW_WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, BRW_WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
740
741
/* Expand TGSI DST (distance vector):
 *   dst = { 1.0, arg0.y * arg1.y, arg0.z, arg1.w }
 *
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
}
762
763
/**
 * Cross product dst = t x u, computed with a MUL into the accumulator
 * followed by a multiply-accumulate:
 *   dst = t.yzxw * u.zxyw - t.zxyw * u.yzxw
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
772
773
/**
 * Expand TGSI LIT (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(arg0.x, 0)
 *   dst.z = (arg0.x > 0) ? max(arg0.y, 0) ^ arg0.w : 0
 *   dst.w = 1.0
 *
 * NOTE(review): in the need_tmp path the pre-zeroing below writes dst,
 * not tmp, yet the POW reads tmp.z which is only conditionally written
 * inside the IF -- looks like tmp.z may be read uninitialized when
 * arg0.y <= 0; confirm before relying on that channel.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));

      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, BRW_WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* Harmless when tmp == dst: release_tmp() only frees the newest
    * scratch register.
    */
   release_tmp(c, tmp);
}
815
/**
 * Linear interpolation: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Uses the accumulator (MUL into null, then MAC), so dst must not
 * alias any source ("noalias").
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));  /* dst = 1 - arg0 */
   brw_MUL(p, brw_null_reg(), dst, arg2);          /* acc = (1-arg0)*arg2 */
   brw_MAC(p, dst, arg0, arg1);                    /* dst = arg0*arg1 + acc */
}
828
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|.
 * \param num_comps  3 selects DP3, anything else DP4
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
852
853
/**
 * Fetch a vec4 constant from the bound constant buffer into the
 * per-argument staging register allocated in brw_vs_alloc_regs().
 * A one-entry cache per argIndex skips the read when the same index
 * was fetched last time (never for relative addressing).
 *
 * With relative addressing two owords are read -- one per half of the
 * SIMD8 pair, using the low and high halves of the address register --
 * and merged, since the two vertices may index different constants.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint index,
             GLboolean relAddr)
{
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != index || relAddr) {
      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];

      c->current_const[argIndex].index = index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src.Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * index,                    /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * index,              /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
922
923
924
925 /* TODO: relative addressing!
926 */
927 static struct brw_reg get_reg( struct brw_vs_compile *c,
928 enum tgsi_file_type file,
929 GLuint index )
930 {
931 switch (file) {
932 case TGSI_FILE_TEMPORARY:
933 case TGSI_FILE_INPUT:
934 case TGSI_FILE_OUTPUT:
935 case TGSI_FILE_CONSTANT:
936 assert(c->regs[file][index].nr != 0);
937 return c->regs[file][index];
938
939 case TGSI_FILE_ADDRESS:
940 assert(index == 0);
941 return c->regs[file][index];
942
943 case TGSI_FILE_NULL: /* undef values */
944 return brw_null_reg();
945
946 default:
947 assert(0);
948 return brw_null_reg();
949 }
950 }
951
952
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Loads one vec4 for each half of the SIMD8 pair through the hardware
 * address register a0, since the two vertices may have computed
 * different index values.  The result lives in a freshly allocated
 * scratch register that is deliberately NOT released here -- callers
 * use it as a plain source operand.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* Base byte address of the indexed slot: 32 bytes per GRF plus the
    * static element offset (16 bytes per vec4).  The TGSI address reg
    * itself already holds a byte offset -- see emit_arl().
    */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
987
988
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * Relative addressing is handled by deref() for GRF-resident files,
 * and for the constant file either by get_constant() (real constant
 * buffer) or by deref() of the curbe copy.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint file,
             GLint index,
             GLboolean relAddr )
{

   switch (file) {
   case TGSI_FILE_TEMPORARY:
   case TGSI_FILE_INPUT:
   case TGSI_FILE_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case TGSI_FILE_IMMEDIATE:
      return c->regs[file][index];

   case TGSI_FILE_CONSTANT:
      if (c->vp->use_const_buffer) {
         /* fetched on demand from the bound constant buffer */
         return get_constant(c, argIndex, index, relAddr);
      }
      else if (relAddr) {
         /* constants were cached in the curbe; index them there */
         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
      }
      else {
         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
         return c->regs[TGSI_FILE_CONSTANT][index];
      }
   case TGSI_FILE_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case TGSI_FILE_NULL:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   default:
      assert(0);
      return brw_null_reg();
   }
}
1040
1041
/**
 * TGSI ARL: dst (the address register) = floor(arg0) * 16.
 * The scale by 16 converts an element index into a byte offset (one
 * vec4 = 16 bytes), matching the byte arithmetic done in deref().
 */
static void emit_arl( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */

   if (need_tmp)
      release_tmp(c, tmp);
}
1059
1060
/**
 * Return the brw reg for the given instruction's src argument, with
 * the TGSI swizzle and negate modifiers applied.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct tgsi_full_src_register *src,
                               GLuint argIndex )
{
   struct brw_reg reg;

   if (src->Register.File == TGSI_FILE_NULL)
      return brw_null_reg();

   reg = get_src_reg(c, argIndex,
                     src->Register.File,
                     src->Register.Index,
                     src->Register.Indirect);

   /* Convert 3-bit swizzle to 2-bit.
    */
   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->Register.SwizzleX,
                                       src->Register.SwizzleY,
                                       src->Register.SwizzleZ,
                                       src->Register.SwizzleW);

   /* Only plain negation is applied here.
    */
   reg.negate = src->Register.Negate ? 1 : 0;

   /* XXX: abs, absneg
    */

   return reg;
}
1092
1093
1094 /**
1095 * Get brw register for the given program dest register.
1096 */
1097 static struct brw_reg get_dst( struct brw_vs_compile *c,
1098 unsigned file,
1099 unsigned index,
1100 unsigned writemask )
1101 {
1102 struct brw_reg reg;
1103
1104 switch (file) {
1105 case TGSI_FILE_TEMPORARY:
1106 case TGSI_FILE_OUTPUT:
1107 assert(c->regs[file][index].nr != 0);
1108 reg = c->regs[file][index];
1109 break;
1110 case TGSI_FILE_ADDRESS:
1111 assert(index == 0);
1112 reg = c->regs[file][index];
1113 break;
1114 case TGSI_FILE_NULL:
1115 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1116 reg = brw_null_reg();
1117 break;
1118 default:
1119 assert(0);
1120 reg = brw_null_reg();
1121 }
1122
1123 reg.dw1.bits.writemask = writemask;
1124
1125 return reg;
1126 }
1127
1128
1129
1130
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the vertex header (NDC coordinates, optional point size /
 * user-clip flags / negative-rhw workaround bits) and issues one or
 * more URB write messages.  Outputs that did not fit in the MRF are
 * flushed from the GRF overflow area with additional URB writes.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   int i;
   GLuint len_vertext_header = 2;

   /* Rasterizer wants an edgeflag output but the shader doesn't write
    * one: synthesize a constant 1.0 (all edges visible).
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, TGSI_FILE_OUTPUT, c->prog_data.output_edgeflag),
              brw_imm_f(1));
   }

   /* Build ndc coords */
   ndc = get_tmp(c);
   /* ndc = 1.0 / pos.w */
   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
   /* ndc.xyz = pos * ndc */
   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (c->prog_data.writes_psiz ||
       c->key.nr_userclip ||
       c->chipset.is_965)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.writes_psiz) {
         struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale by 2^11 then mask to bits 8..18 — presumably the
          * point-size field position in the VUE header DWord; TODO
          * confirm against the 965 PRM VUE layout.
          */
         brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* For each user clip plane: DP4 with conditional-mod L sets the
       * flag register when the vertex is on the negative side; the OR
       * then records that plane's bit in the header.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (c->chipset.is_965) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No special header bits needed — m1 is all zeros. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);

   if (c->chipset.is_igdng) {
      /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
      brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
      /* m4, m5 contain the distances from vertex to the user clip planeXXX.
       * Seems it is useless for us.
       * m6 is used for aligning, so that the remainder of vertex element is
       * reg-aligned.
       */
      brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
      len_vertext_header = 6;
   } else {
      brw_MOV(p, offset(m0, 3), pos);
      len_vertext_header = 2;
   }

   /* Only mark end-of-thread if there is no overflow data to follow. */
   eot = (c->overflow_count == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Not all of the vertex outputs/results fit into the MRF.
    * Move the overflowed attributes from the GRF to the MRF and
    * issue another brw_urb_WRITE().
    */
   for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
      unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
      GLuint j;

      /* Last chunk of overflow data terminates the thread. */
      eot = (i + nr >= c->overflow_count);

      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      for (j = 0; j < nr; j++) {
         brw_MOV(p, brw_message_reg(4+j),
                 brw_vec8_grf(c->overflow_grf_start + i + j, 0));
      }

      /* NOTE(review): the urb destination offset `i-1` is -1 on the
       * first iteration (i==0) — looks suspicious; verify against the
       * URB offset units expected by brw_urb_WRITE.
       */
      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    nr+1,           /* msg len */
                    0,              /* response len */
                    eot,            /* eot */
                    eot,            /* writes complete */
                    i-1,            /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1283
1284
1285 /**
1286 * Called after code generation to resolve subroutine calls and the
1287 * END instruction.
1288 * \param end_inst points to brw code for END instruction
1289 * \param last_inst points to last instruction emitted before vertex write
1290 */
1291 static void
1292 post_vs_emit( struct brw_vs_compile *c,
1293 struct brw_instruction *end_inst,
1294 struct brw_instruction *last_inst )
1295 {
1296 GLint offset;
1297
1298 brw_resolve_cals(&c->func);
1299
1300 /* patch up the END code to jump past subroutines, etc */
1301 offset = last_inst - end_inst;
1302 if (offset > 1) {
1303 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1304 } else {
1305 end_inst->header.opcode = BRW_OPCODE_NOP;
1306 }
1307 }
1308
/**
 * Map an instruction's condition swizzle to a hardware predicate
 * control value.
 *
 * Condition-code support is currently disabled (the #if 0 block
 * below), so every caller gets BRW_PREDICATE_NORMAL for now.
 */
static uint32_t
get_predicate(const struct tgsi_full_instruction *inst)
{
   /* XXX: disabling for now
    */
#if 0
   if (inst->dst.CondMask == COND_TR)
      return BRW_PREDICATE_NONE;

   /* All of GLSL only produces predicates for COND_NE and one channel per
    * vector.  Fail badly if someone starts doing something else, as it might
    * mean infinite looping or something.
    *
    * We'd like to support all the condition codes, but our hardware doesn't
    * quite match the Mesa IR, which is modeled after the NV extensions.  For
    * those, the instruction may update the condition codes or not, then any
    * later instruction may use one of those condition codes.  For gen4, the
    * instruction may update the flags register based on one of the condition
    * codes output by the instruction, and then further instructions may
    * predicate on that.  We can probably support this, but it won't
    * necessarily be easy.
    */
   /* assert(inst->dst.CondMask == COND_NE); */

   switch (inst->dst.CondSwizzle) {
   case SWIZZLE_XXXX:
      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
   case SWIZZLE_YYYY:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
   case SWIZZLE_ZZZZ:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
   case SWIZZLE_WWWW:
      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
   default:
      debug_printf("Unexpected predicate: 0x%08x\n",
                   inst->dst.CondMask);
      return BRW_PREDICATE_NORMAL;
   }
#else
   return BRW_PREDICATE_NORMAL;
#endif
}
1351
1352 static void emit_insn(struct brw_vs_compile *c,
1353 const struct tgsi_full_instruction *inst)
1354 {
1355 unsigned opcode = inst->Instruction.Opcode;
1356 unsigned label = inst->Label.Label;
1357 struct brw_compile *p = &c->func;
1358 struct brw_reg args[3], dst;
1359 GLuint i;
1360
1361 #if 0
1362 printf("%d: ", insn);
1363 _mesa_print_instruction(inst);
1364 #endif
1365
1366 /* Get argument regs.
1367 */
1368 for (i = 0; i < 3; i++) {
1369 args[i] = get_arg(c, &inst->Src[i], i);
1370 }
1371
1372 /* Get dest regs. Note that it is possible for a reg to be both
1373 * dst and arg, given the static allocation of registers. So
1374 * care needs to be taken emitting multi-operation instructions.
1375 */
1376 dst = get_dst(c,
1377 inst->Dst[0].Register.File,
1378 inst->Dst[0].Register.Index,
1379 inst->Dst[0].Register.WriteMask);
1380
1381 /* XXX: saturate
1382 */
1383 if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
1384 debug_printf("Unsupported saturate in vertex shader");
1385 }
1386
1387 switch (opcode) {
1388 case TGSI_OPCODE_ABS:
1389 brw_MOV(p, dst, brw_abs(args[0]));
1390 break;
1391 case TGSI_OPCODE_ADD:
1392 brw_ADD(p, dst, args[0], args[1]);
1393 break;
1394 case TGSI_OPCODE_COS:
1395 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1396 break;
1397 case TGSI_OPCODE_DP3:
1398 brw_DP3(p, dst, args[0], args[1]);
1399 break;
1400 case TGSI_OPCODE_DP4:
1401 brw_DP4(p, dst, args[0], args[1]);
1402 break;
1403 case TGSI_OPCODE_DPH:
1404 brw_DPH(p, dst, args[0], args[1]);
1405 break;
1406 case TGSI_OPCODE_NRM:
1407 emit_nrm(c, dst, args[0], 3);
1408 break;
1409 case TGSI_OPCODE_NRM4:
1410 emit_nrm(c, dst, args[0], 4);
1411 break;
1412 case TGSI_OPCODE_DST:
1413 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1414 break;
1415 case TGSI_OPCODE_EXP:
1416 unalias1(c, dst, args[0], emit_exp_noalias);
1417 break;
1418 case TGSI_OPCODE_EX2:
1419 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1420 break;
1421 case TGSI_OPCODE_ARL:
1422 emit_arl(c, dst, args[0]);
1423 break;
1424 case TGSI_OPCODE_FLR:
1425 brw_RNDD(p, dst, args[0]);
1426 break;
1427 case TGSI_OPCODE_FRC:
1428 brw_FRC(p, dst, args[0]);
1429 break;
1430 case TGSI_OPCODE_LOG:
1431 unalias1(c, dst, args[0], emit_log_noalias);
1432 break;
1433 case TGSI_OPCODE_LG2:
1434 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1435 break;
1436 case TGSI_OPCODE_LIT:
1437 unalias1(c, dst, args[0], emit_lit_noalias);
1438 break;
1439 case TGSI_OPCODE_LRP:
1440 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1441 break;
1442 case TGSI_OPCODE_MAD:
1443 brw_MOV(p, brw_acc_reg(), args[2]);
1444 brw_MAC(p, dst, args[0], args[1]);
1445 break;
1446 case TGSI_OPCODE_MAX:
1447 emit_max(p, dst, args[0], args[1]);
1448 break;
1449 case TGSI_OPCODE_MIN:
1450 emit_min(p, dst, args[0], args[1]);
1451 break;
1452 case TGSI_OPCODE_MOV:
1453 brw_MOV(p, dst, args[0]);
1454 break;
1455 case TGSI_OPCODE_MUL:
1456 brw_MUL(p, dst, args[0], args[1]);
1457 break;
1458 case TGSI_OPCODE_POW:
1459 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1460 break;
1461 case TGSI_OPCODE_RCP:
1462 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1463 break;
1464 case TGSI_OPCODE_RSQ:
1465 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst,
1466 brw_swizzle(args[0], 0,0,0,0), BRW_MATH_PRECISION_FULL);
1467 break;
1468 case TGSI_OPCODE_SEQ:
1469 emit_seq(p, dst, args[0], args[1]);
1470 break;
1471 case TGSI_OPCODE_SIN:
1472 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1473 break;
1474 case TGSI_OPCODE_SNE:
1475 emit_sne(p, dst, args[0], args[1]);
1476 break;
1477 case TGSI_OPCODE_SGE:
1478 emit_sge(p, dst, args[0], args[1]);
1479 break;
1480 case TGSI_OPCODE_SGT:
1481 emit_sgt(p, dst, args[0], args[1]);
1482 break;
1483 case TGSI_OPCODE_SLT:
1484 emit_slt(p, dst, args[0], args[1]);
1485 break;
1486 case TGSI_OPCODE_SLE:
1487 emit_sle(p, dst, args[0], args[1]);
1488 break;
1489 case TGSI_OPCODE_SUB:
1490 brw_ADD(p, dst, args[0], negate(args[1]));
1491 break;
1492 case TGSI_OPCODE_TRUNC:
1493 /* round toward zero */
1494 brw_RNDZ(p, dst, args[0]);
1495 break;
1496 case TGSI_OPCODE_XPD:
1497 emit_xpd(p, dst, args[0], args[1]);
1498 break;
1499 case TGSI_OPCODE_IF:
1500 assert(c->if_depth < MAX_IF_DEPTH);
1501 c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1502 /* Note that brw_IF smashes the predicate_control field. */
1503 c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1504 c->if_depth++;
1505 break;
1506 case TGSI_OPCODE_ELSE:
1507 c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1508 break;
1509 case TGSI_OPCODE_ENDIF:
1510 assert(c->if_depth > 0);
1511 brw_ENDIF(p, c->if_inst[--c->if_depth]);
1512 break;
1513 case TGSI_OPCODE_BGNLOOP:
1514 c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1515 break;
1516 case TGSI_OPCODE_BRK:
1517 brw_set_predicate_control(p, get_predicate(inst));
1518 brw_BREAK(p);
1519 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1520 break;
1521 case TGSI_OPCODE_CONT:
1522 brw_set_predicate_control(p, get_predicate(inst));
1523 brw_CONT(p);
1524 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1525 break;
1526 case TGSI_OPCODE_ENDLOOP:
1527 {
1528 struct brw_instruction *inst0, *inst1;
1529 GLuint br = 1;
1530
1531 c->loop_depth--;
1532
1533 if (c->chipset.is_igdng)
1534 br = 2;
1535
1536 inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1537 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1538 while (inst0 > c->loop_inst[c->loop_depth]) {
1539 inst0--;
1540 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1541 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1542 inst0->bits3.if_else.pop_count = 0;
1543 }
1544 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1545 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1546 inst0->bits3.if_else.pop_count = 0;
1547 }
1548 }
1549 }
1550 break;
1551 case TGSI_OPCODE_BRA:
1552 brw_set_predicate_control(p, get_predicate(inst));
1553 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1554 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1555 break;
1556 case TGSI_OPCODE_CAL:
1557 brw_set_access_mode(p, BRW_ALIGN_1);
1558 brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1559 brw_set_access_mode(p, BRW_ALIGN_16);
1560 brw_ADD(p, get_addr_reg(c->stack_index),
1561 get_addr_reg(c->stack_index), brw_imm_d(4));
1562 brw_save_call(p, label, p->nr_insn);
1563 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1564 break;
1565 case TGSI_OPCODE_RET:
1566 brw_ADD(p, get_addr_reg(c->stack_index),
1567 get_addr_reg(c->stack_index), brw_imm_d(-4));
1568 brw_set_access_mode(p, BRW_ALIGN_1);
1569 brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1570 brw_set_access_mode(p, BRW_ALIGN_16);
1571 break;
1572 case TGSI_OPCODE_END:
1573 c->end_offset = p->nr_insn;
1574 /* this instruction will get patched later to jump past subroutine
1575 * code, etc.
1576 */
1577 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1578 break;
1579 case TGSI_OPCODE_BGNSUB:
1580 brw_save_label(p, p->nr_insn, p->nr_insn);
1581 break;
1582 case TGSI_OPCODE_ENDSUB:
1583 /* no-op */
1584 break;
1585 default:
1586 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1587 opcode,
1588 tgsi_get_opcode_name(opcode));
1589 }
1590
1591 /* Set the predication update on the last instruction of the native
1592 * instruction sequence.
1593 *
1594 * This would be problematic if it was set on a math instruction,
1595 * but that shouldn't be the case with the current GLSL compiler.
1596 */
1597 #if 0
1598 /* XXX: disabled
1599 */
1600 if (inst->CondUpdate) {
1601 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1602
1603 assert(hw_insn->header.destreg__conditionalmod == 0);
1604 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1605 }
1606 #endif
1607
1608 release_tmps(c);
1609 }
1610
1611
1612 /* Emit the vertex program instructions here.
1613 */
1614 void brw_vs_emit(struct brw_vs_compile *c)
1615 {
1616 struct brw_compile *p = &c->func;
1617 const struct tgsi_token *tokens = c->vp->tokens;
1618 struct brw_instruction *end_inst, *last_inst;
1619 struct tgsi_parse_context parse;
1620 struct tgsi_full_instruction *inst;
1621
1622 if (BRW_DEBUG & DEBUG_VS)
1623 tgsi_dump(c->vp->tokens, 0);
1624
1625 c->stack_index = brw_indirect(0, 0);
1626
1627 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1628 brw_set_access_mode(p, BRW_ALIGN_16);
1629
1630
1631 /* Static register allocation
1632 */
1633 brw_vs_alloc_regs(c);
1634
1635 if (c->vp->has_flow_control) {
1636 brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1637 }
1638
1639 /* Instructions
1640 */
1641 tgsi_parse_init( &parse, tokens );
1642 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1643 tgsi_parse_token( &parse );
1644
1645 switch( parse.FullToken.Token.Type ) {
1646 case TGSI_TOKEN_TYPE_DECLARATION:
1647 case TGSI_TOKEN_TYPE_IMMEDIATE:
1648 break;
1649
1650 case TGSI_TOKEN_TYPE_INSTRUCTION:
1651 inst = &parse.FullToken.FullInstruction;
1652 emit_insn( c, inst );
1653 break;
1654
1655 default:
1656 assert( 0 );
1657 }
1658 }
1659 tgsi_parse_free( &parse );
1660
1661 end_inst = &p->store[c->end_offset];
1662 last_inst = &p->store[p->nr_insn];
1663
1664 /* The END instruction will be patched to jump to this code */
1665 emit_vertex_write(c);
1666
1667 post_vs_emit(c, end_inst, last_inst);
1668
1669 if (BRW_DEBUG & DEBUG_VS) {
1670 debug_printf("vs-native:\n");
1671 brw_disasm(stderr, p->store, p->nr_insn);
1672 }
1673 }