1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return whether the given source arg of this opcode can be an immediate float
41  * operand instead of a PROGRAM_CONSTANT value fetched through push/pull constants.
42  */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
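   /* The table above gives each opcode's source-arg count; combined with
    * the check at the bottom of this function, only the last source slot
    * of an instruction is allowed to be an immediate.
    */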
65
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode >= ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
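/* Temps are allocated stack-like; only the most recently allocated temp
 * can actually be reclaimed here, so out-of-order releases are silently
 * ignored (see release_tmps() for the bulk reset).
 */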
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
135 /**
136  * Preallocate GRF registers before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
138 * ahead of time.
139 */
140 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
141 {
142 struct intel_context *intel = &c->func.brw->intel;
143 GLuint i, reg = 0, mrf, j;
144 int attributes_in_vue;
145 int first_reladdr_output;
146 int max_constant;
147 int constant = 0;
148 int vert_result_reorder[VERT_RESULT_MAX];
149 int bfc = 0;
150
151 /* Determine whether to use a real constant buffer or use a block
152  * of GRF registers for constants.  The latter is faster but only
153 * works if everything fits in the GRF.
154 * XXX this heuristic/check may need some fine tuning...
155 */
156 if (c->vp->program.Base.Parameters->NumParameters +
157 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
158 c->vp->use_const_buffer = GL_TRUE;
159 else
160 c->vp->use_const_buffer = GL_FALSE;
161
162 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
163
164 /* r0 -- reserved as usual
165 */
166 c->r0 = brw_vec8_grf(reg, 0);
167 reg++;
168
169 /* User clip planes from curbe:
170 */
171 if (c->key.nr_userclip) {
172 if (intel->gen >= 6) {
173 for (i = 0; i < c->key.nr_userclip; i++) {
174 c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
175 (i % 2) * 4), 0, 4, 1);
176 }
177 reg += ALIGN(c->key.nr_userclip, 2) / 2;
178 } else {
179 for (i = 0; i < c->key.nr_userclip; i++) {
180 c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
181 (i % 2) * 4), 0, 4, 1);
182 }
183 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
184 }
185
186 }
187
188 /* Assign some (probably all) of the vertex program constants to
189 * the push constant buffer/CURBE.
190 *
191  * There's an obvious limit to the number of push constants equal to
192  * the number of registers available, and that number is smaller
193 * than the minimum maximum number of vertex program parameters, so
194 * support for pull constants is required if we overflow.
195 * Additionally, on gen6 the number of push constants is even
196 * lower.
197 *
198 * When there's relative addressing, we don't know what range of
199 * Mesa IR registers can be accessed. And generally, when relative
200 * addressing is used we also have too many constants to load them
201 * all as push constants. So, we'll just support relative
202 * addressing out of the pull constant buffers, and try to load as
203 * many statically-accessed constants into the push constant buffer
204 * as we can.
205 */
206 if (intel->gen >= 6) {
207 /* We can only load 32 regs of push constants. */
208 max_constant = 32 * 2 - c->key.nr_userclip;
209 } else {
210 max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
211 }
212
213 /* constant_map maps from ParameterValues[] index to index in the
214 * push constant buffer, or -1 if it's only in the pull constant
215 * buffer.
216 */
217 memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
218 for (i = 0;
219 i < c->vp->program.Base.NumInstructions && constant < max_constant;
220 i++) {
221 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
222 int arg;
223
224 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
225 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
226 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
227 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
228 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
229 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
230 continue;
231 }
232
233 if (inst->SrcReg[arg].RelAddr) {
234 c->vp->use_const_buffer = GL_TRUE;
235 continue;
236 }
237
238 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
239 c->constant_map[inst->SrcReg[arg].Index] = constant++;
240 }
241 }
242 }
243
244 /* If we ran out of push constant space, then we'll also upload all
245 * constants through the pull constant buffer so that they can be
246 * accessed no matter what. For relative addressing (the common
247 * case) we need them all in place anyway.
248 */
249 if (constant == max_constant)
250 c->vp->use_const_buffer = GL_TRUE;
251
252 for (i = 0; i < constant; i++) {
253 c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
254 (i % 2) * 4),
255 0, 4, 1);
256 }
257 reg += (constant + 1) / 2;
258 c->prog_data.curb_read_length = reg - 1;
259 c->prog_data.nr_params = constant * 4;
260 /* XXX 0 causes a bug elsewhere... */
261 if (intel->gen < 6 && c->prog_data.nr_params == 0)
262 c->prog_data.nr_params = 4;
263
264 /* Allocate input regs:
265 */
266 c->nr_inputs = 0;
267 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
268 if (c->prog_data.inputs_read & (1 << i)) {
269 c->nr_inputs++;
270 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
271 reg++;
272 }
273 }
274 /* If there are no inputs, we'll still be reading one attribute's worth
275 * because it's required -- see urb_read_length setting.
276 */
277 if (c->nr_inputs == 0)
278 reg++;
279
280 /* Allocate outputs. The non-position outputs go straight into message regs.
281 */
282 c->nr_outputs = 0;
283 c->first_output = reg;
284 c->first_overflow_output = 0;
285
286 if (intel->gen >= 6) {
287 mrf = 3;
288 if (c->key.nr_userclip)
289 mrf += 2;
290 } else if (intel->gen == 5)
291 mrf = 8;
292 else
293 mrf = 4;
294
295 first_reladdr_output = get_first_reladdr_output(&c->vp->program);
296
297 for (i = 0; i < VERT_RESULT_MAX; i++)
298 vert_result_reorder[i] = i;
299
300 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
301 if (intel->gen >= 6 && c->key.two_side_color) {
302 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
303 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
304 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
305 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
306 bfc = 2;
307 } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
308 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
309 bfc = 1;
310
311 if (bfc) {
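         /* With bfc == 2 the VUE slot order becomes COL0, BFC0, COL1,
          * BFC1, and every result that used to follow the colors is
          * shifted down by bfc slots by the second loop below.
          */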
312 for (i = 0; i < bfc; i++) {
313 vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
314 vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
315 }
316
317 for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
318 vert_result_reorder[i] = i - bfc;
319 }
320 }
321 }
322
323 for (j = 0; j < VERT_RESULT_MAX; j++) {
324 i = vert_result_reorder[j];
325
326 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
327 c->nr_outputs++;
328 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
329 if (i == VERT_RESULT_HPOS) {
330 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
331 reg++;
332 }
333 else if (i == VERT_RESULT_PSIZ) {
334 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
335 reg++;
336 }
337 else {
338 /* Two restrictions on our compute-to-MRF here. The
339 * message length for all SEND messages is restricted to
340 * [1,15], so we can't use mrf 15, as that means a length
341 * of 16.
342 *
343 * Additionally, URB writes are aligned to URB rows, so we
344 * need to put an even number of registers of URB data in
345 * each URB write so that the later write is aligned. A
346 * message length of 15 means 1 message header reg plus 14
347 * regs of URB data.
348 *
349 * For attributes beyond the compute-to-MRF, we compute to
350 * GRFs and they will be written in the second URB_WRITE.
351 */
352 if (first_reladdr_output > i && mrf < 15) {
353 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
354 mrf++;
355 }
356 else {
357 if (mrf >= 15 && !c->first_overflow_output)
358 c->first_overflow_output = i;
359 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
360 reg++;
361 mrf++;
362 }
363 }
364 }
365 }
366
367 /* Allocate program temporaries:
368 */
369 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
370 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
371 reg++;
372 }
373
374 /* Address reg(s). Don't try to use the internal address reg until
375 * deref time.
376 */
377 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
378 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
379 reg,
380 0,
381 BRW_REGISTER_TYPE_D,
382 BRW_VERTICAL_STRIDE_8,
383 BRW_WIDTH_8,
384 BRW_HORIZONTAL_STRIDE_1,
385 BRW_SWIZZLE_XXXX,
386 WRITEMASK_X);
387 reg++;
388 }
389
390 if (c->vp->use_const_buffer) {
391 for (i = 0; i < 3; i++) {
392 c->current_const[i].reg = brw_vec8_grf(reg, 0);
393 reg++;
394 }
395 clear_current_const(c);
396 }
397
398 for (i = 0; i < 128; i++) {
399 if (c->output_regs[i].used_in_src) {
400 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
401 reg++;
402 }
403 }
404
405 if (c->needs_stack) {
406 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
407 reg += 2;
408 }
409
410 /* Some opcodes need an internal temporary:
411 */
412 c->first_tmp = reg;
413 c->last_tmp = reg; /* for allocation purposes */
414
415 /* Each input reg holds data from two vertices. The
416 * urb_read_length is the number of registers read from *each*
417 * vertex urb, so is half the amount:
418 */
419 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
420 /* Setting this field to 0 leads to undefined behavior according to the
421  * VS_STATE docs.  Our VUEs will always have at least one attribute
422 * sitting in them, even if it's padding.
423 */
424 if (c->prog_data.urb_read_length == 0)
425 c->prog_data.urb_read_length = 1;
426
427 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
428 * them to fit the biggest thing they need to.
429 */
430 attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
431
432 /* See emit_vertex_write() for where the VUE's overhead on top of the
433 * attributes comes from.
434 */
435 if (intel->gen >= 6) {
436 int header_regs = 2;
437 if (c->key.nr_userclip)
438 header_regs += 2;
439
440 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
441 * number of 128-byte (1024-bit) units.
442 */
443 c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
444 } else if (intel->gen == 5)
445 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
446 * number of 64-byte (512-bit) units.
447 */
448 c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
449 else
450 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
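   /* Example for the gen6 case above: 10 attributes plus the two-reg
    * header round up to (10 + 2 + 7) / 8 = 2 of the 1024-bit allocation
    * units.
    */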
451
452 c->prog_data.total_grf = reg;
453
454 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
455 printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
456 printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
457 printf("%s reg = %d\n", __FUNCTION__, reg);
458 }
459 }
460
461
462 /**
463 * If an instruction uses a temp reg both as a src and the dest, we
464 * sometimes need to allocate an intermediate temporary.
465 */
466 static void unalias1( struct brw_vs_compile *c,
467 struct brw_reg dst,
468 struct brw_reg arg0,
469 void (*func)( struct brw_vs_compile *,
470 struct brw_reg,
471 struct brw_reg ))
472 {
473 if (dst.file == arg0.file && dst.nr == arg0.nr) {
474 struct brw_compile *p = &c->func;
475 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
476 func(c, tmp, arg0);
477 brw_MOV(p, dst, tmp);
478 release_tmp(c, tmp);
479 }
480 else {
481 func(c, dst, arg0);
482 }
483 }
484
485 /**
486  * \sa unalias1
487  * Checks if a 2-operand instruction needs an intermediate temporary.
488 */
489 static void unalias2( struct brw_vs_compile *c,
490 struct brw_reg dst,
491 struct brw_reg arg0,
492 struct brw_reg arg1,
493 void (*func)( struct brw_vs_compile *,
494 struct brw_reg,
495 struct brw_reg,
496 struct brw_reg ))
497 {
498 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
499 (dst.file == arg1.file && dst.nr == arg1.nr)) {
500 struct brw_compile *p = &c->func;
501 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
502 func(c, tmp, arg0, arg1);
503 brw_MOV(p, dst, tmp);
504 release_tmp(c, tmp);
505 }
506 else {
507 func(c, dst, arg0, arg1);
508 }
509 }
510
511 /**
512 * \sa unalias2
513  * Checks if a 3-operand instruction needs an intermediate temporary.
514 */
515 static void unalias3( struct brw_vs_compile *c,
516 struct brw_reg dst,
517 struct brw_reg arg0,
518 struct brw_reg arg1,
519 struct brw_reg arg2,
520 void (*func)( struct brw_vs_compile *,
521 struct brw_reg,
522 struct brw_reg,
523 struct brw_reg,
524 struct brw_reg ))
525 {
526 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
527 (dst.file == arg1.file && dst.nr == arg1.nr) ||
528 (dst.file == arg2.file && dst.nr == arg2.nr)) {
529 struct brw_compile *p = &c->func;
530 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
531 func(c, tmp, arg0, arg1, arg2);
532 brw_MOV(p, dst, tmp);
533 release_tmp(c, tmp);
534 }
535 else {
536 func(c, dst, arg0, arg1, arg2);
537 }
538 }
539
540 static void emit_sop( struct brw_vs_compile *c,
541 struct brw_reg dst,
542 struct brw_reg arg0,
543 struct brw_reg arg1,
544 GLuint cond)
545 {
546 struct brw_compile *p = &c->func;
547
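   /* brw_CMP() with a null dest leaves following instructions predicated
    * on the comparison result, so the second MOV writes 1.0 only to the
    * channels that passed; brw_set_predicate_control_flag_value() then
    * clears the predication state again.
    */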
548 brw_MOV(p, dst, brw_imm_f(0.0f));
549 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
550 brw_MOV(p, dst, brw_imm_f(1.0f));
551 brw_set_predicate_control_flag_value(p, 0xff);
552 }
553
554 static void emit_seq( struct brw_vs_compile *c,
555 struct brw_reg dst,
556 struct brw_reg arg0,
557 struct brw_reg arg1 )
558 {
559 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
560 }
561
562 static void emit_sne( struct brw_vs_compile *c,
563 struct brw_reg dst,
564 struct brw_reg arg0,
565 struct brw_reg arg1 )
566 {
567 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
568 }
569 static void emit_slt( struct brw_vs_compile *c,
570 struct brw_reg dst,
571 struct brw_reg arg0,
572 struct brw_reg arg1 )
573 {
574 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
575 }
576
577 static void emit_sle( struct brw_vs_compile *c,
578 struct brw_reg dst,
579 struct brw_reg arg0,
580 struct brw_reg arg1 )
581 {
582 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
583 }
584
585 static void emit_sgt( struct brw_vs_compile *c,
586 struct brw_reg dst,
587 struct brw_reg arg0,
588 struct brw_reg arg1 )
589 {
590 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
591 }
592
593 static void emit_sge( struct brw_vs_compile *c,
594 struct brw_reg dst,
595 struct brw_reg arg0,
596 struct brw_reg arg1 )
597 {
598 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
599 }
600
601 static void emit_cmp( struct brw_compile *p,
602 struct brw_reg dst,
603 struct brw_reg arg0,
604 struct brw_reg arg1,
605 struct brw_reg arg2 )
606 {
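   /* dst = (arg0 < 0.0) ? arg1 : arg2 -- the CMP predicates the SEL per
    * channel, and predication is reset afterwards.
    */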
607 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
608 brw_SEL(p, dst, arg1, arg2);
609 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
610 }
611
612 static void emit_sign(struct brw_vs_compile *c,
613 struct brw_reg dst,
614 struct brw_reg arg0)
615 {
616 struct brw_compile *p = &c->func;
617
618 brw_MOV(p, dst, brw_imm_f(0));
619
620 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
621 brw_MOV(p, dst, brw_imm_f(-1.0));
622 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
623
624 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
625 brw_MOV(p, dst, brw_imm_f(1.0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
627 }
628
629 static void emit_max( struct brw_compile *p,
630 struct brw_reg dst,
631 struct brw_reg arg0,
632 struct brw_reg arg1 )
633 {
634 struct intel_context *intel = &p->brw->intel;
635
636 if (intel->gen >= 6) {
637 brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
638 brw_SEL(p, dst, arg0, arg1);
639 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
640 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
641 } else {
642 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
643 brw_SEL(p, dst, arg0, arg1);
644 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
645 }
646 }
647
648 static void emit_min( struct brw_compile *p,
649 struct brw_reg dst,
650 struct brw_reg arg0,
651 struct brw_reg arg1 )
652 {
653 struct intel_context *intel = &p->brw->intel;
654
655 if (intel->gen >= 6) {
656 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
657 brw_SEL(p, dst, arg0, arg1);
658 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
659 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
660 } else {
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
662 brw_SEL(p, dst, arg0, arg1);
663 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
664 }
665 }
666
667 static void emit_arl(struct brw_compile *p,
668 struct brw_reg dst,
669 struct brw_reg src)
670 {
671 struct intel_context *intel = &p->brw->intel;
672
673 if (intel->gen >= 6) {
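      /* Round into a float-typed view of dst first and let the MOV do the
       * float-to-int conversion; presumably gen6 can't combine RNDD with
       * the conversion directly.
       */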
674 struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
675
676 brw_RNDD(p, dst_f, src);
677 brw_MOV(p, dst, dst_f);
678 } else {
679 brw_RNDD(p, dst, src);
680 }
681 }
682
683 static void emit_math1_gen4(struct brw_vs_compile *c,
684 GLuint function,
685 struct brw_reg dst,
686 struct brw_reg arg0,
687 GLuint precision)
688 {
689 /* There are various odd behaviours with SEND on the simulator. In
690 * addition there are documented issues with the fact that the GEN4
691 * processor doesn't do dependency control properly on SEND
692 * results. So, on balance, this kludge to get around failures
693 * with writemasked math results looks like it might be necessary
694 * whether that turns out to be a simulator bug or not:
695 */
696 struct brw_compile *p = &c->func;
697 struct brw_reg tmp = dst;
698 GLboolean need_tmp = GL_FALSE;
699
700 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
701 dst.dw1.bits.writemask != 0xf)
702 need_tmp = GL_TRUE;
703
704 if (need_tmp)
705 tmp = get_tmp(c);
706
707 brw_math(p,
708 tmp,
709 function,
710 BRW_MATH_SATURATE_NONE,
711 2,
712 arg0,
713 BRW_MATH_DATA_SCALAR,
714 precision);
715
716 if (need_tmp) {
717 brw_MOV(p, dst, tmp);
718 release_tmp(c, tmp);
719 }
720 }
721
722 static void
723 emit_math1_gen6(struct brw_vs_compile *c,
724 GLuint function,
725 struct brw_reg dst,
726 struct brw_reg arg0,
727 GLuint precision)
728 {
729 struct brw_compile *p = &c->func;
730 struct brw_reg tmp_src, tmp_dst;
731
732 /* Something is strange on gen6 math in 16-wide mode, though the
733 * docs say it's supposed to work. Punt to using align1 mode,
734 * which doesn't do writemasking and swizzles.
735 */
736 tmp_src = get_tmp(c);
737 tmp_dst = get_tmp(c);
738
739 brw_MOV(p, tmp_src, arg0);
740
741 brw_set_access_mode(p, BRW_ALIGN_1);
742 brw_math(p,
743 tmp_dst,
744 function,
745 BRW_MATH_SATURATE_NONE,
746 2,
747 tmp_src,
748 BRW_MATH_DATA_SCALAR,
749 precision);
750 brw_set_access_mode(p, BRW_ALIGN_16);
751
752 brw_MOV(p, dst, tmp_dst);
753
754 release_tmp(c, tmp_src);
755 release_tmp(c, tmp_dst);
756 }
757
758 static void
759 emit_math1(struct brw_vs_compile *c,
760 GLuint function,
761 struct brw_reg dst,
762 struct brw_reg arg0,
763 GLuint precision)
764 {
765 struct brw_compile *p = &c->func;
766 struct intel_context *intel = &p->brw->intel;
767
768 if (intel->gen >= 6)
769 emit_math1_gen6(c, function, dst, arg0, precision);
770 else
771 emit_math1_gen4(c, function, dst, arg0, precision);
772 }
773
774 static void emit_math2_gen4( struct brw_vs_compile *c,
775 GLuint function,
776 struct brw_reg dst,
777 struct brw_reg arg0,
778 struct brw_reg arg1,
779 GLuint precision)
780 {
781 struct brw_compile *p = &c->func;
782 struct brw_reg tmp = dst;
783 GLboolean need_tmp = GL_FALSE;
784
785 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
786 dst.dw1.bits.writemask != 0xf)
787 need_tmp = GL_TRUE;
788
789 if (need_tmp)
790 tmp = get_tmp(c);
791
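   /* The two-operand math message sends arg1 in the next message reg (m3);
    * brw_math() below sends arg0 starting at m2 (the literal 2 is the
    * message reg number).
    */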
792 brw_MOV(p, brw_message_reg(3), arg1);
793
794 brw_math(p,
795 tmp,
796 function,
797 BRW_MATH_SATURATE_NONE,
798 2,
799 arg0,
800 BRW_MATH_DATA_SCALAR,
801 precision);
802
803 if (need_tmp) {
804 brw_MOV(p, dst, tmp);
805 release_tmp(c, tmp);
806 }
807 }
808
809 static void emit_math2_gen6( struct brw_vs_compile *c,
810 GLuint function,
811 struct brw_reg dst,
812 struct brw_reg arg0,
813 struct brw_reg arg1,
814 GLuint precision)
815 {
816 struct brw_compile *p = &c->func;
817 struct brw_reg tmp_src0, tmp_src1, tmp_dst;
818
819 tmp_src0 = get_tmp(c);
820 tmp_src1 = get_tmp(c);
821 tmp_dst = get_tmp(c);
822
823 brw_MOV(p, tmp_src0, arg0);
824 brw_MOV(p, tmp_src1, arg1);
825
826 brw_set_access_mode(p, BRW_ALIGN_1);
827 brw_math2(p,
828 tmp_dst,
829 function,
830 tmp_src0,
831 tmp_src1);
832 brw_set_access_mode(p, BRW_ALIGN_16);
833
834 brw_MOV(p, dst, tmp_dst);
835
836 release_tmp(c, tmp_src0);
837 release_tmp(c, tmp_src1);
838 release_tmp(c, tmp_dst);
839 }
840
841 static void emit_math2( struct brw_vs_compile *c,
842 GLuint function,
843 struct brw_reg dst,
844 struct brw_reg arg0,
845 struct brw_reg arg1,
846 GLuint precision)
847 {
848 struct brw_compile *p = &c->func;
849 struct intel_context *intel = &p->brw->intel;
850
851 if (intel->gen >= 6)
852 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
853 else
854 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
855 }
856
857 static void emit_exp_noalias( struct brw_vs_compile *c,
858 struct brw_reg dst,
859 struct brw_reg arg0 )
860 {
861 struct brw_compile *p = &c->func;
862
863
864 if (dst.dw1.bits.writemask & WRITEMASK_X) {
865 struct brw_reg tmp = get_tmp(c);
866 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
867
868 /* tmp_d = floor(arg0.x) */
869 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
870
871 /* result[0] = 2.0 ^ tmp */
872
873 /* Adjust exponent for floating point:
874 * exp += 127
875 */
876 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
877
878 /* Install exponent and sign.
879 * Excess drops off the edge:
880 */
881 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
882 tmp_d, brw_imm_d(23));
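      /* e.g. arg0.x = 3.2: tmp_d = 3, and (3 + 127) << 23 is exactly the
       * IEEE-754 single-precision bit pattern of 2^3 = 8.0.
       */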
883
884 release_tmp(c, tmp);
885 }
886
887 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
888 /* result[1] = arg0.x - floor(arg0.x) */
889 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
890 }
891
892 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
893 /* As with the LOG instruction, we might be better off just
894 * doing a taylor expansion here, seeing as we have to do all
895 * the prep work.
896 *
897 * If mathbox partial precision is too low, consider also:
898 * result[3] = result[0] * EXP(result[1])
899 */
900 emit_math1(c,
901 BRW_MATH_FUNCTION_EXP,
902 brw_writemask(dst, WRITEMASK_Z),
903 brw_swizzle1(arg0, 0),
904 BRW_MATH_PRECISION_FULL);
905 }
906
907 if (dst.dw1.bits.writemask & WRITEMASK_W) {
908 /* result[3] = 1.0; */
909 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
910 }
911 }
912
913
914 static void emit_log_noalias( struct brw_vs_compile *c,
915 struct brw_reg dst,
916 struct brw_reg arg0 )
917 {
918 struct brw_compile *p = &c->func;
919 struct brw_reg tmp = dst;
920 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
921 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
922 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
923 dst.file != BRW_GENERAL_REGISTER_FILE);
924
925 if (need_tmp) {
926 tmp = get_tmp(c);
927 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
928 }
929
930    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
931     * according to spec:
932     *
933     * These almost look like they could be joined up, but not really
934     * practical:
935     *
936     * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
937     * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
938 */
939 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
940 brw_AND(p,
941 brw_writemask(tmp_ud, WRITEMASK_X),
942 brw_swizzle1(arg0_ud, 0),
943 brw_imm_ud((1U<<31)-1));
944
945 brw_SHR(p,
946 brw_writemask(tmp_ud, WRITEMASK_X),
947 tmp_ud,
948 brw_imm_ud(23));
949
950 brw_ADD(p,
951 brw_writemask(tmp, WRITEMASK_X),
952 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
953 brw_imm_d(-127));
954 }
955
956 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
957 brw_AND(p,
958 brw_writemask(tmp_ud, WRITEMASK_Y),
959 brw_swizzle1(arg0_ud, 0),
960 brw_imm_ud((1<<23)-1));
961
962 brw_OR(p,
963 brw_writemask(tmp_ud, WRITEMASK_Y),
964 tmp_ud,
965 brw_imm_ud(127<<23));
966 }
967
968 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
969 /* result[2] = result[0] + LOG2(result[1]); */
970
971 /* Why bother? The above is just a hint how to do this with a
972 * taylor series. Maybe we *should* use a taylor series as by
973 * the time all the above has been done it's almost certainly
974 * quicker than calling the mathbox, even with low precision.
975 *
976 * Options are:
977 * - result[0] + mathbox.LOG2(result[1])
978 * - mathbox.LOG2(arg0.x)
979 * - result[0] + inline_taylor_approx(result[1])
980 */
981 emit_math1(c,
982 BRW_MATH_FUNCTION_LOG,
983 brw_writemask(tmp, WRITEMASK_Z),
984 brw_swizzle1(tmp, 1),
985 BRW_MATH_PRECISION_FULL);
986
987 brw_ADD(p,
988 brw_writemask(tmp, WRITEMASK_Z),
989 brw_swizzle1(tmp, 2),
990 brw_swizzle1(tmp, 0));
991 }
992
993 if (dst.dw1.bits.writemask & WRITEMASK_W) {
994 /* result[3] = 1.0; */
995 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
996 }
997
998 if (need_tmp) {
999 brw_MOV(p, dst, tmp);
1000 release_tmp(c, tmp);
1001 }
1002 }
1003
1004
1005 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1006 */
1007 static void emit_dst_noalias( struct brw_vs_compile *c,
1008 struct brw_reg dst,
1009 struct brw_reg arg0,
1010 struct brw_reg arg1)
1011 {
1012 struct brw_compile *p = &c->func;
1013
1014 /* There must be a better way to do this:
1015 */
1016 if (dst.dw1.bits.writemask & WRITEMASK_X)
1017 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
1018 if (dst.dw1.bits.writemask & WRITEMASK_Y)
1019 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
1020 if (dst.dw1.bits.writemask & WRITEMASK_Z)
1021 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
1022 if (dst.dw1.bits.writemask & WRITEMASK_W)
1023 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
1024 }
1025
1026
1027 static void emit_xpd( struct brw_compile *p,
1028 struct brw_reg dst,
1029 struct brw_reg t,
1030 struct brw_reg u)
1031 {
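   /* Cross product via the accumulator: acc = t.yzx * u.zxy, then
    * dst = acc - t.zxy * u.yzx via the MAC with a negated source.
    */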
1032 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
1033 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
1034 }
1035
1036
1037 static void emit_lit_noalias( struct brw_vs_compile *c,
1038 struct brw_reg dst,
1039 struct brw_reg arg0 )
1040 {
1041 struct brw_compile *p = &c->func;
1042 struct brw_instruction *if_insn;
1043 struct brw_reg tmp = dst;
1044 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1045
1046 if (need_tmp)
1047 tmp = get_tmp(c);
1048
1049 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
1050 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
1051
1052 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1053 * to get all channels active inside the IF. In the clipping code
1054 * we run with NoMask, so it's not an option and we can use
1055  * BRW_EXECUTE_1 for all comparisons.
1056 */
1057 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
1058 if_insn = brw_IF(p, BRW_EXECUTE_8);
1059 {
1060 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
1061
1062 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
1063 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
1064 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1065
1066 emit_math2(c,
1067 BRW_MATH_FUNCTION_POW,
1068 brw_writemask(dst, WRITEMASK_Z),
1069 brw_swizzle1(tmp, 2),
1070 brw_swizzle1(arg0, 3),
1071 BRW_MATH_PRECISION_PARTIAL);
1072 }
1073
1074 brw_ENDIF(p, if_insn);
1075
1076 release_tmp(c, tmp);
1077 }
1078
1079 static void emit_lrp_noalias(struct brw_vs_compile *c,
1080 struct brw_reg dst,
1081 struct brw_reg arg0,
1082 struct brw_reg arg1,
1083 struct brw_reg arg2)
1084 {
1085 struct brw_compile *p = &c->func;
1086
1087 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
1088 brw_MUL(p, brw_null_reg(), dst, arg2);
1089 brw_MAC(p, dst, arg0, arg1);
1090 }
1091
1092 /** 3 or 4-component vector normalization */
1093 static void emit_nrm( struct brw_vs_compile *c,
1094 struct brw_reg dst,
1095 struct brw_reg arg0,
1096 int num_comps)
1097 {
1098 struct brw_compile *p = &c->func;
1099 struct brw_reg tmp = get_tmp(c);
1100
1101 /* tmp = dot(arg0, arg0) */
1102 if (num_comps == 3)
1103 brw_DP3(p, tmp, arg0, arg0);
1104 else
1105 brw_DP4(p, tmp, arg0, arg0);
1106
1107 /* tmp = 1 / sqrt(tmp) */
1108 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
1109
1110 /* dst = arg0 * tmp */
1111 brw_MUL(p, dst, arg0, tmp);
1112
1113 release_tmp(c, tmp);
1114 }
1115
1116
1117 static struct brw_reg
1118 get_constant(struct brw_vs_compile *c,
1119 const struct prog_instruction *inst,
1120 GLuint argIndex)
1121 {
1122 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1123 struct brw_compile *p = &c->func;
1124 struct brw_reg const_reg = c->current_const[argIndex].reg;
1125
1126 assert(argIndex < 3);
1127
1128 if (c->current_const[argIndex].index != src->Index) {
1129 /* Keep track of the last constant loaded in this slot, for reuse. */
1130 c->current_const[argIndex].index = src->Index;
1131
1132 #if 0
1133 printf(" fetch const[%d] for arg %d into reg %d\n",
1134 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1135 #endif
1136 /* need to fetch the constant now */
1137 brw_dp_READ_4_vs(p,
1138 const_reg, /* writeback dest */
1139 16 * src->Index, /* byte offset */
1140 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
1141 );
1142 }
1143
1144 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1145 const_reg = stride(const_reg, 0, 4, 1);
1146 const_reg.subnr = 0;
1147
1148 return const_reg;
1149 }
1150
1151 static struct brw_reg
1152 get_reladdr_constant(struct brw_vs_compile *c,
1153 const struct prog_instruction *inst,
1154 GLuint argIndex)
1155 {
1156 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1157 struct brw_compile *p = &c->func;
1158 struct brw_context *brw = p->brw;
1159 struct intel_context *intel = &brw->intel;
1160 struct brw_reg const_reg = c->current_const[argIndex].reg;
1161 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1162 uint32_t offset;
1163
1164 assert(argIndex < 3);
1165
1166 /* Can't reuse a reladdr constant load. */
1167 c->current_const[argIndex].index = -1;
1168
1169 #if 0
1170 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1171 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1172 #endif
1173
1174 if (intel->gen >= 6) {
1175 offset = src->Index;
1176 } else {
1177 struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
1178 brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
1179 addr_reg = byte_addr_reg;
1180 offset = 16 * src->Index;
1181 }
1182
1183 /* fetch the first vec4 */
1184 brw_dp_READ_4_vs_relative(p,
1185 const_reg,
1186 addr_reg,
1187 offset,
1188 SURF_INDEX_VERT_CONST_BUFFER);
1189
1190 return const_reg;
1191 }
1192
1193
1194
1195 /* TODO: relative addressing!
1196 */
1197 static struct brw_reg get_reg( struct brw_vs_compile *c,
1198 gl_register_file file,
1199 GLuint index )
1200 {
1201 switch (file) {
1202 case PROGRAM_TEMPORARY:
1203 case PROGRAM_INPUT:
1204 case PROGRAM_OUTPUT:
1205 assert(c->regs[file][index].nr != 0);
1206 return c->regs[file][index];
1207 case PROGRAM_STATE_VAR:
1208 case PROGRAM_CONSTANT:
1209 case PROGRAM_UNIFORM:
1210 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1211 return c->regs[PROGRAM_STATE_VAR][index];
1212 case PROGRAM_ADDRESS:
1213 assert(index == 0);
1214 return c->regs[file][index];
1215
1216 case PROGRAM_UNDEFINED: /* undef values */
1217 return brw_null_reg();
1218
1219 case PROGRAM_LOCAL_PARAM:
1220 case PROGRAM_ENV_PARAM:
1221 case PROGRAM_WRITE_ONLY:
1222 default:
1223 assert(0);
1224 return brw_null_reg();
1225 }
1226 }
1227
1228
1229 /**
1230 * Indirect addressing: get reg[[arg] + offset].
1231 */
1232 static struct brw_reg deref( struct brw_vs_compile *c,
1233 struct brw_reg arg,
1234 GLint offset,
1235 GLuint reg_size )
1236 {
1237 struct brw_compile *p = &c->func;
1238 struct brw_reg tmp = get_tmp(c);
1239 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1240 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1241 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1242 struct brw_reg indirect = brw_vec4_indirect(0,0);
1243 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1244
1245 /* Set the vertical stride on the register access so that the first
1246 * 4 components come from a0.0 and the second 4 from a0.1.
1247 */
1248 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1249
1250 {
1251 brw_push_insn_state(p);
1252 brw_set_access_mode(p, BRW_ALIGN_1);
1253
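      /* Compute per-vertex byte addresses: a0.0 gets vertex 0's address,
       * a0.1 (fed from the second address value at suboffset 4) gets
       * vertex 1's.
       */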
1254 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1255 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1256
1257 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1258 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1259
1260 brw_MOV(p, tmp, indirect);
1261
1262 brw_pop_insn_state(p);
1263 }
1264
1265 /* NOTE: tmp not released */
1266 return tmp;
1267 }
1268
1269 static void
1270 move_to_reladdr_dst(struct brw_vs_compile *c,
1271 const struct prog_instruction *inst,
1272 struct brw_reg val)
1273 {
1274 struct brw_compile *p = &c->func;
1275 int reg_size = 32;
1276 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1277 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1278 struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
1279 GLuint byte_offset = base.nr * 32 + base.subnr;
1280 struct brw_reg indirect = brw_vec4_indirect(0,0);
1281 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1282
1283 /* Because destination register indirect addressing can only use
1284 * one index, we'll write each vertex's vec4 value separately.
1285 */
1286 val.width = BRW_WIDTH_4;
1287 val.vstride = BRW_VERTICAL_STRIDE_4;
1288
1289 brw_push_insn_state(p);
1290 brw_set_access_mode(p, BRW_ALIGN_1);
1291
1292 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1293 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1294 brw_MOV(p, indirect, val);
1295
1296 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1297 brw_ADD(p, brw_address_reg(0), acc,
1298 brw_imm_uw(byte_offset + reg_size / 2));
1299 brw_MOV(p, indirect, suboffset(val, 4));
1300
1301 brw_pop_insn_state(p);
1302 }
1303
1304 /**
1305 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1306 * TODO: relative addressing!
1307 */
1308 static struct brw_reg
1309 get_src_reg( struct brw_vs_compile *c,
1310 const struct prog_instruction *inst,
1311 GLuint argIndex )
1312 {
1313 const GLuint file = inst->SrcReg[argIndex].File;
1314 const GLint index = inst->SrcReg[argIndex].Index;
1315 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1316
1317 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1318 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1319
1320 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1321 SWIZZLE_ZERO,
1322 SWIZZLE_ZERO,
1323 SWIZZLE_ZERO)) {
1324 return brw_imm_f(0.0f);
1325 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1326 SWIZZLE_ONE,
1327 SWIZZLE_ONE,
1328 SWIZZLE_ONE)) {
1329 if (src->Negate)
1330 return brw_imm_f(-1.0F);
1331 else
1332 return brw_imm_f(1.0F);
1333 } else if (src->File == PROGRAM_CONSTANT) {
1334 const struct gl_program_parameter_list *params;
1335 float f;
1336 int component = -1;
1337
1338 switch (src->Swizzle) {
1339 case SWIZZLE_XXXX:
1340 component = 0;
1341 break;
1342 case SWIZZLE_YYYY:
1343 component = 1;
1344 break;
1345 case SWIZZLE_ZZZZ:
1346 component = 2;
1347 break;
1348 case SWIZZLE_WWWW:
1349 component = 3;
1350 break;
1351 }
1352
1353 if (component >= 0) {
1354 params = c->vp->program.Base.Parameters;
1355 f = params->ParameterValues[src->Index][component];
1356
1357 if (src->Abs)
1358 f = fabs(f);
1359 if (src->Negate)
1360 f = -f;
1361 return brw_imm_f(f);
1362 }
1363 }
1364 }
1365
1366 switch (file) {
1367 case PROGRAM_TEMPORARY:
1368 case PROGRAM_INPUT:
1369 case PROGRAM_OUTPUT:
1370 if (relAddr) {
1371 return deref(c, c->regs[file][0], index, 32);
1372 }
1373 else {
1374 assert(c->regs[file][index].nr != 0);
1375 return c->regs[file][index];
1376 }
1377
1378 case PROGRAM_STATE_VAR:
1379 case PROGRAM_CONSTANT:
1380 case PROGRAM_UNIFORM:
1381 case PROGRAM_ENV_PARAM:
1382 case PROGRAM_LOCAL_PARAM:
1383 if (!relAddr && c->constant_map[index] != -1) {
1384 /* Take from the push constant buffer if possible. */
1385 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1386 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1387 } else {
1388       /* Must be in the pull constant buffer, then. */
1389 assert(c->vp->use_const_buffer);
1390 if (relAddr)
1391 return get_reladdr_constant(c, inst, argIndex);
1392 else
1393 return get_constant(c, inst, argIndex);
1394 }
1395 case PROGRAM_ADDRESS:
1396 assert(index == 0);
1397 return c->regs[file][index];
1398
1399 case PROGRAM_UNDEFINED:
1400 /* this is a normal case since we loop over all three src args */
1401 return brw_null_reg();
1402
1403 case PROGRAM_WRITE_ONLY:
1404 default:
1405 assert(0);
1406 return brw_null_reg();
1407 }
1408 }
1409
1410 /**
1411 * Return the brw reg for the given instruction's src argument.
1412 * Will return mangled results for SWZ op. The emit_swz() function
1413 * ignores this result and recalculates taking extended swizzles into
1414 * account.
1415 */
1416 static struct brw_reg get_arg( struct brw_vs_compile *c,
1417 const struct prog_instruction *inst,
1418 GLuint argIndex )
1419 {
1420 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1421 struct brw_reg reg;
1422
1423 if (src->File == PROGRAM_UNDEFINED)
1424 return brw_null_reg();
1425
1426 reg = get_src_reg(c, inst, argIndex);
1427
1428 /* Convert 3-bit swizzle to 2-bit.
1429 */
1430 if (reg.file != BRW_IMMEDIATE_VALUE) {
1431 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1432 GET_SWZ(src->Swizzle, 1),
1433 GET_SWZ(src->Swizzle, 2),
1434 GET_SWZ(src->Swizzle, 3));
1435
1436 /* Note this is ok for non-swizzle ARB_vp instructions */
1437 reg.negate = src->Negate ? 1 : 0;
1438 }
1439
1440 return reg;
1441 }
1442
1443
1444 /**
1445 * Get brw register for the given program dest register.
1446 */
1447 static struct brw_reg get_dst( struct brw_vs_compile *c,
1448 struct prog_dst_register dst )
1449 {
1450 struct brw_reg reg;
1451
1452 switch (dst.File) {
1453 case PROGRAM_TEMPORARY:
1454 case PROGRAM_OUTPUT:
1455 /* register-indirect addressing is only 1x1, not VxH, for
1456 * destination regs. So, for RelAddr we'll return a temporary
1457 * for the dest and do a move of the result to the RelAddr
1458 * register after the instruction emit.
1459 */
1460 if (dst.RelAddr) {
1461 reg = get_tmp(c);
1462 } else {
1463 assert(c->regs[dst.File][dst.Index].nr != 0);
1464 reg = c->regs[dst.File][dst.Index];
1465 }
1466 break;
1467 case PROGRAM_ADDRESS:
1468 assert(dst.Index == 0);
1469 reg = c->regs[dst.File][dst.Index];
1470 break;
1471 case PROGRAM_UNDEFINED:
1472 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1473 reg = brw_null_reg();
1474 break;
1475 default:
1476 assert(0);
1477 reg = brw_null_reg();
1478 }
1479
1480 assert(reg.type != BRW_IMMEDIATE_VALUE);
1481 reg.dw1.bits.writemask = dst.WriteMask;
1482
1483 return reg;
1484 }
1485
1486
1487 static void emit_swz( struct brw_vs_compile *c,
1488 struct brw_reg dst,
1489 const struct prog_instruction *inst)
1490 {
1491 const GLuint argIndex = 0;
1492 const struct prog_src_register src = inst->SrcReg[argIndex];
1493 struct brw_compile *p = &c->func;
1494 GLuint zeros_mask = 0;
1495 GLuint ones_mask = 0;
1496 GLuint src_mask = 0;
1497 GLubyte src_swz[4];
1498 GLboolean need_tmp = (src.Negate &&
1499 dst.file != BRW_GENERAL_REGISTER_FILE);
1500 struct brw_reg tmp = dst;
1501 GLuint i;
1502
1503 if (need_tmp)
1504 tmp = get_tmp(c);
1505
1506 for (i = 0; i < 4; i++) {
1507 if (dst.dw1.bits.writemask & (1<<i)) {
1508 GLubyte s = GET_SWZ(src.Swizzle, i);
1509 switch (s) {
1510 case SWIZZLE_X:
1511 case SWIZZLE_Y:
1512 case SWIZZLE_Z:
1513 case SWIZZLE_W:
1514 src_mask |= 1<<i;
1515 src_swz[i] = s;
1516 break;
1517 case SWIZZLE_ZERO:
1518 zeros_mask |= 1<<i;
1519 break;
1520 case SWIZZLE_ONE:
1521 ones_mask |= 1<<i;
1522 break;
1523 }
1524 }
1525 }
1526
1527 /* Do src first, in case dst aliases src:
1528 */
1529 if (src_mask) {
1530 struct brw_reg arg0;
1531
1532 arg0 = get_src_reg(c, inst, argIndex);
1533
1534 arg0 = brw_swizzle(arg0,
1535 src_swz[0], src_swz[1],
1536 src_swz[2], src_swz[3]);
1537
1538 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1539 }
1540
1541 if (zeros_mask)
1542 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1543
1544 if (ones_mask)
1545 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1546
1547 if (src.Negate)
1548 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1549
1550 if (need_tmp) {
1551 brw_MOV(p, dst, tmp);
1552 release_tmp(c, tmp);
1553 }
1554 }
1555
1556 static int
1557 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1558 {
1559 struct intel_context *intel = &brw->intel;
1560
1561 if (intel->gen >= 6) {
1562 /* URB data written (does not include the message header reg) must
1563 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1564 * section 5.4.3.2.2: URB_INTERLEAVED.
1565 *
1566 * URB entries are allocated on a multiple of 1024 bits, so an
1567 * extra 128 bits written here to make the end align to 256 is
1568 * no problem.
1569 */
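      /* mlen includes the message header reg, so the URB data written is
       * mlen - 1 regs; bumping an even mlen to odd keeps that data length
       * even.
       */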
1570 if ((mlen % 2) != 1)
1571 mlen++;
1572 }
1573
1574 return mlen;
1575 }
1576
1577 /**
1578 * Post-vertex-program processing. Send the results to the URB.
1579 */
1580 static void emit_vertex_write( struct brw_vs_compile *c)
1581 {
1582 struct brw_compile *p = &c->func;
1583 struct brw_context *brw = p->brw;
1584 struct intel_context *intel = &brw->intel;
1585 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1586 struct brw_reg ndc;
1587 int eot;
1588 GLuint len_vertex_header = 2;
1589 int next_mrf, i;
1590 int msg_len;
1591
1592 if (c->key.copy_edgeflag) {
1593 brw_MOV(p,
1594 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1595 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1596 }
1597
1598 if (intel->gen < 6) {
1599 /* Build ndc coords */
1600 ndc = get_tmp(c);
1601 /* ndc = 1.0 / pos.w */
1602 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1603 /* ndc.xyz = pos * ndc */
1604 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1605 }
1606
1607 /* Update the header for point size, user clipping flags, and -ve rhw
1608 * workaround.
1609 */
1610 if (intel->gen >= 6) {
1611 struct brw_reg m1 = brw_message_reg(1);
1612
1613 /* On gen6, m1 has each value in a separate dword, so we never
1614 * need to mess with a temporary for computing the m1 value.
1615 */
1616 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1617 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1618 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1619 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1620 }
1621
1622 /* Set the user clip distances in dword 8-15. (m3-4)*/
1623 if (c->key.nr_userclip) {
1624 for (i = 0; i < c->key.nr_userclip; i++) {
1625 struct brw_reg m;
1626 if (i < 4)
1627 m = brw_message_reg(3);
1628 else
1629 m = brw_message_reg(4);
1630
1631 	    brw_DP4(p, brw_writemask(m, (1 << (i & 3))), pos, c->userplane[i]);
1632 }
1633 }
1634 } else if ((c->prog_data.outputs_written &
1635 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1636 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1637 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1638 GLuint i;
1639
1640 brw_MOV(p, header1, brw_imm_ud(0));
1641
1642 brw_set_access_mode(p, BRW_ALIGN_16);
1643
1644 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1645 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
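         /* Scale the float point size and mask it into the 11-bit
          * point-width field at bits 8..18 of the header dword (U8.3
          * fixed point, going by the 1<<11 scale and 0x7ff<<8 mask).
          */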
1646 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1647 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1648 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1649 header1, brw_imm_ud(0x7ff<<8));
1650 }
1651
1652 for (i = 0; i < c->key.nr_userclip; i++) {
1653 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1654 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1655 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1656 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1657 }
1658
1659 /* i965 clipping workaround:
1660 * 1) Test for -ve rhw
1661 * 2) If set,
1662 * set ndc = (0,0,0,0)
1663 * set ucp[6] = 1
1664 *
1665 * Later, clipping will detect ucp[6] and ensure the primitive is
1666 * clipped against all fixed planes.
1667 */
1668 if (brw->has_negative_rhw_bug) {
1669 brw_CMP(p,
1670 vec8(brw_null_reg()),
1671 BRW_CONDITIONAL_L,
1672 brw_swizzle1(ndc, 3),
1673 brw_imm_f(0));
1674
1675 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1676 brw_MOV(p, ndc, brw_imm_f(0));
1677 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1678 }
1679
1680 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1681 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1682 brw_set_access_mode(p, BRW_ALIGN_16);
1683
1684 release_tmp(c, header1);
1685 }
1686 else {
1687 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1688 }
1689
1690 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1691 * of zeros followed by two sets of NDC coordinates:
1692 */
1693 brw_set_access_mode(p, BRW_ALIGN_1);
1694 brw_set_acc_write_control(p, 0);
1695
1696 /* The VUE layout is documented in Volume 2a. */
1697 if (intel->gen >= 6) {
1698 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1699 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1700 * dword 4-7 (m2) is the 4D space position
1701 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1702 * enabled.
1703 * m3 or 5 is the first vertex element data we fill, which is
1704 * the vertex position.
1705 */
1706 brw_MOV(p, brw_message_reg(2), pos);
1707 len_vertex_header = 1;
1708 if (c->key.nr_userclip > 0)
1709 len_vertex_header += 2;
1710 } else if (intel->gen == 5) {
1711 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1712 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1713 * dword 4-7 (m2) is the ndc position (set above)
1714 * dword 8-11 (m3) of the vertex header is the 4D space position
1715 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1716 * m6 is a pad so that the vertex element data is aligned
1717 * m7 is the first vertex data we fill, which is the vertex position.
1718 */
1719 brw_MOV(p, brw_message_reg(2), ndc);
1720 brw_MOV(p, brw_message_reg(3), pos);
1721 brw_MOV(p, brw_message_reg(7), pos);
1722 len_vertex_header = 6;
1723 } else {
1724 /* There are 8 dwords in VUE header pre-Ironlake:
1725 * dword 0-3 (m1) is indices, point width, clip flags.
1726 * dword 4-7 (m2) is ndc position (set above)
1727 *
1728 * dword 8-11 (m3) is the first vertex data, which we always have be the
1729 * vertex position.
1730 */
1731 brw_MOV(p, brw_message_reg(2), ndc);
1732 brw_MOV(p, brw_message_reg(3), pos);
1733 len_vertex_header = 2;
1734 }
1735
1736 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1737 next_mrf = 2 + len_vertex_header;
1738 for (i = 0; i < VERT_RESULT_MAX; i++) {
1739 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1740 break;
1741 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1742 continue;
1743 if (i == VERT_RESULT_PSIZ)
1744 continue;
1745
1746 if (i >= VERT_RESULT_TEX0 &&
1747 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1748 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1749 next_mrf++;
1750 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1751 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1752 }
1753 }
1754
1755 eot = (c->first_overflow_output == 0);
1756
1757 /* Message header, plus VUE header, plus the (first set of) outputs. */
1758 msg_len = 1 + len_vertex_header + c->nr_outputs;
1759 msg_len = align_interleaved_urb_mlen(brw, msg_len);
1760 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1761    msg_len = MIN2(msg_len, (BRW_MAX_MRF - 1));
1762
1763 brw_urb_WRITE(p,
1764 brw_null_reg(), /* dest */
1765 0, /* starting mrf reg nr */
1766 c->r0, /* src */
1767 0, /* allocate */
1768 1, /* used */
1769 msg_len,
1770 0, /* response len */
1771 eot, /* eot */
1772 eot, /* writes complete */
1773 0, /* urb destination offset */
1774 BRW_URB_SWIZZLE_INTERLEAVE);
1775
1776 if (c->first_overflow_output > 0) {
1777 /* Not all of the vertex outputs/results fit into the MRF.
1778 * Move the overflowed attributes from the GRF to the MRF and
1779 * issue another brw_urb_WRITE().
1780 */
1781 GLuint i, mrf = 1;
1782 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1783 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1784 /* move from GRF to MRF */
1785 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1786 mrf++;
1787 }
1788 }
1789
1790 brw_urb_WRITE(p,
1791 brw_null_reg(), /* dest */
1792 0, /* starting mrf reg nr */
1793 c->r0, /* src */
1794 0, /* allocate */
1795 1, /* used */
1796 align_interleaved_urb_mlen(brw, mrf),
1797 0, /* response len */
1798 1, /* eot */
1799 1, /* writes complete */
1800 14 / 2, /* urb destination offset */
1801 BRW_URB_SWIZZLE_INTERLEAVE);
1802 }
1803 }
1804
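/* Return whether the accumulator already holds `val`.  Accumulator writes
 * are enabled for the whole program (brw_set_acc_write_control() in
 * brw_vs_emit()), so a preceding MOV/MUL/MAC that wrote all four channels
 * of val also left that value in the accumulator.
 */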
1805 static GLboolean
1806 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1807 {
1808    struct brw_compile *p = &c->func;
1809    struct brw_instruction *prev_insn;
1810 
1811    if (p->nr_insn == 0)
1812       return GL_FALSE;
1813    if (val.address_mode != BRW_ADDRESS_DIRECT)
1814       return GL_FALSE;
1815 
1816    prev_insn = &p->store[p->nr_insn - 1];
1817 switch (prev_insn->header.opcode) {
1818 case BRW_OPCODE_MOV:
1819 case BRW_OPCODE_MAC:
1820 case BRW_OPCODE_MUL:
1821 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1822 prev_insn->header.execution_size == val.width &&
1823 prev_insn->bits1.da1.dest_reg_file == val.file &&
1824 prev_insn->bits1.da1.dest_reg_type == val.type &&
1825 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1826 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1827 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1828 prev_insn->bits1.da16.dest_writemask == 0xf)
1829 return GL_TRUE;
1830 else
1831 return GL_FALSE;
1832 default:
1833 return GL_FALSE;
1834 }
1835 }
1836
1837 static uint32_t
1838 get_predicate(const struct prog_instruction *inst)
1839 {
1840 if (inst->DstReg.CondMask == COND_TR)
1841 return BRW_PREDICATE_NONE;
1842
1843 /* All of GLSL only produces predicates for COND_NE and one channel per
1844 * vector. Fail badly if someone starts doing something else, as it might
1845 * mean infinite looping or something.
1846 *
1847 * We'd like to support all the condition codes, but our hardware doesn't
1848 * quite match the Mesa IR, which is modeled after the NV extensions. For
1849 * those, the instruction may update the condition codes or not, then any
1850 * later instruction may use one of those condition codes. For gen4, the
1851 * instruction may update the flags register based on one of the condition
1852 * codes output by the instruction, and then further instructions may
1853 * predicate on that. We can probably support this, but it won't
1854 * necessarily be easy.
1855 */
1856 assert(inst->DstReg.CondMask == COND_NE);
1857
1858 switch (inst->DstReg.CondSwizzle) {
1859 case SWIZZLE_XXXX:
1860 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1861 case SWIZZLE_YYYY:
1862 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1863 case SWIZZLE_ZZZZ:
1864 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1865 case SWIZZLE_WWWW:
1866 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1867 default:
1868       _mesa_problem(NULL, "Unexpected predicate swizzle: 0x%08x\n",
1869                     inst->DstReg.CondSwizzle);
1870 return BRW_PREDICATE_NORMAL;
1871 }
1872 }

/* Emit the vertex program instructions here.
 */
void brw_vs_emit(struct brw_vs_compile *c)
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

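   /* Have every ALU instruction implicitly update the accumulator, so that
    * accumulator_contains() above can tell when the MOV feeding a MAC is
    * redundant.
    */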
   brw_set_acc_write_control(p, 1);

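   /* First pass: note which outputs are also read as sources (they will
    * need a GRF shadow copy) and whether a call/return stack is required.
    */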
   for (insn = 0; insn < nr_insns; insn++) {
      GLuint i;
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

      /* Message registers can't be read, so copy each output into a GRF
       * register if it is later used as a source register.
       */
      for (i = 0; i < 3; i++) {
         struct prog_src_register *src = &inst->SrcReg[i];
         GLuint index = src->Index;
         GLuint file = src->File;
         if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
            c->output_regs[index].used_in_src = GL_TRUE;
      }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = GL_TRUE;
         break;
      default:
         break;
      }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

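   /* If the program makes subroutine calls, point the address register at
    * the base of the return-address stack.
    */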
   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
         for (i = 0; i < 3; i++) {
            const struct prog_src_register *src = &inst->SrcReg[i];
            index = src->Index;
            file = src->File;
            if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
               args[i] = c->output_regs[index].reg;
            else
               args[i] = get_arg(c, inst, i);
         }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken when emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
         dst = c->output_regs[index].reg;
      else
         dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
         _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
         args[0].negate = false;
         brw_MOV(p, dst, brw_abs(args[0]));
         break;
      case OPCODE_ADD:
         brw_ADD(p, dst, args[0], args[1]);
         break;
      case OPCODE_COS:
         emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_DP2:
         brw_DP2(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP3:
         brw_DP3(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP4:
         brw_DP4(p, dst, args[0], args[1]);
         break;
      case OPCODE_DPH:
         brw_DPH(p, dst, args[0], args[1]);
         break;
      case OPCODE_NRM3:
         emit_nrm(c, dst, args[0], 3);
         break;
      case OPCODE_NRM4:
         emit_nrm(c, dst, args[0], 4);
         break;
      case OPCODE_DST:
         unalias2(c, dst, args[0], args[1], emit_dst_noalias);
         break;
      case OPCODE_EXP:
         unalias1(c, dst, args[0], emit_exp_noalias);
         break;
      case OPCODE_EX2:
         emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_ARL:
         emit_arl(p, dst, args[0]);
         break;
      case OPCODE_FLR:
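         /* round toward -inf */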
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FRC:
         brw_FRC(p, dst, args[0]);
         break;
      case OPCODE_LOG:
         unalias1(c, dst, args[0], emit_log_noalias);
         break;
      case OPCODE_LG2:
         emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_LIT:
         unalias1(c, dst, args[0], emit_lit_noalias);
         break;
      case OPCODE_LRP:
         unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
         break;
      case OPCODE_MAD:
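         /* MAD dst, a, b, c is lowered to:
          *    MOV acc, c
          *    MAC dst, a, b        (dst = a * b + acc)
          * The MOV is skipped when the accumulator already holds c, which
          * accumulator_contains() detects thanks to the implicit
          * accumulator writes enabled above.
          */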
         if (!accumulator_contains(c, args[2]))
            brw_MOV(p, brw_acc_reg(), args[2]);
         brw_MAC(p, dst, args[0], args[1]);
         break;
      case OPCODE_CMP:
         emit_cmp(p, dst, args[0], args[1], args[2]);
         break;
      case OPCODE_MAX:
         emit_max(p, dst, args[0], args[1]);
         break;
      case OPCODE_MIN:
         emit_min(p, dst, args[0], args[1]);
         break;
      case OPCODE_MOV:
         brw_MOV(p, dst, args[0]);
         break;
      case OPCODE_MUL:
         brw_MUL(p, dst, args[0], args[1]);
         break;
      case OPCODE_POW:
         emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RCP:
         emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RSQ:
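         /* RSQ is defined on the absolute value of its operand, hence the
          * brw_abs() on the source.
          */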
         emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
         break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
         emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
         break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
         break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
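         /* There is no native SUB; emit ADD with the second source negated. */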
         brw_ADD(p, dst, args[0], negate(args[1]));
         break;
      case OPCODE_SWZ:
         /* The args[0] value can't be used here as it won't have
          * correctly encoded the full swizzle.
          */
         emit_swz(c, dst, inst);
         break;
      case OPCODE_TRUNC:
         /* round toward zero */
         brw_RNDZ(p, dst, args[0]);
         break;
      case OPCODE_XPD:
         emit_xpd(p, dst, args[0], args[1]);
         break;
      case OPCODE_IF:
         assert(if_depth < MAX_IF_DEPTH);
         if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
         /* Note that brw_IF smashes the predicate_control field. */
         if_inst[if_depth]->header.predicate_control = get_predicate(inst);
         if_depth_in_loop[loop_depth]++;
         if_depth++;
         break;
      case OPCODE_ELSE:
         clear_current_const(c);
         assert(if_depth > 0);
         if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
         break;
      case OPCODE_ENDIF:
         clear_current_const(c);
         assert(if_depth > 0);
         brw_ENDIF(p, if_inst[--if_depth]);
         if_depth_in_loop[loop_depth]--;
         break;
      case OPCODE_BGNLOOP:
         clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_BREAK(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_set_predicate_control(p, get_predicate(inst));
         if (intel->gen >= 6) {
            gen6_CONT(p, loop_inst[loop_depth - 1]);
         } else {
            brw_CONT(p, if_depth_in_loop[loop_depth]);
         }
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case OPCODE_ENDLOOP: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         clear_current_const(c);
         loop_depth--;

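         /* Double the patched jump counts on Ironlake, which presumably
          * counts jump distances in units of half an instruction (64 bits)
          * rather than the whole 128-bit instructions gen4 uses.
          */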
         if (intel->gen == 5)
            br = 2;

         inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);

         if (intel->gen < 6) {
            /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
            while (inst0 > loop_inst[loop_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                          inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      }

      case OPCODE_BRA:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CAL:
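         /* Push the return address (three 16-byte instructions ahead, i.e.
          * just past the jump below) onto the stack, bump the stack pointer
          * by one dword, and jump; the jump offset is filled in later by
          * brw_resolve_cals().
          */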
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));
         brw_save_call(p, inst->Comment, p->nr_insn);
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
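         /* Pop the return address off the stack and jump back to it. */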
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         break;
      case OPCODE_END:
         emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
         _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
                                     _mesa_opcode_string(inst->Opcode) :
                                     "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it were set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
         struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

         assert(hw_insn->header.destreg__conditionalmod == 0);
         hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When the destination register is an output register holding a
       * primary/secondary front/back color, we have to clamp the result
       * to [0,1].  This is done by setting the saturation bit on the
       * last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions.  Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          c->key.clamp_vertex_color) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr) {
         assert(inst->DstReg.File == PROGRAM_TEMPORARY ||
                inst->DstReg.File == PROGRAM_OUTPUT);
         move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

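   /* Patch the CAL jump offsets recorded above, fill in the UIP/JIP jump
    * fields used by gen6 flow control, and run the peephole optimizer over
    * the finished program.
    */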
   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
         brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}