src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "program/program.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40 /* Return the SrcReg index of the channels that can be immediate float operands
  41  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  42  */
  43 static bool
  44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  45 {
  46    int opcode_array[] = {
  47       [OPCODE_MOV] = 1,
  48       [OPCODE_ADD] = 2,
  49       [OPCODE_CMP] = 3,
  50       [OPCODE_DP2] = 2,
  51       [OPCODE_DP3] = 2,
  52       [OPCODE_DP4] = 2,
  53       [OPCODE_DPH] = 2,
  54       [OPCODE_MAX] = 2,
  55       [OPCODE_MIN] = 2,
  56       [OPCODE_MUL] = 2,
  57       [OPCODE_SEQ] = 2,
  58       [OPCODE_SGE] = 2,
  59       [OPCODE_SGT] = 2,
  60       [OPCODE_SLE] = 2,
  61       [OPCODE_SLT] = 2,
  62       [OPCODE_SNE] = 2,
  63       [OPCODE_XPD] = 2,
  64    };
  65
  66    /* These opcodes get broken down in a way that allow two
  67     * args to be immediates.
  68     */
  69    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  70       if (arg == 1 || arg == 2)
  71          return true;
  72    }
  73
  74    if (opcode > ARRAY_SIZE(opcode_array))
  75       return false;
  76
  77    return arg == opcode_array[opcode] - 1;
  78 }
  79
  80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  81 {
  82    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  83
  84    if (++c->last_tmp > c->prog_data.total_grf)
  85       c->prog_data.total_grf = c->last_tmp;
  86
  87    return tmp;
  88 }
  89
  90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  91 {
  92    if (tmp.nr == c->last_tmp-1)
  93       c->last_tmp--;
  94 }
  95
  96 static void release_tmps( struct brw_vs_compile *c )
  97 {
  98    c->last_tmp = c->first_tmp;
  99 }
 100
 101 static int
 102 get_first_reladdr_output(struct gl_vertex_program *vp)
 103 {
 104    int i;
 105    int first_reladdr_output = VERT_RESULT_MAX;
 106
 107    for (i = 0; i < vp->Base.NumInstructions; i++) {
 108       struct prog_instruction *inst = vp->Base.Instructions + i;
 109
 110       if (inst->DstReg.File == PROGRAM_OUTPUT &&
 111           inst->DstReg.RelAddr &&
 112           inst->DstReg.Index < first_reladdr_output)
 113          first_reladdr_output = inst->DstReg.Index;
 114    }
 115
 116    return first_reladdr_output;
 117 }
 118
 119 /* Clears the record of which vp_const_buffer elements have been
 120  * loaded into our constant buffer registers, for the starts of new
 121  * blocks after control flow.
 122  */
 123 static void
 124 clear_current_const(struct brw_vs_compile *c)
 125 {
 126    unsigned int i;
 127
 128    if (c->vp->use_const_buffer) {
 129       for (i = 0; i < 3; i++) {
 130          c->current_const[i].index = -1;
 131       }
 132    }
 133 }
 134
 135 /* The message length for all SEND messages is restricted to [1,15].  This
 136  * includes 1 for the header, so anything in slots 14 and above needs to be
 137  * placed in a general-purpose register and emitted using a second URB write.
 138  */
 139 #define MAX_SLOTS_IN_FIRST_URB_WRITE 14
 140
 141 /**
 142  * Determine whether the given vertex output can be written directly to a MRF
 143  * or whether it has to be stored in a general-purpose register.
 144  */
 145 static inline bool can_use_direct_mrf(int vert_result,
 146                                       int first_reladdr_output, int slot)
 147 {
 148    if (vert_result == VERT_RESULT_HPOS || vert_result == VERT_RESULT_PSIZ) {
 149       /* These never go straight into MRF's.  They are placed in the MRF by
 150        * epilog code.
 151        */
 152       return false;
 153    }
 154    if (first_reladdr_output <= vert_result && vert_result < VERT_RESULT_MAX) {
 155       /* Relative addressing might be used to access this vert_result, so it
 156        * needs to go into a general-purpose register.
 157        */
 158       return false;
 159    }
 160    if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE) {
 161       /* This output won't go out until the second URB write so it must be
 162        * stored in a general-purpose register until then.
 163        */
 164       return false;
 165    }
 166    return true;
 167 }
 168
 169 /**
 170  * Preallocate GRF register before code emit.
 171  * Do things as simply as possible.  Allocate and populate all regs
 172  * ahead of time.
 173  */
 174 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 175 {
 176    struct intel_context *intel = &c->func.brw->intel;
 177    GLuint i, reg = 0, slot;
 178    int attributes_in_vue;
 179    int first_reladdr_output;
 180    int max_constant;
 181    int constant = 0;
 182    struct brw_vertex_program *vp = c->vp;
 183    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
 184
 185    /* Determine whether to use a real constant buffer or use a block
 186     * of GRF registers for constants.  The later is faster but only
 187     * works if everything fits in the GRF.
 188     * XXX this heuristic/check may need some fine tuning...
 189     */
 190    if (c->vp->program.Base.Parameters->NumParameters +
 191        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
 192       c->vp->use_const_buffer = true;
 193    else
 194       c->vp->use_const_buffer = false;
 195
 196    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 197
 198    /* r0 -- reserved as usual
 199     */
 200    c->r0 = brw_vec8_grf(reg, 0);
 201    reg++;
 202
 203    /* User clip planes from curbe:
 204     */
 205    if (c->key.userclip_active) {
 206       if (intel->gen >= 6) {
 207          for (i = 0; i <= c->key.nr_userclip_plane_consts; i++) {
 208             c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
 209                                                   (i % 2) * 4), 0, 4, 1);
 210          }
 211          reg += ALIGN(c->key.nr_userclip_plane_consts, 2) / 2;
 212       } else {
 213          for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
 214             c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
 215                                                   (i % 2) * 4), 0, 4, 1);
 216          }
 217          reg += (ALIGN(6 + c->key.nr_userclip_plane_consts, 4) / 4) * 2;
 218       }
 219
 220    }
 221
 222    /* Assign some (probably all) of the vertex program constants to
 223     * the push constant buffer/CURBE.
 224     *
 225     * There's an obvious limit to the numer of push constants equal to
 226     * the number of register available, and that number is smaller
 227     * than the minimum maximum number of vertex program parameters, so
 228     * support for pull constants is required if we overflow.
 229     * Additionally, on gen6 the number of push constants is even
 230     * lower.
 231     *
 232     * When there's relative addressing, we don't know what range of
 233     * Mesa IR registers can be accessed.  And generally, when relative
 234     * addressing is used we also have too many constants to load them
 235     * all as push constants.  So, we'll just support relative
 236     * addressing out of the pull constant buffers, and try to load as
 237     * many statically-accessed constants into the push constant buffer
 238     * as we can.
 239     */
 240    if (intel->gen >= 6) {
 241       /* We can only load 32 regs of push constants. */
 242       max_constant = 32 * 2 - c->key.nr_userclip_plane_consts;
 243    } else {
 244       max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
 245    }
 246
 247    /* constant_map maps from ParameterValues[] index to index in the
 248     * push constant buffer, or -1 if it's only in the pull constant
 249     * buffer.
 250     */
 251    memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
 252    for (i = 0;
 253         i < c->vp->program.Base.NumInstructions && constant < max_constant;
 254         i++) {
 255       struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
 256       int arg;
 257
 258       for (arg = 0; arg < 3 && constant < max_constant; arg++) {
 259          if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
 260              inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
 261              inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
 262              inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
 263              inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
 264             continue;
 265          }
 266
 267          if (inst->SrcReg[arg].RelAddr) {
 268             c->vp->use_const_buffer = true;
 269             continue;
 270          }
 271
 272          if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
 273             c->constant_map[inst->SrcReg[arg].Index] = constant++;
 274          }
 275       }
 276    }
 277
 278    /* If we ran out of push constant space, then we'll also upload all
 279     * constants through the pull constant buffer so that they can be
 280     * accessed no matter what.  For relative addressing (the common
 281     * case) we need them all in place anyway.
 282     */
 283    if (constant == max_constant)
 284       c->vp->use_const_buffer = true;
 285
 286    /* Set up the references to the pull parameters if present.  This backend
 287     * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
 288     * buffer, while the new VS backend allocates values to the pull buffer on
 289     * demand.
 290     */
 291    if (c->vp->use_const_buffer) {
 292       for (i = 0; i < params->NumParameters * 4; i++) {
 293          c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
 294       }
 295       c->prog_data.nr_pull_params = i;
 296    }
 297
 298    for (i = 0; i < constant; i++) {
 299       c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
 300                                                           (i % 2) * 4),
 301                                              0, 4, 1);
 302    }
 303    reg += (constant + 1) / 2;
 304    c->prog_data.curb_read_length = reg - 1;
 305    c->prog_data.nr_params = constant * 4;
 306    /* XXX 0 causes a bug elsewhere... */
 307    if (intel->gen < 6 && c->prog_data.nr_params == 0)
 308       c->prog_data.nr_params = 4;
 309
 310    /* Allocate input regs:
 311     */
 312    c->nr_inputs = 0;
 313    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 314       if (c->prog_data.inputs_read & BITFIELD64_BIT(i)) {
 315          c->nr_inputs++;
 316          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 317          reg++;
 318       }
 319    }
 320    /* If there are no inputs, we'll still be reading one attribute's worth
 321     * because it's required -- see urb_read_length setting.
 322     */
 323    if (c->nr_inputs == 0)
 324       reg++;
 325
 326    /* Allocate outputs.  The non-position outputs go straight into message regs.
 327     */
 328    c->first_output = reg;
 329
 330    first_reladdr_output = get_first_reladdr_output(&c->vp->program);
 331
 332    for (slot = 0; slot < c->prog_data.vue_map.num_slots; slot++) {
 333       int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
 334       assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT]));
 335       if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
 336          c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
 337       } else {
 338          c->regs[PROGRAM_OUTPUT][vert_result] = brw_vec8_grf(reg, 0);
 339          reg++;
 340       }
 341    }
 342
 343    /* Allocate program temporaries:
 344     */
 345    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 346       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 347       reg++;
 348    }
 349
 350    /* Address reg(s).  Don't try to use the internal address reg until
 351     * deref time.
 352     */
 353    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 354       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 355                                              reg,
 356                                              0,
 357                                              BRW_REGISTER_TYPE_D,
 358                                              BRW_VERTICAL_STRIDE_8,
 359                                              BRW_WIDTH_8,
 360                                              BRW_HORIZONTAL_STRIDE_1,
 361                                              BRW_SWIZZLE_XXXX,
 362                                              WRITEMASK_X);
 363       reg++;
 364    }
 365
 366    if (c->vp->use_const_buffer) {
 367       for (i = 0; i < 3; i++) {
 368          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 369          reg++;
 370       }
 371       clear_current_const(c);
 372    }
 373
 374    for (i = 0; i < 128; i++) {
 375       if (c->output_regs[i].used_in_src) {
 376          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 377          reg++;
 378       }
 379    }
 380
 381    if (c->needs_stack) {
 382       c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 383       reg += 2;
 384    }
 385
 386    /* Some opcodes need an internal temporary:
 387     */
 388    c->first_tmp = reg;
 389    c->last_tmp = reg;           /* for allocation purposes */
 390
 391    /* Each input reg holds data from two vertices.  The
 392     * urb_read_length is the number of registers read from *each*
 393     * vertex urb, so is half the amount:
 394     */
 395    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 396    /* Setting this field to 0 leads to undefined behavior according to the
 397     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 398     * sitting in them, even if it's padding.
 399     */
 400    if (c->prog_data.urb_read_length == 0)
 401       c->prog_data.urb_read_length = 1;
 402
 403    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 404     * them to fit the biggest thing they need to.
 405     */
 406    attributes_in_vue = MAX2(c->prog_data.vue_map.num_slots, c->nr_inputs);
 407
 408    if (intel->gen == 6) {
 409       /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
 410        * number of 128-byte (1024-bit) units.
 411        */
 412       c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 8) / 8;
 413    } else {
 414       /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
 415        * number of 64-byte (512-bit) units.
 416        */
 417       c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 4) / 4;
 418    }
 419
 420    c->prog_data.total_grf = reg;
 421
 422    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
 423       printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 424       printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 425       printf("%s reg = %d\n", __FUNCTION__, reg);
 426    }
 427 }
 428
 429
 430 /**
 431  * If an instruction uses a temp reg both as a src and the dest, we
 432  * sometimes need to allocate an intermediate temporary.
 433  */
 434 static void unalias1( struct brw_vs_compile *c,
 435                       struct brw_reg dst,
 436                       struct brw_reg arg0,
 437                       void (*func)( struct brw_vs_compile *,
 438                                     struct brw_reg,
 439                                     struct brw_reg ))
 440 {
 441    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 442       struct brw_compile *p = &c->func;
 443       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 444       func(c, tmp, arg0);
 445       brw_MOV(p, dst, tmp);
 446       release_tmp(c, tmp);
 447    }
 448    else {
 449       func(c, dst, arg0);
 450    }
 451 }
 452
 453 /**
 454  * \sa unalias2
 455  * Checkes if 2-operand instruction needs an intermediate temporary.
 456  */
 457 static void unalias2( struct brw_vs_compile *c,
 458                       struct brw_reg dst,
 459                       struct brw_reg arg0,
 460                       struct brw_reg arg1,
 461                       void (*func)( struct brw_vs_compile *,
 462                                     struct brw_reg,
 463                                     struct brw_reg,
 464                                     struct brw_reg ))
 465 {
 466    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 467        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 468       struct brw_compile *p = &c->func;
 469       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 470       func(c, tmp, arg0, arg1);
 471       brw_MOV(p, dst, tmp);
 472       release_tmp(c, tmp);
 473    }
 474    else {
 475       func(c, dst, arg0, arg1);
 476    }
 477 }
 478
 479 /**
 480  * \sa unalias2
 481  * Checkes if 3-operand instruction needs an intermediate temporary.
 482  */
 483 static void unalias3( struct brw_vs_compile *c,
 484                       struct brw_reg dst,
 485                       struct brw_reg arg0,
 486                       struct brw_reg arg1,
 487                       struct brw_reg arg2,
 488                       void (*func)( struct brw_vs_compile *,
 489                                     struct brw_reg,
 490                                     struct brw_reg,
 491                                     struct brw_reg,
 492                                     struct brw_reg ))
 493 {
 494    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 495        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 496        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 497       struct brw_compile *p = &c->func;
 498       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 499       func(c, tmp, arg0, arg1, arg2);
 500       brw_MOV(p, dst, tmp);
 501       release_tmp(c, tmp);
 502    }
 503    else {
 504       func(c, dst, arg0, arg1, arg2);
 505    }
 506 }
 507
 508 static void emit_sop( struct brw_vs_compile *c,
 509                       struct brw_reg dst,
 510                       struct brw_reg arg0,
 511                       struct brw_reg arg1,
 512                       GLuint cond)
 513 {
 514    struct brw_compile *p = &c->func;
 515
 516    brw_MOV(p, dst, brw_imm_f(0.0f));
 517    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 518    brw_MOV(p, dst, brw_imm_f(1.0f));
 519    brw_set_predicate_control_flag_value(p, 0xff);
 520 }
 521
 522 static void emit_seq( struct brw_vs_compile *c,
 523                       struct brw_reg dst,
 524                       struct brw_reg arg0,
 525                       struct brw_reg arg1 )
 526 {
 527    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 528 }
 529
 530 static void emit_sne( struct brw_vs_compile *c,
 531                       struct brw_reg dst,
 532                       struct brw_reg arg0,
 533                       struct brw_reg arg1 )
 534 {
 535    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 536 }
 537 static void emit_slt( struct brw_vs_compile *c,
 538                       struct brw_reg dst,
 539                       struct brw_reg arg0,
 540                       struct brw_reg arg1 )
 541 {
 542    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 543 }
 544
 545 static void emit_sle( struct brw_vs_compile *c,
 546                       struct brw_reg dst,
 547                       struct brw_reg arg0,
 548                       struct brw_reg arg1 )
 549 {
 550    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 551 }
 552
 553 static void emit_sgt( struct brw_vs_compile *c,
 554                       struct brw_reg dst,
 555                       struct brw_reg arg0,
 556                       struct brw_reg arg1 )
 557 {
 558    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 559 }
 560
 561 static void emit_sge( struct brw_vs_compile *c,
 562                       struct brw_reg dst,
 563                       struct brw_reg arg0,
 564                       struct brw_reg arg1 )
 565 {
 566   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 567 }
 568
 569 static void emit_cmp( struct brw_compile *p,
 570                       struct brw_reg dst,
 571                       struct brw_reg arg0,
 572                       struct brw_reg arg1,
 573                       struct brw_reg arg2 )
 574 {
 575    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 576    brw_SEL(p, dst, arg1, arg2);
 577    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 578 }
 579
 580 static void emit_sign(struct brw_vs_compile *c,
 581                       struct brw_reg dst,
 582                       struct brw_reg arg0)
 583 {
 584    struct brw_compile *p = &c->func;
 585
 586    brw_MOV(p, dst, brw_imm_f(0));
 587
 588    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 589    brw_MOV(p, dst, brw_imm_f(-1.0));
 590    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 591
 592    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
 593    brw_MOV(p, dst, brw_imm_f(1.0));
 594    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 595 }
 596
 597 static void emit_max( struct brw_compile *p,
 598                       struct brw_reg dst,
 599                       struct brw_reg arg0,
 600                       struct brw_reg arg1 )
 601 {
 602    struct intel_context *intel = &p->brw->intel;
 603
 604    if (intel->gen >= 6) {
 605       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
 606       brw_SEL(p, dst, arg0, arg1);
 607       brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 608       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 609    } else {
 610       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 611       brw_SEL(p, dst, arg0, arg1);
 612       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 613    }
 614 }
 615
 616 static void emit_min( struct brw_compile *p,
 617                       struct brw_reg dst,
 618                       struct brw_reg arg0,
 619                       struct brw_reg arg1 )
 620 {
 621    struct intel_context *intel = &p->brw->intel;
 622
 623    if (intel->gen >= 6) {
 624       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 625       brw_SEL(p, dst, arg0, arg1);
 626       brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 627       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 628    } else {
 629       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 630       brw_SEL(p, dst, arg0, arg1);
 631       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 632    }
 633 }
 634
 635 static void emit_arl(struct brw_compile *p,
 636                      struct brw_reg dst,
 637                      struct brw_reg src)
 638 {
 639    struct intel_context *intel = &p->brw->intel;
 640
 641    if (intel->gen >= 6) {
 642       struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
 643
 644       brw_RNDD(p, dst_f, src);
 645       brw_MOV(p, dst, dst_f);
 646    } else {
 647       brw_RNDD(p, dst, src);
 648    }
 649 }
 650
 651 static void emit_math1_gen4(struct brw_vs_compile *c,
 652                             GLuint function,
 653                             struct brw_reg dst,
 654                             struct brw_reg arg0,
 655                             GLuint precision)
 656 {
 657    /* There are various odd behaviours with SEND on the simulator.  In
 658     * addition there are documented issues with the fact that the GEN4
 659     * processor doesn't do dependency control properly on SEND
 660     * results.  So, on balance, this kludge to get around failures
 661     * with writemasked math results looks like it might be necessary
 662     * whether that turns out to be a simulator bug or not:
 663     */
 664    struct brw_compile *p = &c->func;
 665    struct brw_reg tmp = dst;
 666    bool need_tmp = false;
 667
 668    if (dst.file != BRW_GENERAL_REGISTER_FILE ||
 669        dst.dw1.bits.writemask != 0xf)
 670       need_tmp = true;
 671
 672    if (need_tmp)
 673       tmp = get_tmp(c);
 674
 675    brw_math(p,
 676             tmp,
 677             function,
 678             BRW_MATH_SATURATE_NONE,
 679             2,
 680             arg0,
 681             BRW_MATH_DATA_SCALAR,
 682             precision);
 683
 684    if (need_tmp) {
 685       brw_MOV(p, dst, tmp);
 686       release_tmp(c, tmp);
 687    }
 688 }
 689
 690 static void
 691 emit_math1_gen6(struct brw_vs_compile *c,
 692                 GLuint function,
 693                 struct brw_reg dst,
 694                 struct brw_reg arg0,
 695                 GLuint precision)
 696 {
 697    struct brw_compile *p = &c->func;
 698    struct brw_reg tmp_src, tmp_dst;
 699
 700    /* Something is strange on gen6 math in 16-wide mode, though the
 701     * docs say it's supposed to work.  Punt to using align1 mode,
 702     * which doesn't do writemasking and swizzles.
 703     */
 704    tmp_src = get_tmp(c);
 705    tmp_dst = get_tmp(c);
 706
 707    brw_MOV(p, tmp_src, arg0);
 708
 709    brw_set_access_mode(p, BRW_ALIGN_1);
 710    brw_math(p,
 711             tmp_dst,
 712             function,
 713             BRW_MATH_SATURATE_NONE,
 714             2,
 715             tmp_src,
 716             BRW_MATH_DATA_SCALAR,
 717             precision);
 718    brw_set_access_mode(p, BRW_ALIGN_16);
 719
 720    brw_MOV(p, dst, tmp_dst);
 721
 722    release_tmp(c, tmp_src);
 723    release_tmp(c, tmp_dst);
 724 }
 725
 726 static void
 727 emit_math1(struct brw_vs_compile *c,
 728            GLuint function,
 729            struct brw_reg dst,
 730            struct brw_reg arg0,
 731            GLuint precision)
 732 {
 733    struct brw_compile *p = &c->func;
 734    struct intel_context *intel = &p->brw->intel;
 735
 736    if (intel->gen >= 6)
 737       emit_math1_gen6(c, function, dst, arg0, precision);
 738    else
 739       emit_math1_gen4(c, function, dst, arg0, precision);
 740 }
 741
 742 static void emit_math2_gen4( struct brw_vs_compile *c,
 743                         GLuint function,
 744                         struct brw_reg dst,
 745                         struct brw_reg arg0,
 746                         struct brw_reg arg1,
 747                         GLuint precision)
 748 {
 749    struct brw_compile *p = &c->func;
 750    struct brw_reg tmp = dst;
 751    bool need_tmp = false;
 752
 753    if (dst.file != BRW_GENERAL_REGISTER_FILE ||
 754        dst.dw1.bits.writemask != 0xf)
 755       need_tmp = true;
 756
 757    if (need_tmp)
 758       tmp = get_tmp(c);
 759
 760    brw_MOV(p, brw_message_reg(3), arg1);
 761
 762    brw_math(p,
 763             tmp,
 764             function,
 765             BRW_MATH_SATURATE_NONE,
 766             2,
 767             arg0,
 768             BRW_MATH_DATA_SCALAR,
 769             precision);
 770
 771    if (need_tmp) {
 772       brw_MOV(p, dst, tmp);
 773       release_tmp(c, tmp);
 774    }
 775 }
 776
 777 static void emit_math2_gen6( struct brw_vs_compile *c,
 778                         GLuint function,
 779                         struct brw_reg dst,
 780                         struct brw_reg arg0,
 781                         struct brw_reg arg1,
 782                         GLuint precision)
 783 {
 784    struct brw_compile *p = &c->func;
 785    struct brw_reg tmp_src0, tmp_src1, tmp_dst;
 786
 787    tmp_src0 = get_tmp(c);
 788    tmp_src1 = get_tmp(c);
 789    tmp_dst = get_tmp(c);
 790
 791    brw_MOV(p, tmp_src0, arg0);
 792    brw_MOV(p, tmp_src1, arg1);
 793
 794    brw_set_access_mode(p, BRW_ALIGN_1);
 795    brw_math2(p,
 796             tmp_dst,
 797             function,
 798             tmp_src0,
 799             tmp_src1);
 800    brw_set_access_mode(p, BRW_ALIGN_16);
 801
 802    brw_MOV(p, dst, tmp_dst);
 803
 804    release_tmp(c, tmp_src0);
 805    release_tmp(c, tmp_src1);
 806    release_tmp(c, tmp_dst);
 807 }
 808
 809 static void emit_math2( struct brw_vs_compile *c,
 810                         GLuint function,
 811                         struct brw_reg dst,
 812                         struct brw_reg arg0,
 813                         struct brw_reg arg1,
 814                         GLuint precision)
 815 {
 816    struct brw_compile *p = &c->func;
 817    struct intel_context *intel = &p->brw->intel;
 818
 819    if (intel->gen >= 6)
 820       emit_math2_gen6(c, function, dst, arg0, arg1, precision);
 821    else
 822       emit_math2_gen4(c, function, dst, arg0, arg1, precision);
 823 }
 824
 825 static void emit_exp_noalias( struct brw_vs_compile *c,
 826                               struct brw_reg dst,
 827                               struct brw_reg arg0 )
 828 {
 829    struct brw_compile *p = &c->func;
 830
 831
 832    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 833       struct brw_reg tmp = get_tmp(c);
 834       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 835
 836       /* tmp_d = floor(arg0.x) */
 837       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 838
 839       /* result[0] = 2.0 ^ tmp */
 840
 841       /* Adjust exponent for floating point:
 842        * exp += 127
 843        */
 844       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 845
 846       /* Install exponent and sign.
 847        * Excess drops off the edge:
 848        */
 849       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 850               tmp_d, brw_imm_d(23));
 851
 852       release_tmp(c, tmp);
 853    }
 854
 855    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 856       /* result[1] = arg0.x - floor(arg0.x) */
 857       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 858    }
 859
 860    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 861       /* As with the LOG instruction, we might be better off just
 862        * doing a taylor expansion here, seeing as we have to do all
 863        * the prep work.
 864        *
 865        * If mathbox partial precision is too low, consider also:
 866        * result[3] = result[0] * EXP(result[1])
 867        */
 868       emit_math1(c,
 869                  BRW_MATH_FUNCTION_EXP,
 870                  brw_writemask(dst, WRITEMASK_Z),
 871                  brw_swizzle1(arg0, 0),
 872                  BRW_MATH_PRECISION_FULL);
 873    }
 874
 875    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 876       /* result[3] = 1.0; */
 877       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 878    }
 879 }
 880
 881
 882 static void emit_log_noalias( struct brw_vs_compile *c,
 883                               struct brw_reg dst,
 884                               struct brw_reg arg0 )
 885 {
 886    struct brw_compile *p = &c->func;
 887    struct brw_reg tmp = dst;
 888    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 889    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 890    bool need_tmp = (dst.dw1.bits.writemask != 0xf ||
 891                          dst.file != BRW_GENERAL_REGISTER_FILE);
 892
 893    if (need_tmp) {
 894       tmp = get_tmp(c);
 895       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 896    }
 897
 898    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 899     * according to spec:
 900     *
 901     * These almost look likey they could be joined up, but not really
 902     * practical:
 903     *
 904     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 905     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 906     */
 907    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 908       brw_AND(p,
 909               brw_writemask(tmp_ud, WRITEMASK_X),
 910               brw_swizzle1(arg0_ud, 0),
 911               brw_imm_ud((1U<<31)-1));
 912
 913       brw_SHR(p,
 914               brw_writemask(tmp_ud, WRITEMASK_X),
 915               tmp_ud,
 916               brw_imm_ud(23));
 917
 918       brw_ADD(p,
 919               brw_writemask(tmp, WRITEMASK_X),
 920               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 921               brw_imm_d(-127));
 922    }
 923
 924    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 925       brw_AND(p,
 926               brw_writemask(tmp_ud, WRITEMASK_Y),
 927               brw_swizzle1(arg0_ud, 0),
 928               brw_imm_ud((1<<23)-1));
 929
 930       brw_OR(p,
 931              brw_writemask(tmp_ud, WRITEMASK_Y),
 932              tmp_ud,
 933              brw_imm_ud(127<<23));
 934    }
 935
 936    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 937       /* result[2] = result[0] + LOG2(result[1]); */
 938
 939       /* Why bother?  The above is just a hint how to do this with a
 940        * taylor series.  Maybe we *should* use a taylor series as by
 941        * the time all the above has been done it's almost certainly
 942        * quicker than calling the mathbox, even with low precision.
 943        *
 944        * Options are:
 945        *    - result[0] + mathbox.LOG2(result[1])
 946        *    - mathbox.LOG2(arg0.x)
 947        *    - result[0] + inline_taylor_approx(result[1])
 948        */
 949       emit_math1(c,
 950                  BRW_MATH_FUNCTION_LOG,
 951                  brw_writemask(tmp, WRITEMASK_Z),
 952                  brw_swizzle1(tmp, 1),
 953                  BRW_MATH_PRECISION_FULL);
 954
 955       brw_ADD(p,
 956               brw_writemask(tmp, WRITEMASK_Z),
 957               brw_swizzle1(tmp, 2),
 958               brw_swizzle1(tmp, 0));
 959    }
 960
 961    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 962       /* result[3] = 1.0; */
 963       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 964    }
 965
 966    if (need_tmp) {
 967       brw_MOV(p, dst, tmp);
 968       release_tmp(c, tmp);
 969    }
 970 }
 971
 972
 973 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 974  */
 975 static void emit_dst_noalias( struct brw_vs_compile *c,
 976                               struct brw_reg dst,
 977                               struct brw_reg arg0,
 978                               struct brw_reg arg1)
 979 {
 980    struct brw_compile *p = &c->func;
 981
 982    /* There must be a better way to do this:
 983     */
 984    if (dst.dw1.bits.writemask & WRITEMASK_X)
 985       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 986    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 987       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 988    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 989       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 990    if (dst.dw1.bits.writemask & WRITEMASK_W)
 991       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 992 }
 993
 994
 995 static void emit_xpd( struct brw_compile *p,
 996                       struct brw_reg dst,
 997                       struct brw_reg t,
 998                       struct brw_reg u)
 999 {
1000    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
1001    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
1002 }
1003
1004
1005 static void emit_lit_noalias( struct brw_vs_compile *c,
1006                               struct brw_reg dst,
1007                               struct brw_reg arg0 )
1008 {
1009    struct brw_compile *p = &c->func;
1010    struct brw_reg tmp = dst;
1011    bool need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1012
1013    if (need_tmp)
1014       tmp = get_tmp(c);
1015
1016    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
1017    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
1018
1019    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1020     * to get all channels active inside the IF.  In the clipping code
1021     * we run with NoMask, so it's not an option and we can use
1022     * BRW_EXECUTE_1 for all comparisions.
1023     */
1024    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
1025    brw_IF(p, BRW_EXECUTE_8);
1026    {
1027       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
1028
1029       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
1030       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
1031       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1032
1033       emit_math2(c,
1034                  BRW_MATH_FUNCTION_POW,
1035                  brw_writemask(dst, WRITEMASK_Z),
1036                  brw_swizzle1(tmp, 2),
1037                  brw_swizzle1(arg0, 3),
1038                  BRW_MATH_PRECISION_PARTIAL);
1039    }
1040    brw_ENDIF(p);
1041
1042    release_tmp(c, tmp);
1043 }
1044
1045 static void emit_lrp_noalias(struct brw_vs_compile *c,
1046                              struct brw_reg dst,
1047                              struct brw_reg arg0,
1048                              struct brw_reg arg1,
1049                              struct brw_reg arg2)
1050 {
1051    struct brw_compile *p = &c->func;
1052
1053    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
1054    brw_MUL(p, brw_null_reg(), dst, arg2);
1055    brw_MAC(p, dst, arg0, arg1);
1056 }
1057
1058 static struct brw_reg
1059 get_constant(struct brw_vs_compile *c,
1060              const struct prog_instruction *inst,
1061              GLuint argIndex)
1062 {
1063    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1064    struct brw_compile *p = &c->func;
1065    struct brw_reg const_reg = c->current_const[argIndex].reg;
1066
1067    assert(argIndex < 3);
1068
1069    if (c->current_const[argIndex].index != src->Index) {
1070       /* Keep track of the last constant loaded in this slot, for reuse. */
1071       c->current_const[argIndex].index = src->Index;
1072
1073 #if 0
1074       printf("  fetch const[%d] for arg %d into reg %d\n",
1075              src->Index, argIndex, c->current_const[argIndex].reg.nr);
1076 #endif
1077       /* need to fetch the constant now */
1078       brw_dp_READ_4_vs(p,
1079                        const_reg,                     /* writeback dest */
1080                        16 * src->Index,               /* byte offset */
1081                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
1082                        );
1083    }
1084
1085    /* replicate lower four floats into upper half (to get XYZWXYZW) */
1086    const_reg = stride(const_reg, 0, 4, 1);
1087    const_reg.subnr = 0;
1088
1089    return const_reg;
1090 }
1091
1092 static struct brw_reg
1093 get_reladdr_constant(struct brw_vs_compile *c,
1094                      const struct prog_instruction *inst,
1095                      GLuint argIndex)
1096 {
1097    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1098    struct brw_compile *p = &c->func;
1099    struct brw_context *brw = p->brw;
1100    struct intel_context *intel = &brw->intel;
1101    struct brw_reg const_reg = c->current_const[argIndex].reg;
1102    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1103    uint32_t offset;
1104
1105    assert(argIndex < 3);
1106
1107    /* Can't reuse a reladdr constant load. */
1108    c->current_const[argIndex].index = -1;
1109
1110  #if 0
1111    printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
1112           src->Index, argIndex, c->current_const[argIndex].reg.nr);
1113 #endif
1114
1115    if (intel->gen >= 6) {
1116       offset = src->Index;
1117    } else {
1118       struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
1119       brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
1120       addr_reg = byte_addr_reg;
1121       offset = 16 * src->Index;
1122    }
1123
1124    /* fetch the first vec4 */
1125    brw_dp_READ_4_vs_relative(p,
1126                              const_reg,
1127                              addr_reg,
1128                              offset,
1129                              SURF_INDEX_VERT_CONST_BUFFER);
1130
1131    return const_reg;
1132 }
1133
1134
1135
1136 /* TODO: relative addressing!
1137  */
1138 static struct brw_reg get_reg( struct brw_vs_compile *c,
1139                                gl_register_file file,
1140                                GLuint index )
1141 {
1142    switch (file) {
1143    case PROGRAM_TEMPORARY:
1144    case PROGRAM_INPUT:
1145    case PROGRAM_OUTPUT:
1146       assert(c->regs[file][index].nr != 0);
1147       return c->regs[file][index];
1148    case PROGRAM_STATE_VAR:
1149    case PROGRAM_CONSTANT:
1150    case PROGRAM_UNIFORM:
1151       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1152       return c->regs[PROGRAM_STATE_VAR][index];
1153    case PROGRAM_ADDRESS:
1154       assert(index == 0);
1155       return c->regs[file][index];
1156
1157    case PROGRAM_UNDEFINED:                      /* undef values */
1158       return brw_null_reg();
1159
1160    case PROGRAM_LOCAL_PARAM:
1161    case PROGRAM_ENV_PARAM:
1162    case PROGRAM_WRITE_ONLY:
1163    default:
1164       assert(0);
1165       return brw_null_reg();
1166    }
1167 }
1168
1169
1170 /**
1171  * Indirect addressing:  get reg[[arg] + offset].
1172  */
1173 static struct brw_reg deref( struct brw_vs_compile *c,
1174                              struct brw_reg arg,
1175                              GLint offset,
1176                              GLuint reg_size )
1177 {
1178    struct brw_compile *p = &c->func;
1179    struct brw_reg tmp = get_tmp(c);
1180    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1181    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1182    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1183    struct brw_reg indirect = brw_vec4_indirect(0,0);
1184    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1185
1186    /* Set the vertical stride on the register access so that the first
1187     * 4 components come from a0.0 and the second 4 from a0.1.
1188     */
1189    indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1190
1191    {
1192       brw_push_insn_state(p);
1193       brw_set_access_mode(p, BRW_ALIGN_1);
1194
1195       brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1196       brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1197
1198       brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1199       brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1200
1201       brw_MOV(p, tmp, indirect);
1202
1203       brw_pop_insn_state(p);
1204    }
1205
1206    /* NOTE: tmp not released */
1207    return tmp;
1208 }
1209
1210 static void
1211 move_to_reladdr_dst(struct brw_vs_compile *c,
1212                     const struct prog_instruction *inst,
1213                     struct brw_reg val)
1214 {
1215    struct brw_compile *p = &c->func;
1216    int reg_size = 32;
1217    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1218    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1219    struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
1220    GLuint byte_offset = base.nr * 32 + base.subnr;
1221    struct brw_reg indirect = brw_vec4_indirect(0,0);
1222    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1223
1224    /* Because destination register indirect addressing can only use
1225     * one index, we'll write each vertex's vec4 value separately.
1226     */
1227    val.width = BRW_WIDTH_4;
1228    val.vstride = BRW_VERTICAL_STRIDE_4;
1229
1230    brw_push_insn_state(p);
1231    brw_set_access_mode(p, BRW_ALIGN_1);
1232
1233    brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1234    brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1235    brw_MOV(p, indirect, val);
1236
1237    brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1238    brw_ADD(p, brw_address_reg(0), acc,
1239            brw_imm_uw(byte_offset + reg_size / 2));
1240    brw_MOV(p, indirect, suboffset(val, 4));
1241
1242    brw_pop_insn_state(p);
1243 }
1244
1245 /**
1246  * Get brw reg corresponding to the instruction's [argIndex] src reg.
1247  * TODO: relative addressing!
1248  */
1249 static struct brw_reg
1250 get_src_reg( struct brw_vs_compile *c,
1251              const struct prog_instruction *inst,
1252              GLuint argIndex )
1253 {
1254    const GLuint file = inst->SrcReg[argIndex].File;
1255    const GLint index = inst->SrcReg[argIndex].Index;
1256    const bool relAddr = inst->SrcReg[argIndex].RelAddr;
1257
1258    if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1259       const struct prog_src_register *src = &inst->SrcReg[argIndex];
1260
1261       if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1262                                         SWIZZLE_ZERO,
1263                                         SWIZZLE_ZERO,
1264                                         SWIZZLE_ZERO)) {
1265           return brw_imm_f(0.0f);
1266       } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1267                                                SWIZZLE_ONE,
1268                                                SWIZZLE_ONE,
1269                                                SWIZZLE_ONE)) {
1270          if (src->Negate)
1271             return brw_imm_f(-1.0F);
1272          else
1273             return brw_imm_f(1.0F);
1274       } else if (src->File == PROGRAM_CONSTANT) {
1275          const struct gl_program_parameter_list *params;
1276          float f;
1277          int component = -1;
1278
1279          switch (src->Swizzle) {
1280          case SWIZZLE_XXXX:
1281             component = 0;
1282             break;
1283          case SWIZZLE_YYYY:
1284             component = 1;
1285             break;
1286          case SWIZZLE_ZZZZ:
1287             component = 2;
1288             break;
1289          case SWIZZLE_WWWW:
1290             component = 3;
1291             break;
1292          }
1293
1294          if (component >= 0) {
1295             params = c->vp->program.Base.Parameters;
1296             f = params->ParameterValues[src->Index][component].f;
1297
1298             if (src->Abs)
1299                f = fabs(f);
1300             if (src->Negate)
1301                f = -f;
1302             return brw_imm_f(f);
1303          }
1304       }
1305    }
1306
1307    switch (file) {
1308    case PROGRAM_TEMPORARY:
1309    case PROGRAM_INPUT:
1310    case PROGRAM_OUTPUT:
1311       if (relAddr) {
1312          return deref(c, c->regs[file][0], index, 32);
1313       }
1314       else {
1315          assert(c->regs[file][index].nr != 0);
1316          return c->regs[file][index];
1317       }
1318
1319    case PROGRAM_STATE_VAR:
1320    case PROGRAM_CONSTANT:
1321    case PROGRAM_UNIFORM:
1322    case PROGRAM_ENV_PARAM:
1323    case PROGRAM_LOCAL_PARAM:
1324       if (!relAddr && c->constant_map[index] != -1) {
1325          /* Take from the push constant buffer if possible. */
1326          assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1327          return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1328       } else {
1329          /* Must be in the pull constant buffer then .*/
1330          assert(c->vp->use_const_buffer);
1331          if (relAddr)
1332             return get_reladdr_constant(c, inst, argIndex);
1333          else
1334             return get_constant(c, inst, argIndex);
1335       }
1336    case PROGRAM_ADDRESS:
1337       assert(index == 0);
1338       return c->regs[file][index];
1339
1340    case PROGRAM_UNDEFINED:
1341       /* this is a normal case since we loop over all three src args */
1342       return brw_null_reg();
1343
1344    case PROGRAM_WRITE_ONLY:
1345    default:
1346       assert(0);
1347       return brw_null_reg();
1348    }
1349 }
1350
1351 /**
1352  * Return the brw reg for the given instruction's src argument.
1353  * Will return mangled results for SWZ op.  The emit_swz() function
1354  * ignores this result and recalculates taking extended swizzles into
1355  * account.
1356  */
1357 static struct brw_reg get_arg( struct brw_vs_compile *c,
1358                                const struct prog_instruction *inst,
1359                                GLuint argIndex )
1360 {
1361    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1362    struct brw_reg reg;
1363
1364    if (src->File == PROGRAM_UNDEFINED)
1365       return brw_null_reg();
1366
1367    reg = get_src_reg(c, inst, argIndex);
1368
1369    /* Convert 3-bit swizzle to 2-bit.
1370     */
1371    if (reg.file != BRW_IMMEDIATE_VALUE) {
1372       reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1373                                           GET_SWZ(src->Swizzle, 1),
1374                                           GET_SWZ(src->Swizzle, 2),
1375                                           GET_SWZ(src->Swizzle, 3));
1376
1377       /* Note this is ok for non-swizzle ARB_vp instructions */
1378       reg.negate = src->Negate ? 1 : 0;
1379    }
1380
1381    return reg;
1382 }
1383
1384
1385 /**
1386  * Get brw register for the given program dest register.
1387  */
1388 static struct brw_reg get_dst( struct brw_vs_compile *c,
1389                                struct prog_dst_register dst )
1390 {
1391    struct brw_reg reg;
1392
1393    switch (dst.File) {
1394    case PROGRAM_TEMPORARY:
1395    case PROGRAM_OUTPUT:
1396       /* register-indirect addressing is only 1x1, not VxH, for
1397        * destination regs.  So, for RelAddr we'll return a temporary
1398        * for the dest and do a move of the result to the RelAddr
1399        * register after the instruction emit.
1400        */
1401       if (dst.RelAddr) {
1402          reg = get_tmp(c);
1403       } else {
1404          assert(c->regs[dst.File][dst.Index].nr != 0);
1405          reg = c->regs[dst.File][dst.Index];
1406       }
1407       break;
1408    case PROGRAM_ADDRESS:
1409       assert(dst.Index == 0);
1410       reg = c->regs[dst.File][dst.Index];
1411       break;
1412    case PROGRAM_UNDEFINED:
1413       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1414       reg = brw_null_reg();
1415       break;
1416    default:
1417       assert(0);
1418       reg = brw_null_reg();
1419    }
1420
1421    assert(reg.type != BRW_IMMEDIATE_VALUE);
1422    reg.dw1.bits.writemask = dst.WriteMask;
1423
1424    return reg;
1425 }
1426
1427
1428 static void emit_swz( struct brw_vs_compile *c,
1429                       struct brw_reg dst,
1430                       const struct prog_instruction *inst)
1431 {
1432    const GLuint argIndex = 0;
1433    const struct prog_src_register src = inst->SrcReg[argIndex];
1434    struct brw_compile *p = &c->func;
1435    GLuint zeros_mask = 0;
1436    GLuint ones_mask = 0;
1437    GLuint src_mask = 0;
1438    GLubyte src_swz[4];
1439    bool need_tmp = (src.Negate &&
1440                          dst.file != BRW_GENERAL_REGISTER_FILE);
1441    struct brw_reg tmp = dst;
1442    GLuint i;
1443
1444    if (need_tmp)
1445       tmp = get_tmp(c);
1446
1447    for (i = 0; i < 4; i++) {
1448       if (dst.dw1.bits.writemask & (1<<i)) {
1449          GLubyte s = GET_SWZ(src.Swizzle, i);
1450          switch (s) {
1451          case SWIZZLE_X:
1452          case SWIZZLE_Y:
1453          case SWIZZLE_Z:
1454          case SWIZZLE_W:
1455             src_mask |= 1<<i;
1456             src_swz[i] = s;
1457             break;
1458          case SWIZZLE_ZERO:
1459             zeros_mask |= 1<<i;
1460             break;
1461          case SWIZZLE_ONE:
1462             ones_mask |= 1<<i;
1463             break;
1464          }
1465       }
1466    }
1467
1468    /* Do src first, in case dst aliases src:
1469     */
1470    if (src_mask) {
1471       struct brw_reg arg0;
1472
1473       arg0 = get_src_reg(c, inst, argIndex);
1474
1475       arg0 = brw_swizzle(arg0,
1476                          src_swz[0], src_swz[1],
1477                          src_swz[2], src_swz[3]);
1478
1479       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1480    }
1481
1482    if (zeros_mask)
1483       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1484
1485    if (ones_mask)
1486       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1487
1488    if (src.Negate)
1489       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1490
1491    if (need_tmp) {
1492       brw_MOV(p, dst, tmp);
1493       release_tmp(c, tmp);
1494    }
1495 }
1496
1497 static int
1498 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1499 {
1500    struct intel_context *intel = &brw->intel;
1501
1502    if (intel->gen >= 6) {
1503       /* URB data written (does not include the message header reg) must
1504        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1505        * section 5.4.3.2.2: URB_INTERLEAVED.
1506        *
1507        * URB entries are allocated on a multiple of 1024 bits, so an
1508        * extra 128 bits written here to make the end align to 256 is
1509        * no problem.
1510        */
1511       if ((mlen % 2) != 1)
1512          mlen++;
1513    }
1514
1515    return mlen;
1516 }
1517
1518 /**
1519  * Post-vertex-program processing.  Send the results to the URB.
1520  */
1521 static void emit_vertex_write( struct brw_vs_compile *c)
1522 {
1523    struct brw_compile *p = &c->func;
1524    struct brw_context *brw = p->brw;
1525    struct intel_context *intel = &brw->intel;
1526    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1527    struct brw_reg ndc;
1528    int eot;
1529    GLuint len_vertex_header = 2;
1530    int i;
1531    int msg_len;
1532    int slot;
1533
1534    if (c->key.copy_edgeflag) {
1535       brw_MOV(p,
1536               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1537               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1538    }
1539
1540    if (intel->gen < 6) {
1541       /* Build ndc coords */
1542       ndc = get_tmp(c);
1543       /* ndc = 1.0 / pos.w */
1544       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1545       /* ndc.xyz = pos * ndc */
1546       brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1547    }
1548
1549    /* Update the header for point size, user clipping flags, and -ve rhw
1550     * workaround.
1551     */
1552    if (intel->gen >= 6) {
1553       struct brw_reg m1 = brw_message_reg(1);
1554
1555       /* On gen6, m1 has each value in a separate dword, so we never
1556        * need to mess with a temporary for computing the m1 value.
1557        */
1558       brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1559       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1560          brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1561                  brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1562       }
1563
1564       /* Set the user clip distances in dword 8-15. (m3-4)*/
1565       if (c->key.userclip_active) {
1566          for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1567             struct brw_reg m;
1568             if (i < 4)
1569                m = brw_message_reg(3);
1570             else
1571                m = brw_message_reg(4);
1572
1573             brw_DP4(p, brw_writemask(m, (1 << (i & 3))),pos, c->userplane[i]);
1574          }
1575       }
1576    } else if ((c->prog_data.outputs_written &
1577                BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1578               c->key.userclip_active || brw->has_negative_rhw_bug) {
1579       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1580       GLuint i;
1581
1582       brw_MOV(p, header1, brw_imm_ud(0));
1583
1584       brw_set_access_mode(p, BRW_ALIGN_16);
1585
1586       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1587          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1588          brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1589                  brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1590          brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1591                  header1, brw_imm_ud(0x7ff<<8));
1592       }
1593
1594       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1595          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1596          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1597          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1598          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1599       }
1600
1601       /* i965 clipping workaround:
1602        * 1) Test for -ve rhw
1603        * 2) If set,
1604        *      set ndc = (0,0,0,0)
1605        *      set ucp[6] = 1
1606        *
1607        * Later, clipping will detect ucp[6] and ensure the primitive is
1608        * clipped against all fixed planes.
1609        */
1610       if (brw->has_negative_rhw_bug) {
1611          brw_CMP(p,
1612                  vec8(brw_null_reg()),
1613                  BRW_CONDITIONAL_L,
1614                  brw_swizzle1(ndc, 3),
1615                  brw_imm_f(0));
1616
1617          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1618          brw_MOV(p, ndc, brw_imm_f(0));
1619          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1620       }
1621
1622       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1623       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1624       brw_set_access_mode(p, BRW_ALIGN_16);
1625
1626       release_tmp(c, header1);
1627    }
1628    else {
1629       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1630    }
1631
1632    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1633     * of zeros followed by two sets of NDC coordinates:
1634     */
1635    brw_set_access_mode(p, BRW_ALIGN_1);
1636    brw_set_acc_write_control(p, 0);
1637
1638    /* The VUE layout is documented in Volume 2a. */
1639    if (intel->gen >= 6) {
1640       /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1641        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1642        * dword 4-7 (m2) is the 4D space position
1643        * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1644        * enabled.
1645        * m3 or 5 is the first vertex element data we fill, which is
1646        * the vertex position.
1647        */
1648       brw_MOV(p, brw_message_reg(2), pos);
1649       len_vertex_header = 1;
1650       if (c->key.userclip_active)
1651          len_vertex_header += 2;
1652    } else if (intel->gen == 5) {
1653       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1654        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1655        * dword 4-7 (m2) is the ndc position (set above)
1656        * dword 8-11 (m3) of the vertex header is the 4D space position
1657        * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1658        * m6 is a pad so that the vertex element data is aligned
1659        * m7 is the first vertex data we fill, which is the vertex position.
1660        */
1661       brw_MOV(p, brw_message_reg(2), ndc);
1662       brw_MOV(p, brw_message_reg(3), pos);
1663       brw_MOV(p, brw_message_reg(7), pos);
1664       len_vertex_header = 6;
1665    } else {
1666       /* There are 8 dwords in VUE header pre-Ironlake:
1667        * dword 0-3 (m1) is indices, point width, clip flags.
1668        * dword 4-7 (m2) is ndc position (set above)
1669        *
1670        * dword 8-11 (m3) is the first vertex data, which we always have be the
1671        * vertex position.
1672        */
1673       brw_MOV(p, brw_message_reg(2), ndc);
1674       brw_MOV(p, brw_message_reg(3), pos);
1675       len_vertex_header = 2;
1676    }
1677
1678    /* Move variable-addressed, non-overflow outputs to their MRFs. */
1679    for (slot = len_vertex_header; slot < c->prog_data.vue_map.num_slots; ++slot) {
1680       if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE)
1681          break;
1682
1683       int mrf = slot + 1;
1684       int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
1685       if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
1686           BRW_GENERAL_REGISTER_FILE) {
1687          brw_MOV(p, brw_message_reg(mrf),
1688                  c->regs[PROGRAM_OUTPUT][vert_result]);
1689       }
1690    }
1691
1692    eot = (slot >= c->prog_data.vue_map.num_slots);
1693
1694    /* Message header, plus the (first part of the) VUE. */
1695    msg_len = 1 + slot;
1696    msg_len = align_interleaved_urb_mlen(brw, msg_len);
1697    /* Any outputs beyond BRW_MAX_MRF should be in the second URB write */
1698    assert (msg_len <= BRW_MAX_MRF - 1);
1699
1700    brw_urb_WRITE(p,
1701                  brw_null_reg(), /* dest */
1702                  0,             /* starting mrf reg nr */
1703                  c->r0,         /* src */
1704                  0,             /* allocate */
1705                  1,             /* used */
1706                  msg_len,
1707                  0,             /* response len */
1708                  eot,           /* eot */
1709                  eot,           /* writes complete */
1710                  0,             /* urb destination offset */
1711                  BRW_URB_SWIZZLE_INTERLEAVE);
1712
1713    if (slot < c->prog_data.vue_map.num_slots) {
1714       /* Not all of the vertex outputs/results fit into the MRF.
1715        * Move the overflowed attributes from the GRF to the MRF and
1716        * issue another brw_urb_WRITE().
1717        */
1718       GLuint mrf = 1;
1719       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
1720          int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
1721          /* move from GRF to MRF */
1722          brw_MOV(p, brw_message_reg(mrf),
1723                  c->regs[PROGRAM_OUTPUT][vert_result]);
1724          mrf++;
1725       }
1726
1727       brw_urb_WRITE(p,
1728                     brw_null_reg(), /* dest */
1729                     0,              /* starting mrf reg nr */
1730                     c->r0,          /* src */
1731                     0,              /* allocate */
1732                     1,              /* used */
1733                     align_interleaved_urb_mlen(brw, mrf),
1734                     0,              /* response len */
1735                     1,              /* eot */
1736                     1,              /* writes complete */
1737                     MAX_SLOTS_IN_FIRST_URB_WRITE / 2,  /* urb destination offset */
1738                     BRW_URB_SWIZZLE_INTERLEAVE);
1739    }
1740 }
1741
1742 static bool
1743 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1744 {
1745    struct brw_compile *p = &c->func;
1746    struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1747
1748    if (p->nr_insn == 0)
1749       return false;
1750
1751    if (val.address_mode != BRW_ADDRESS_DIRECT)
1752       return false;
1753
1754    if (val.negate || val.abs)
1755       return false;
1756
1757    switch (prev_insn->header.opcode) {
1758    case BRW_OPCODE_MOV:
1759    case BRW_OPCODE_MAC:
1760    case BRW_OPCODE_MUL:
1761       if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1762           prev_insn->header.execution_size == val.width &&
1763           prev_insn->bits1.da1.dest_reg_file == val.file &&
1764           prev_insn->bits1.da1.dest_reg_type == val.type &&
1765           prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1766           prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1767           prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1768           prev_insn->bits1.da16.dest_writemask == 0xf)
1769          return true;
1770       else
1771          return false;
1772    default:
1773       return false;
1774    }
1775 }
1776
1777 static uint32_t
1778 get_predicate(const struct prog_instruction *inst)
1779 {
1780    if (inst->DstReg.CondMask == COND_TR)
1781       return BRW_PREDICATE_NONE;
1782
1783    /* All of GLSL only produces predicates for COND_NE and one channel per
1784     * vector.  Fail badly if someone starts doing something else, as it might
1785     * mean infinite looping or something.
1786     *
1787     * We'd like to support all the condition codes, but our hardware doesn't
1788     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1789     * those, the instruction may update the condition codes or not, then any
1790     * later instruction may use one of those condition codes.  For gen4, the
1791     * instruction may update the flags register based on one of the condition
1792     * codes output by the instruction, and then further instructions may
1793     * predicate on that.  We can probably support this, but it won't
1794     * necessarily be easy.
1795     */
1796    assert(inst->DstReg.CondMask == COND_NE);
1797
1798    switch (inst->DstReg.CondSwizzle) {
1799    case SWIZZLE_XXXX:
1800       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1801    case SWIZZLE_YYYY:
1802       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1803    case SWIZZLE_ZZZZ:
1804       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1805    case SWIZZLE_WWWW:
1806       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1807    default:
1808       _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1809                     inst->DstReg.CondMask);
1810       return BRW_PREDICATE_NORMAL;
1811    }
1812 }
1813
1814 static void
1815 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1816 {
1817    struct brw_compile *p = &c->func;
1818    int i;
1819
1820    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1821       if (!(c->prog_data.inputs_read & BITFIELD64_BIT(i)))
1822          continue;
1823
1824       if (c->key.gl_fixed_input_size[i] != 0) {
1825          struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1826
1827          brw_MUL(p,
1828                  brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1829                  reg, brw_imm_f(1.0 / 65536.0));
1830       }
1831    }
1832 }
1833
1834 /* Emit the vertex program instructions here.
1835  */
1836 void brw_old_vs_emit(struct brw_vs_compile *c )
1837 {
1838 #define MAX_IF_DEPTH 32
1839 #define MAX_LOOP_DEPTH 32
1840    struct brw_compile *p = &c->func;
1841    struct brw_context *brw = p->brw;
1842    struct intel_context *intel = &brw->intel;
1843    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1844    GLuint insn;
1845    const struct brw_indirect stack_index = brw_indirect(0, 0);
1846    GLuint index;
1847    GLuint file;
1848
1849    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1850       printf("vs-mesa:\n");
1851       _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1852                                true);
1853       printf("\n");
1854    }
1855
1856    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1857    brw_set_access_mode(p, BRW_ALIGN_16);
1858
1859    brw_set_acc_write_control(p, 1);
1860
1861    for (insn = 0; insn < nr_insns; insn++) {
1862        GLuint i;
1863        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1864
1865        /* Message registers can't be read, so copy the output into GRF
1866         * register if they are used in source registers
1867         */
1868        for (i = 0; i < 3; i++) {
1869            struct prog_src_register *src = &inst->SrcReg[i];
1870            GLuint index = src->Index;
1871            GLuint file = src->File;
1872            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1873                c->output_regs[index].used_in_src = true;
1874        }
1875
1876        switch (inst->Opcode) {
1877        case OPCODE_CAL:
1878        case OPCODE_RET:
1879           c->needs_stack = true;
1880           break;
1881        default:
1882           break;
1883        }
1884    }
1885
1886    /* Static register allocation
1887     */
1888    brw_vs_alloc_regs(c);
1889
1890    brw_vs_rescale_gl_fixed(c);
1891
1892    if (c->needs_stack)
1893       brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1894
1895    for (insn = 0; insn < nr_insns; insn++) {
1896
1897       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1898       struct brw_reg args[3], dst;
1899       GLuint i;
1900
1901 #if 0
1902       printf("%d: ", insn);
1903       _mesa_print_instruction(inst);
1904 #endif
1905
1906       /* Get argument regs.  SWZ is special and does this itself.
1907        */
1908       if (inst->Opcode != OPCODE_SWZ)
1909           for (i = 0; i < 3; i++) {
1910               const struct prog_src_register *src = &inst->SrcReg[i];
1911               index = src->Index;
1912               file = src->File;
1913               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
1914                  /* Can't just make get_arg "do the right thing" here because
1915                   * other callers of get_arg and get_src_reg don't expect any
1916                   * special behavior for the c->output_regs[index].used_in_src
1917                   * case.
1918                   */
1919                  args[i] = c->output_regs[index].reg;
1920                  args[i].dw1.bits.swizzle =
1921                     BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1922                                  GET_SWZ(src->Swizzle, 1),
1923                                  GET_SWZ(src->Swizzle, 2),
1924                                  GET_SWZ(src->Swizzle, 3));
1925
1926                  /* Note this is ok for non-swizzle ARB_vp instructions */
1927                  args[i].negate = src->Negate ? 1 : 0;
1928               } else
1929                   args[i] = get_arg(c, inst, i);
1930           }
1931
1932       /* Get dest regs.  Note that it is possible for a reg to be both
1933        * dst and arg, given the static allocation of registers.  So
1934        * care needs to be taken emitting multi-operation instructions.
1935        */
1936       index = inst->DstReg.Index;
1937       file = inst->DstReg.File;
1938       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1939          /* Can't just make get_dst "do the right thing" here because other
1940           * callers of get_dst don't expect any special behavior for the
1941           * c->output_regs[index].used_in_src case.
1942           */
1943          dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
1944       else
1945           dst = get_dst(c, inst->DstReg);
1946
1947       if (inst->SaturateMode != SATURATE_OFF) {
1948          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1949                        inst->SaturateMode);
1950       }
1951
1952       switch (inst->Opcode) {
1953       case OPCODE_ABS:
1954          args[0].negate = false;
1955          brw_MOV(p, dst, brw_abs(args[0]));
1956          break;
1957       case OPCODE_ADD:
1958          brw_ADD(p, dst, args[0], args[1]);
1959          break;
1960       case OPCODE_COS:
1961          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1962          break;
1963       case OPCODE_DP2:
1964          brw_DP2(p, dst, args[0], args[1]);
1965          break;
1966       case OPCODE_DP3:
1967          brw_DP3(p, dst, args[0], args[1]);
1968          break;
1969       case OPCODE_DP4:
1970          brw_DP4(p, dst, args[0], args[1]);
1971          break;
1972       case OPCODE_DPH:
1973          brw_DPH(p, dst, args[0], args[1]);
1974          break;
1975       case OPCODE_DST:
1976          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1977          break;
1978       case OPCODE_EXP:
1979          unalias1(c, dst, args[0], emit_exp_noalias);
1980          break;
1981       case OPCODE_EX2:
1982          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1983          break;
1984       case OPCODE_ARL:
1985          emit_arl(p, dst, args[0]);
1986          break;
1987       case OPCODE_FLR:
1988          brw_RNDD(p, dst, args[0]);
1989          break;
1990       case OPCODE_FRC:
1991          brw_FRC(p, dst, args[0]);
1992          break;
1993       case OPCODE_LOG:
1994          unalias1(c, dst, args[0], emit_log_noalias);
1995          break;
1996       case OPCODE_LG2:
1997          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1998          break;
1999       case OPCODE_LIT:
2000          unalias1(c, dst, args[0], emit_lit_noalias);
2001          break;
2002       case OPCODE_LRP:
2003          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
2004          break;
2005       case OPCODE_MAD:
2006          if (!accumulator_contains(c, args[2]))
2007             brw_MOV(p, brw_acc_reg(), args[2]);
2008          brw_MAC(p, dst, args[0], args[1]);
2009          break;
2010       case OPCODE_CMP:
2011          emit_cmp(p, dst, args[0], args[1], args[2]);
2012          break;
2013       case OPCODE_MAX:
2014          emit_max(p, dst, args[0], args[1]);
2015          break;
2016       case OPCODE_MIN:
2017          emit_min(p, dst, args[0], args[1]);
2018          break;
2019       case OPCODE_MOV:
2020          brw_MOV(p, dst, args[0]);
2021          break;
2022       case OPCODE_MUL:
2023          brw_MUL(p, dst, args[0], args[1]);
2024          break;
2025       case OPCODE_POW:
2026          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
2027          break;
2028       case OPCODE_RCP:
2029          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
2030          break;
2031       case OPCODE_RSQ:
2032          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
2033          break;
2034
2035       case OPCODE_SEQ:
2036          unalias2(c, dst, args[0], args[1], emit_seq);
2037          break;
2038       case OPCODE_SIN:
2039          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
2040          break;
2041       case OPCODE_SNE:
2042          unalias2(c, dst, args[0], args[1], emit_sne);
2043          break;
2044       case OPCODE_SGE:
2045          unalias2(c, dst, args[0], args[1], emit_sge);
2046          break;
2047       case OPCODE_SGT:
2048          unalias2(c, dst, args[0], args[1], emit_sgt);
2049          break;
2050       case OPCODE_SLT:
2051          unalias2(c, dst, args[0], args[1], emit_slt);
2052          break;
2053       case OPCODE_SLE:
2054          unalias2(c, dst, args[0], args[1], emit_sle);
2055          break;
2056       case OPCODE_SSG:
2057          unalias1(c, dst, args[0], emit_sign);
2058          break;
2059       case OPCODE_SUB:
2060          brw_ADD(p, dst, args[0], negate(args[1]));
2061          break;
2062       case OPCODE_SWZ:
2063          /* The args[0] value can't be used here as it won't have
2064           * correctly encoded the full swizzle:
2065           */
2066          emit_swz(c, dst, inst);
2067          break;
2068       case OPCODE_TRUNC:
2069          /* round toward zero */
2070          brw_RNDZ(p, dst, args[0]);
2071          break;
2072       case OPCODE_XPD:
2073          emit_xpd(p, dst, args[0], args[1]);
2074          break;
2075       case OPCODE_IF: {
2076          struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
2077          /* Note that brw_IF smashes the predicate_control field. */
2078          if_inst->header.predicate_control = get_predicate(inst);
2079          break;
2080       }
2081       case OPCODE_ELSE:
2082          clear_current_const(c);
2083          brw_ELSE(p);
2084          break;
2085       case OPCODE_ENDIF:
2086          clear_current_const(c);
2087          brw_ENDIF(p);
2088          break;
2089       case OPCODE_BGNLOOP:
2090          clear_current_const(c);
2091          brw_DO(p, BRW_EXECUTE_8);
2092          break;
2093       case OPCODE_BRK:
2094          brw_set_predicate_control(p, get_predicate(inst));
2095          brw_BREAK(p);
2096          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2097          break;
2098       case OPCODE_CONT:
2099          brw_set_predicate_control(p, get_predicate(inst));
2100          if (intel->gen >= 6) {
2101             gen6_CONT(p);
2102          } else {
2103             brw_CONT(p);
2104          }
2105          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2106          break;
2107
2108       case OPCODE_ENDLOOP:
2109          clear_current_const(c);
2110          brw_WHILE(p);
2111          break;
2112
2113       case OPCODE_BRA:
2114          brw_set_predicate_control(p, get_predicate(inst));
2115          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2116          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2117          break;
2118       case OPCODE_CAL:
2119          brw_set_access_mode(p, BRW_ALIGN_1);
2120          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2121          brw_set_access_mode(p, BRW_ALIGN_16);
2122          brw_ADD(p, get_addr_reg(stack_index),
2123                          get_addr_reg(stack_index), brw_imm_d(4));
2124          brw_save_call(p, inst->Comment, p->nr_insn);
2125          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2126          break;
2127       case OPCODE_RET:
2128          brw_ADD(p, get_addr_reg(stack_index),
2129                          get_addr_reg(stack_index), brw_imm_d(-4));
2130          brw_set_access_mode(p, BRW_ALIGN_1);
2131          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2132          brw_set_access_mode(p, BRW_ALIGN_16);
2133          break;
2134       case OPCODE_END:
2135          emit_vertex_write(c);
2136          break;
2137       case OPCODE_PRINT:
2138          /* no-op */
2139          break;
2140       case OPCODE_BGNSUB:
2141          brw_save_label(p, inst->Comment, p->nr_insn);
2142          break;
2143       case OPCODE_ENDSUB:
2144          /* no-op */
2145          break;
2146       default:
2147          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2148                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
2149                                     _mesa_opcode_string(inst->Opcode) :
2150                                     "unknown");
2151       }
2152
2153       /* Set the predication update on the last instruction of the native
2154        * instruction sequence.
2155        *
2156        * This would be problematic if it was set on a math instruction,
2157        * but that shouldn't be the case with the current GLSL compiler.
2158        */
2159       if (inst->CondUpdate) {
2160          struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2161
2162          assert(hw_insn->header.destreg__conditionalmod == 0);
2163          hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2164       }
2165
2166       if ((inst->DstReg.File == PROGRAM_OUTPUT)
2167           && (inst->DstReg.Index != VERT_RESULT_HPOS)
2168           && c->output_regs[inst->DstReg.Index].used_in_src) {
2169          brw_MOV(p, get_dst(c, inst->DstReg), dst);
2170       }
2171
2172       /* Result color clamping.
2173        *
2174        * When destination register is an output register and
2175        * it's primary/secondary front/back color, we have to clamp
2176        * the result to [0,1]. This is done by enabling the
2177        * saturation bit for the last instruction.
2178        *
2179        * We don't use brw_set_saturate() as it modifies
2180        * p->current->header.saturate, which affects all the subsequent
2181        * instructions. Instead, we directly modify the header
2182        * of the last (already stored) instruction.
2183        */
2184       if (inst->DstReg.File == PROGRAM_OUTPUT &&
2185           c->key.clamp_vertex_color) {
2186          if ((inst->DstReg.Index == VERT_RESULT_COL0)
2187              || (inst->DstReg.Index == VERT_RESULT_COL1)
2188              || (inst->DstReg.Index == VERT_RESULT_BFC0)
2189              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2190             p->store[p->nr_insn-1].header.saturate = 1;
2191          }
2192       }
2193
2194       if (inst->DstReg.RelAddr) {
2195          assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2196                 inst->DstReg.File == PROGRAM_OUTPUT);
2197          move_to_reladdr_dst(c, inst, dst);
2198       }
2199
2200       release_tmps(c);
2201    }
2202
2203    brw_resolve_cals(p);
2204    brw_set_uip_jip(p);
2205
2206    brw_optimize(p);
2207
2208    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2209       int i;
2210
2211       printf("vs-native:\n");
2212       for (i = 0; i < p->nr_insn; i++)
2213          brw_disasm(stdout, &p->store[i], intel->gen);
2214       printf("\n");
2215    }
2216 }