src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "program/program.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40 /* Return the SrcReg index of the channels that can be immediate float operands
  41  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  42  */
  43 static bool
  44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  45 {
  46    int opcode_array[] = {
  47       [OPCODE_MOV] = 1,
  48       [OPCODE_ADD] = 2,
  49       [OPCODE_CMP] = 3,
  50       [OPCODE_DP2] = 2,
  51       [OPCODE_DP3] = 2,
  52       [OPCODE_DP4] = 2,
  53       [OPCODE_DPH] = 2,
  54       [OPCODE_MAX] = 2,
  55       [OPCODE_MIN] = 2,
  56       [OPCODE_MUL] = 2,
  57       [OPCODE_SEQ] = 2,
  58       [OPCODE_SGE] = 2,
  59       [OPCODE_SGT] = 2,
  60       [OPCODE_SLE] = 2,
  61       [OPCODE_SLT] = 2,
  62       [OPCODE_SNE] = 2,
  63       [OPCODE_XPD] = 2,
  64    };
  65
  66    /* These opcodes get broken down in a way that allow two
  67     * args to be immediates.
  68     */
  69    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  70       if (arg == 1 || arg == 2)
  71          return true;
  72    }
  73
  74    if (opcode > ARRAY_SIZE(opcode_array))
  75       return false;
  76
  77    return arg == opcode_array[opcode] - 1;
  78 }
  79
  80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  81 {
  82    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  83
  84    if (++c->last_tmp > c->prog_data.total_grf)
  85       c->prog_data.total_grf = c->last_tmp;
  86
  87    return tmp;
  88 }
  89
  90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  91 {
  92    if (tmp.nr == c->last_tmp-1)
  93       c->last_tmp--;
  94 }
  95
  96 static void release_tmps( struct brw_vs_compile *c )
  97 {
  98    c->last_tmp = c->first_tmp;
  99 }
 100
 101 static int
 102 get_first_reladdr_output(struct gl_vertex_program *vp)
 103 {
 104    int i;
 105    int first_reladdr_output = VERT_RESULT_MAX;
 106
 107    for (i = 0; i < vp->Base.NumInstructions; i++) {
 108       struct prog_instruction *inst = vp->Base.Instructions + i;
 109
 110       if (inst->DstReg.File == PROGRAM_OUTPUT &&
 111           inst->DstReg.RelAddr &&
 112           inst->DstReg.Index < first_reladdr_output)
 113          first_reladdr_output = inst->DstReg.Index;
 114    }
 115
 116    return first_reladdr_output;
 117 }
 118
 119 /* Clears the record of which vp_const_buffer elements have been
 120  * loaded into our constant buffer registers, for the starts of new
 121  * blocks after control flow.
 122  */
 123 static void
 124 clear_current_const(struct brw_vs_compile *c)
 125 {
 126    unsigned int i;
 127
 128    if (c->vp->use_const_buffer) {
 129       for (i = 0; i < 3; i++) {
 130          c->current_const[i].index = -1;
 131       }
 132    }
 133 }
 134
 135 /* The message length for all SEND messages is restricted to [1,15].  This
 136  * includes 1 for the header, so anything in slots 14 and above needs to be
 137  * placed in a general-purpose register and emitted using a second URB write.
 138  */
 139 #define MAX_SLOTS_IN_FIRST_URB_WRITE 14
 140
 141 /**
 142  * Determine whether the given vertex output can be written directly to a MRF
 143  * or whether it has to be stored in a general-purpose register.
 144  */
 145 static inline bool can_use_direct_mrf(int vert_result,
 146                                       int first_reladdr_output, int slot)
 147 {
 148    if (vert_result == VERT_RESULT_HPOS || vert_result == VERT_RESULT_PSIZ) {
 149       /* These never go straight into MRF's.  They are placed in the MRF by
 150        * epilog code.
 151        */
 152       return false;
 153    }
 154    if (first_reladdr_output <= vert_result && vert_result < VERT_RESULT_MAX) {
 155       /* Relative addressing might be used to access this vert_result, so it
 156        * needs to go into a general-purpose register.
 157        */
 158       return false;
 159    }
 160    if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE) {
 161       /* This output won't go out until the second URB write so it must be
 162        * stored in a general-purpose register until then.
 163        */
 164       return false;
 165    }
 166    return true;
 167 }
 168
 169 /**
 170  * Preallocate GRF register before code emit.
 171  * Do things as simply as possible.  Allocate and populate all regs
 172  * ahead of time.
 173  */
 174 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 175 {
 176    struct intel_context *intel = &c->func.brw->intel;
 177    GLuint i, reg = 0, slot;
 178    int attributes_in_vue;
 179    int first_reladdr_output;
 180    int max_constant;
 181    int constant = 0;
 182    struct brw_vertex_program *vp = c->vp;
 183    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
 184
 185    /* Determine whether to use a real constant buffer or use a block
 186     * of GRF registers for constants.  The later is faster but only
 187     * works if everything fits in the GRF.
 188     * XXX this heuristic/check may need some fine tuning...
 189     */
 190    if (c->vp->program.Base.Parameters->NumParameters +
 191        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
 192       c->vp->use_const_buffer = true;
 193    else
 194       c->vp->use_const_buffer = false;
 195
 196    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 197
 198    /* r0 -- reserved as usual
 199     */
 200    c->r0 = brw_vec8_grf(reg, 0);
 201    reg++;
 202
 203    /* User clip planes from curbe:
 204     */
 205    if (c->key.userclip_active) {
 206       if (intel->gen >= 6) {
 207          for (i = 0; i <= c->key.nr_userclip_plane_consts; i++) {
 208             c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
 209                                                   (i % 2) * 4), 0, 4, 1);
 210          }
 211          reg += ALIGN(c->key.nr_userclip_plane_consts, 2) / 2;
 212       } else {
 213          for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
 214             c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
 215                                                   (i % 2) * 4), 0, 4, 1);
 216          }
 217          reg += (ALIGN(6 + c->key.nr_userclip_plane_consts, 4) / 4) * 2;
 218       }
 219
 220    }
 221
 222    /* Assign some (probably all) of the vertex program constants to
 223     * the push constant buffer/CURBE.
 224     *
 225     * There's an obvious limit to the numer of push constants equal to
 226     * the number of register available, and that number is smaller
 227     * than the minimum maximum number of vertex program parameters, so
 228     * support for pull constants is required if we overflow.
 229     * Additionally, on gen6 the number of push constants is even
 230     * lower.
 231     *
 232     * When there's relative addressing, we don't know what range of
 233     * Mesa IR registers can be accessed.  And generally, when relative
 234     * addressing is used we also have too many constants to load them
 235     * all as push constants.  So, we'll just support relative
 236     * addressing out of the pull constant buffers, and try to load as
 237     * many statically-accessed constants into the push constant buffer
 238     * as we can.
 239     */
 240    if (intel->gen >= 6) {
 241       /* We can only load 32 regs of push constants. */
 242       max_constant = 32 * 2 - c->key.nr_userclip_plane_consts;
 243    } else {
 244       max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
 245    }
 246
 247    /* constant_map maps from ParameterValues[] index to index in the
 248     * push constant buffer, or -1 if it's only in the pull constant
 249     * buffer.
 250     */
 251    memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
 252    for (i = 0;
 253         i < c->vp->program.Base.NumInstructions && constant < max_constant;
 254         i++) {
 255       struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
 256       int arg;
 257
 258       for (arg = 0; arg < 3 && constant < max_constant; arg++) {
 259          if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
 260              inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
 261              inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
 262              inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
 263              inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
 264             continue;
 265          }
 266
 267          if (inst->SrcReg[arg].RelAddr) {
 268             c->vp->use_const_buffer = true;
 269             continue;
 270          }
 271
 272          if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
 273             c->constant_map[inst->SrcReg[arg].Index] = constant++;
 274          }
 275       }
 276    }
 277
 278    /* If we ran out of push constant space, then we'll also upload all
 279     * constants through the pull constant buffer so that they can be
 280     * accessed no matter what.  For relative addressing (the common
 281     * case) we need them all in place anyway.
 282     */
 283    if (constant == max_constant)
 284       c->vp->use_const_buffer = true;
 285
 286    /* Set up the references to the pull parameters if present.  This backend
 287     * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
 288     * buffer, while the new VS backend allocates values to the pull buffer on
 289     * demand.
 290     */
 291    if (c->vp->use_const_buffer) {
 292       for (i = 0; i < params->NumParameters * 4; i++) {
 293          c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
 294       }
 295       c->prog_data.nr_pull_params = i;
 296    }
 297
 298    for (i = 0; i < constant; i++) {
 299       c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
 300                                                           (i % 2) * 4),
 301                                              0, 4, 1);
 302    }
 303    reg += (constant + 1) / 2;
 304    c->prog_data.curb_read_length = reg - 1;
 305    c->prog_data.nr_params = constant * 4;
 306    /* XXX 0 causes a bug elsewhere... */
 307    if (intel->gen < 6 && c->prog_data.nr_params == 0)
 308       c->prog_data.nr_params = 4;
 309
 310    /* Allocate input regs:
 311     */
 312    c->nr_inputs = 0;
 313    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 314       if (c->prog_data.inputs_read & BITFIELD64_BIT(i)) {
 315          c->nr_inputs++;
 316          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 317          reg++;
 318       }
 319    }
 320    /* If there are no inputs, we'll still be reading one attribute's worth
 321     * because it's required -- see urb_read_length setting.
 322     */
 323    if (c->nr_inputs == 0)
 324       reg++;
 325
 326    /* Allocate outputs.  The non-position outputs go straight into message regs.
 327     */
 328    brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
 329                        c->prog_data.outputs_written);
 330    c->first_output = reg;
 331
 332    first_reladdr_output = get_first_reladdr_output(&c->vp->program);
 333
 334    for (slot = 0; slot < c->vue_map.num_slots; slot++) {
 335       int vert_result = c->vue_map.slot_to_vert_result[slot];
 336       assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT]));
 337       if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
 338          c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
 339       } else {
 340          c->regs[PROGRAM_OUTPUT][vert_result] = brw_vec8_grf(reg, 0);
 341          reg++;
 342       }
 343    }
 344
 345    /* Allocate program temporaries:
 346     */
 347    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 348       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 349       reg++;
 350    }
 351
 352    /* Address reg(s).  Don't try to use the internal address reg until
 353     * deref time.
 354     */
 355    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 356       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 357                                              reg,
 358                                              0,
 359                                              BRW_REGISTER_TYPE_D,
 360                                              BRW_VERTICAL_STRIDE_8,
 361                                              BRW_WIDTH_8,
 362                                              BRW_HORIZONTAL_STRIDE_1,
 363                                              BRW_SWIZZLE_XXXX,
 364                                              WRITEMASK_X);
 365       reg++;
 366    }
 367
 368    if (c->vp->use_const_buffer) {
 369       for (i = 0; i < 3; i++) {
 370          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 371          reg++;
 372       }
 373       clear_current_const(c);
 374    }
 375
 376    for (i = 0; i < 128; i++) {
 377       if (c->output_regs[i].used_in_src) {
 378          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 379          reg++;
 380       }
 381    }
 382
 383    if (c->needs_stack) {
 384       c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 385       reg += 2;
 386    }
 387
 388    /* Some opcodes need an internal temporary:
 389     */
 390    c->first_tmp = reg;
 391    c->last_tmp = reg;           /* for allocation purposes */
 392
 393    /* Each input reg holds data from two vertices.  The
 394     * urb_read_length is the number of registers read from *each*
 395     * vertex urb, so is half the amount:
 396     */
 397    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 398    /* Setting this field to 0 leads to undefined behavior according to the
 399     * the VS_STATE docs.  Our VUEs will always have at least one attribute
 400     * sitting in them, even if it's padding.
 401     */
 402    if (c->prog_data.urb_read_length == 0)
 403       c->prog_data.urb_read_length = 1;
 404
 405    /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
 406     * them to fit the biggest thing they need to.
 407     */
 408    attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs);
 409
 410    if (intel->gen == 6) {
 411       /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
 412        * number of 128-byte (1024-bit) units.
 413        */
 414       c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 8) / 8;
 415    } else {
 416       /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
 417        * number of 64-byte (512-bit) units.
 418        */
 419       c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 4) / 4;
 420    }
 421
 422    c->prog_data.total_grf = reg;
 423
 424    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
 425       printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 426       printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 427       printf("%s reg = %d\n", __FUNCTION__, reg);
 428    }
 429 }
 430
 431
 432 /**
 433  * If an instruction uses a temp reg both as a src and the dest, we
 434  * sometimes need to allocate an intermediate temporary.
 435  */
 436 static void unalias1( struct brw_vs_compile *c,
 437                       struct brw_reg dst,
 438                       struct brw_reg arg0,
 439                       void (*func)( struct brw_vs_compile *,
 440                                     struct brw_reg,
 441                                     struct brw_reg ))
 442 {
 443    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 444       struct brw_compile *p = &c->func;
 445       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 446       func(c, tmp, arg0);
 447       brw_MOV(p, dst, tmp);
 448       release_tmp(c, tmp);
 449    }
 450    else {
 451       func(c, dst, arg0);
 452    }
 453 }
 454
 455 /**
 456  * \sa unalias2
 457  * Checkes if 2-operand instruction needs an intermediate temporary.
 458  */
 459 static void unalias2( struct brw_vs_compile *c,
 460                       struct brw_reg dst,
 461                       struct brw_reg arg0,
 462                       struct brw_reg arg1,
 463                       void (*func)( struct brw_vs_compile *,
 464                                     struct brw_reg,
 465                                     struct brw_reg,
 466                                     struct brw_reg ))
 467 {
 468    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 469        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 470       struct brw_compile *p = &c->func;
 471       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 472       func(c, tmp, arg0, arg1);
 473       brw_MOV(p, dst, tmp);
 474       release_tmp(c, tmp);
 475    }
 476    else {
 477       func(c, dst, arg0, arg1);
 478    }
 479 }
 480
 481 /**
 482  * \sa unalias2
 483  * Checkes if 3-operand instruction needs an intermediate temporary.
 484  */
 485 static void unalias3( struct brw_vs_compile *c,
 486                       struct brw_reg dst,
 487                       struct brw_reg arg0,
 488                       struct brw_reg arg1,
 489                       struct brw_reg arg2,
 490                       void (*func)( struct brw_vs_compile *,
 491                                     struct brw_reg,
 492                                     struct brw_reg,
 493                                     struct brw_reg,
 494                                     struct brw_reg ))
 495 {
 496    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 497        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 498        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 499       struct brw_compile *p = &c->func;
 500       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 501       func(c, tmp, arg0, arg1, arg2);
 502       brw_MOV(p, dst, tmp);
 503       release_tmp(c, tmp);
 504    }
 505    else {
 506       func(c, dst, arg0, arg1, arg2);
 507    }
 508 }
 509
 510 static void emit_sop( struct brw_vs_compile *c,
 511                       struct brw_reg dst,
 512                       struct brw_reg arg0,
 513                       struct brw_reg arg1,
 514                       GLuint cond)
 515 {
 516    struct brw_compile *p = &c->func;
 517
 518    brw_MOV(p, dst, brw_imm_f(0.0f));
 519    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 520    brw_MOV(p, dst, brw_imm_f(1.0f));
 521    brw_set_predicate_control_flag_value(p, 0xff);
 522 }
 523
 524 static void emit_seq( struct brw_vs_compile *c,
 525                       struct brw_reg dst,
 526                       struct brw_reg arg0,
 527                       struct brw_reg arg1 )
 528 {
 529    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 530 }
 531
 532 static void emit_sne( struct brw_vs_compile *c,
 533                       struct brw_reg dst,
 534                       struct brw_reg arg0,
 535                       struct brw_reg arg1 )
 536 {
 537    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 538 }
 539 static void emit_slt( struct brw_vs_compile *c,
 540                       struct brw_reg dst,
 541                       struct brw_reg arg0,
 542                       struct brw_reg arg1 )
 543 {
 544    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 545 }
 546
 547 static void emit_sle( struct brw_vs_compile *c,
 548                       struct brw_reg dst,
 549                       struct brw_reg arg0,
 550                       struct brw_reg arg1 )
 551 {
 552    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 553 }
 554
 555 static void emit_sgt( struct brw_vs_compile *c,
 556                       struct brw_reg dst,
 557                       struct brw_reg arg0,
 558                       struct brw_reg arg1 )
 559 {
 560    emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 561 }
 562
 563 static void emit_sge( struct brw_vs_compile *c,
 564                       struct brw_reg dst,
 565                       struct brw_reg arg0,
 566                       struct brw_reg arg1 )
 567 {
 568   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 569 }
 570
 571 static void emit_cmp( struct brw_compile *p,
 572                       struct brw_reg dst,
 573                       struct brw_reg arg0,
 574                       struct brw_reg arg1,
 575                       struct brw_reg arg2 )
 576 {
 577    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 578    brw_SEL(p, dst, arg1, arg2);
 579    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 580 }
 581
 582 static void emit_sign(struct brw_vs_compile *c,
 583                       struct brw_reg dst,
 584                       struct brw_reg arg0)
 585 {
 586    struct brw_compile *p = &c->func;
 587
 588    brw_MOV(p, dst, brw_imm_f(0));
 589
 590    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
 591    brw_MOV(p, dst, brw_imm_f(-1.0));
 592    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 593
 594    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
 595    brw_MOV(p, dst, brw_imm_f(1.0));
 596    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 597 }
 598
 599 static void emit_max( struct brw_compile *p,
 600                       struct brw_reg dst,
 601                       struct brw_reg arg0,
 602                       struct brw_reg arg1 )
 603 {
 604    struct intel_context *intel = &p->brw->intel;
 605
 606    if (intel->gen >= 6) {
 607       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
 608       brw_SEL(p, dst, arg0, arg1);
 609       brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 610       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 611    } else {
 612       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
 613       brw_SEL(p, dst, arg0, arg1);
 614       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 615    }
 616 }
 617
 618 static void emit_min( struct brw_compile *p,
 619                       struct brw_reg dst,
 620                       struct brw_reg arg0,
 621                       struct brw_reg arg1 )
 622 {
 623    struct intel_context *intel = &p->brw->intel;
 624
 625    if (intel->gen >= 6) {
 626       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 627       brw_SEL(p, dst, arg0, arg1);
 628       brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 629       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 630    } else {
 631       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 632       brw_SEL(p, dst, arg0, arg1);
 633       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 634    }
 635 }
 636
 637 static void emit_arl(struct brw_compile *p,
 638                      struct brw_reg dst,
 639                      struct brw_reg src)
 640 {
 641    struct intel_context *intel = &p->brw->intel;
 642
 643    if (intel->gen >= 6) {
 644       struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
 645
 646       brw_RNDD(p, dst_f, src);
 647       brw_MOV(p, dst, dst_f);
 648    } else {
 649       brw_RNDD(p, dst, src);
 650    }
 651 }
 652
 653 static void emit_math1_gen4(struct brw_vs_compile *c,
 654                             GLuint function,
 655                             struct brw_reg dst,
 656                             struct brw_reg arg0,
 657                             GLuint precision)
 658 {
 659    /* There are various odd behaviours with SEND on the simulator.  In
 660     * addition there are documented issues with the fact that the GEN4
 661     * processor doesn't do dependency control properly on SEND
 662     * results.  So, on balance, this kludge to get around failures
 663     * with writemasked math results looks like it might be necessary
 664     * whether that turns out to be a simulator bug or not:
 665     */
 666    struct brw_compile *p = &c->func;
 667    struct brw_reg tmp = dst;
 668    bool need_tmp = false;
 669
 670    if (dst.file != BRW_GENERAL_REGISTER_FILE ||
 671        dst.dw1.bits.writemask != 0xf)
 672       need_tmp = true;
 673
 674    if (need_tmp)
 675       tmp = get_tmp(c);
 676
 677    brw_math(p,
 678             tmp,
 679             function,
 680             BRW_MATH_SATURATE_NONE,
 681             2,
 682             arg0,
 683             BRW_MATH_DATA_SCALAR,
 684             precision);
 685
 686    if (need_tmp) {
 687       brw_MOV(p, dst, tmp);
 688       release_tmp(c, tmp);
 689    }
 690 }
 691
 692 static void
 693 emit_math1_gen6(struct brw_vs_compile *c,
 694                 GLuint function,
 695                 struct brw_reg dst,
 696                 struct brw_reg arg0,
 697                 GLuint precision)
 698 {
 699    struct brw_compile *p = &c->func;
 700    struct brw_reg tmp_src, tmp_dst;
 701
 702    /* Something is strange on gen6 math in 16-wide mode, though the
 703     * docs say it's supposed to work.  Punt to using align1 mode,
 704     * which doesn't do writemasking and swizzles.
 705     */
 706    tmp_src = get_tmp(c);
 707    tmp_dst = get_tmp(c);
 708
 709    brw_MOV(p, tmp_src, arg0);
 710
 711    brw_set_access_mode(p, BRW_ALIGN_1);
 712    brw_math(p,
 713             tmp_dst,
 714             function,
 715             BRW_MATH_SATURATE_NONE,
 716             2,
 717             tmp_src,
 718             BRW_MATH_DATA_SCALAR,
 719             precision);
 720    brw_set_access_mode(p, BRW_ALIGN_16);
 721
 722    brw_MOV(p, dst, tmp_dst);
 723
 724    release_tmp(c, tmp_src);
 725    release_tmp(c, tmp_dst);
 726 }
 727
 728 static void
 729 emit_math1(struct brw_vs_compile *c,
 730            GLuint function,
 731            struct brw_reg dst,
 732            struct brw_reg arg0,
 733            GLuint precision)
 734 {
 735    struct brw_compile *p = &c->func;
 736    struct intel_context *intel = &p->brw->intel;
 737
 738    if (intel->gen >= 6)
 739       emit_math1_gen6(c, function, dst, arg0, precision);
 740    else
 741       emit_math1_gen4(c, function, dst, arg0, precision);
 742 }
 743
 744 static void emit_math2_gen4( struct brw_vs_compile *c,
 745                         GLuint function,
 746                         struct brw_reg dst,
 747                         struct brw_reg arg0,
 748                         struct brw_reg arg1,
 749                         GLuint precision)
 750 {
 751    struct brw_compile *p = &c->func;
 752    struct brw_reg tmp = dst;
 753    bool need_tmp = false;
 754
 755    if (dst.file != BRW_GENERAL_REGISTER_FILE ||
 756        dst.dw1.bits.writemask != 0xf)
 757       need_tmp = true;
 758
 759    if (need_tmp)
 760       tmp = get_tmp(c);
 761
 762    brw_MOV(p, brw_message_reg(3), arg1);
 763
 764    brw_math(p,
 765             tmp,
 766             function,
 767             BRW_MATH_SATURATE_NONE,
 768             2,
 769             arg0,
 770             BRW_MATH_DATA_SCALAR,
 771             precision);
 772
 773    if (need_tmp) {
 774       brw_MOV(p, dst, tmp);
 775       release_tmp(c, tmp);
 776    }
 777 }
 778
 779 static void emit_math2_gen6( struct brw_vs_compile *c,
 780                         GLuint function,
 781                         struct brw_reg dst,
 782                         struct brw_reg arg0,
 783                         struct brw_reg arg1,
 784                         GLuint precision)
 785 {
 786    struct brw_compile *p = &c->func;
 787    struct brw_reg tmp_src0, tmp_src1, tmp_dst;
 788
 789    tmp_src0 = get_tmp(c);
 790    tmp_src1 = get_tmp(c);
 791    tmp_dst = get_tmp(c);
 792
 793    brw_MOV(p, tmp_src0, arg0);
 794    brw_MOV(p, tmp_src1, arg1);
 795
 796    brw_set_access_mode(p, BRW_ALIGN_1);
 797    brw_math2(p,
 798             tmp_dst,
 799             function,
 800             tmp_src0,
 801             tmp_src1);
 802    brw_set_access_mode(p, BRW_ALIGN_16);
 803
 804    brw_MOV(p, dst, tmp_dst);
 805
 806    release_tmp(c, tmp_src0);
 807    release_tmp(c, tmp_src1);
 808    release_tmp(c, tmp_dst);
 809 }
 810
 811 static void emit_math2( struct brw_vs_compile *c,
 812                         GLuint function,
 813                         struct brw_reg dst,
 814                         struct brw_reg arg0,
 815                         struct brw_reg arg1,
 816                         GLuint precision)
 817 {
 818    struct brw_compile *p = &c->func;
 819    struct intel_context *intel = &p->brw->intel;
 820
 821    if (intel->gen >= 6)
 822       emit_math2_gen6(c, function, dst, arg0, arg1, precision);
 823    else
 824       emit_math2_gen4(c, function, dst, arg0, arg1, precision);
 825 }
 826
 827 static void emit_exp_noalias( struct brw_vs_compile *c,
 828                               struct brw_reg dst,
 829                               struct brw_reg arg0 )
 830 {
 831    struct brw_compile *p = &c->func;
 832
 833
 834    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 835       struct brw_reg tmp = get_tmp(c);
 836       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 837
 838       /* tmp_d = floor(arg0.x) */
 839       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 840
 841       /* result[0] = 2.0 ^ tmp */
 842
 843       /* Adjust exponent for floating point:
 844        * exp += 127
 845        */
 846       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 847
 848       /* Install exponent and sign.
 849        * Excess drops off the edge:
 850        */
 851       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 852               tmp_d, brw_imm_d(23));
 853
 854       release_tmp(c, tmp);
 855    }
 856
 857    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 858       /* result[1] = arg0.x - floor(arg0.x) */
 859       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 860    }
 861
 862    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 863       /* As with the LOG instruction, we might be better off just
 864        * doing a taylor expansion here, seeing as we have to do all
 865        * the prep work.
 866        *
 867        * If mathbox partial precision is too low, consider also:
 868        * result[3] = result[0] * EXP(result[1])
 869        */
 870       emit_math1(c,
 871                  BRW_MATH_FUNCTION_EXP,
 872                  brw_writemask(dst, WRITEMASK_Z),
 873                  brw_swizzle1(arg0, 0),
 874                  BRW_MATH_PRECISION_FULL);
 875    }
 876
 877    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 878       /* result[3] = 1.0; */
 879       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 880    }
 881 }
 882
 883
 884 static void emit_log_noalias( struct brw_vs_compile *c,
 885                               struct brw_reg dst,
 886                               struct brw_reg arg0 )
 887 {
 888    struct brw_compile *p = &c->func;
 889    struct brw_reg tmp = dst;
 890    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 891    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 892    bool need_tmp = (dst.dw1.bits.writemask != 0xf ||
 893                          dst.file != BRW_GENERAL_REGISTER_FILE);
 894
 895    if (need_tmp) {
 896       tmp = get_tmp(c);
 897       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 898    }
 899
 900    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 901     * according to spec:
 902     *
 903     * These almost look likey they could be joined up, but not really
 904     * practical:
 905     *
 906     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 907     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 908     */
 909    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 910       brw_AND(p,
 911               brw_writemask(tmp_ud, WRITEMASK_X),
 912               brw_swizzle1(arg0_ud, 0),
 913               brw_imm_ud((1U<<31)-1));
 914
 915       brw_SHR(p,
 916               brw_writemask(tmp_ud, WRITEMASK_X),
 917               tmp_ud,
 918               brw_imm_ud(23));
 919
 920       brw_ADD(p,
 921               brw_writemask(tmp, WRITEMASK_X),
 922               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 923               brw_imm_d(-127));
 924    }
 925
 926    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 927       brw_AND(p,
 928               brw_writemask(tmp_ud, WRITEMASK_Y),
 929               brw_swizzle1(arg0_ud, 0),
 930               brw_imm_ud((1<<23)-1));
 931
 932       brw_OR(p,
 933              brw_writemask(tmp_ud, WRITEMASK_Y),
 934              tmp_ud,
 935              brw_imm_ud(127<<23));
 936    }
 937
 938    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 939       /* result[2] = result[0] + LOG2(result[1]); */
 940
 941       /* Why bother?  The above is just a hint how to do this with a
 942        * taylor series.  Maybe we *should* use a taylor series as by
 943        * the time all the above has been done it's almost certainly
 944        * quicker than calling the mathbox, even with low precision.
 945        *
 946        * Options are:
 947        *    - result[0] + mathbox.LOG2(result[1])
 948        *    - mathbox.LOG2(arg0.x)
 949        *    - result[0] + inline_taylor_approx(result[1])
 950        */
 951       emit_math1(c,
 952                  BRW_MATH_FUNCTION_LOG,
 953                  brw_writemask(tmp, WRITEMASK_Z),
 954                  brw_swizzle1(tmp, 1),
 955                  BRW_MATH_PRECISION_FULL);
 956
 957       brw_ADD(p,
 958               brw_writemask(tmp, WRITEMASK_Z),
 959               brw_swizzle1(tmp, 2),
 960               brw_swizzle1(tmp, 0));
 961    }
 962
 963    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 964       /* result[3] = 1.0; */
 965       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 966    }
 967
 968    if (need_tmp) {
 969       brw_MOV(p, dst, tmp);
 970       release_tmp(c, tmp);
 971    }
 972 }
 973
 974
 975 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 976  */
 977 static void emit_dst_noalias( struct brw_vs_compile *c,
 978                               struct brw_reg dst,
 979                               struct brw_reg arg0,
 980                               struct brw_reg arg1)
 981 {
 982    struct brw_compile *p = &c->func;
 983
 984    /* There must be a better way to do this:
 985     */
 986    if (dst.dw1.bits.writemask & WRITEMASK_X)
 987       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 988    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 989       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 990    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 991       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 992    if (dst.dw1.bits.writemask & WRITEMASK_W)
 993       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 994 }
 995
 996
 997 static void emit_xpd( struct brw_compile *p,
 998                       struct brw_reg dst,
 999                       struct brw_reg t,
1000                       struct brw_reg u)
1001 {
1002    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
1003    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
1004 }
1005
1006
1007 static void emit_lit_noalias( struct brw_vs_compile *c,
1008                               struct brw_reg dst,
1009                               struct brw_reg arg0 )
1010 {
1011    struct brw_compile *p = &c->func;
1012    struct brw_reg tmp = dst;
1013    bool need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1014
1015    if (need_tmp)
1016       tmp = get_tmp(c);
1017
1018    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
1019    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
1020
1021    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1022     * to get all channels active inside the IF.  In the clipping code
1023     * we run with NoMask, so it's not an option and we can use
1024     * BRW_EXECUTE_1 for all comparisions.
1025     */
1026    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
1027    brw_IF(p, BRW_EXECUTE_8);
1028    {
1029       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
1030
1031       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
1032       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
1033       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1034
1035       emit_math2(c,
1036                  BRW_MATH_FUNCTION_POW,
1037                  brw_writemask(dst, WRITEMASK_Z),
1038                  brw_swizzle1(tmp, 2),
1039                  brw_swizzle1(arg0, 3),
1040                  BRW_MATH_PRECISION_PARTIAL);
1041    }
1042    brw_ENDIF(p);
1043
1044    release_tmp(c, tmp);
1045 }
1046
1047 static void emit_lrp_noalias(struct brw_vs_compile *c,
1048                              struct brw_reg dst,
1049                              struct brw_reg arg0,
1050                              struct brw_reg arg1,
1051                              struct brw_reg arg2)
1052 {
1053    struct brw_compile *p = &c->func;
1054
1055    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
1056    brw_MUL(p, brw_null_reg(), dst, arg2);
1057    brw_MAC(p, dst, arg0, arg1);
1058 }
1059
1060 static struct brw_reg
1061 get_constant(struct brw_vs_compile *c,
1062              const struct prog_instruction *inst,
1063              GLuint argIndex)
1064 {
1065    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1066    struct brw_compile *p = &c->func;
1067    struct brw_reg const_reg = c->current_const[argIndex].reg;
1068
1069    assert(argIndex < 3);
1070
1071    if (c->current_const[argIndex].index != src->Index) {
1072       /* Keep track of the last constant loaded in this slot, for reuse. */
1073       c->current_const[argIndex].index = src->Index;
1074
1075 #if 0
1076       printf("  fetch const[%d] for arg %d into reg %d\n",
1077              src->Index, argIndex, c->current_const[argIndex].reg.nr);
1078 #endif
1079       /* need to fetch the constant now */
1080       brw_dp_READ_4_vs(p,
1081                        const_reg,                     /* writeback dest */
1082                        16 * src->Index,               /* byte offset */
1083                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
1084                        );
1085    }
1086
1087    /* replicate lower four floats into upper half (to get XYZWXYZW) */
1088    const_reg = stride(const_reg, 0, 4, 1);
1089    const_reg.subnr = 0;
1090
1091    return const_reg;
1092 }
1093
1094 static struct brw_reg
1095 get_reladdr_constant(struct brw_vs_compile *c,
1096                      const struct prog_instruction *inst,
1097                      GLuint argIndex)
1098 {
1099    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1100    struct brw_compile *p = &c->func;
1101    struct brw_context *brw = p->brw;
1102    struct intel_context *intel = &brw->intel;
1103    struct brw_reg const_reg = c->current_const[argIndex].reg;
1104    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1105    uint32_t offset;
1106
1107    assert(argIndex < 3);
1108
1109    /* Can't reuse a reladdr constant load. */
1110    c->current_const[argIndex].index = -1;
1111
1112  #if 0
1113    printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
1114           src->Index, argIndex, c->current_const[argIndex].reg.nr);
1115 #endif
1116
1117    if (intel->gen >= 6) {
1118       offset = src->Index;
1119    } else {
1120       struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
1121       brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
1122       addr_reg = byte_addr_reg;
1123       offset = 16 * src->Index;
1124    }
1125
1126    /* fetch the first vec4 */
1127    brw_dp_READ_4_vs_relative(p,
1128                              const_reg,
1129                              addr_reg,
1130                              offset,
1131                              SURF_INDEX_VERT_CONST_BUFFER);
1132
1133    return const_reg;
1134 }
1135
1136
1137
1138 /* TODO: relative addressing!
1139  */
1140 static struct brw_reg get_reg( struct brw_vs_compile *c,
1141                                gl_register_file file,
1142                                GLuint index )
1143 {
1144    switch (file) {
1145    case PROGRAM_TEMPORARY:
1146    case PROGRAM_INPUT:
1147    case PROGRAM_OUTPUT:
1148       assert(c->regs[file][index].nr != 0);
1149       return c->regs[file][index];
1150    case PROGRAM_STATE_VAR:
1151    case PROGRAM_CONSTANT:
1152    case PROGRAM_UNIFORM:
1153       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1154       return c->regs[PROGRAM_STATE_VAR][index];
1155    case PROGRAM_ADDRESS:
1156       assert(index == 0);
1157       return c->regs[file][index];
1158
1159    case PROGRAM_UNDEFINED:                      /* undef values */
1160       return brw_null_reg();
1161
1162    case PROGRAM_LOCAL_PARAM:
1163    case PROGRAM_ENV_PARAM:
1164    case PROGRAM_WRITE_ONLY:
1165    default:
1166       assert(0);
1167       return brw_null_reg();
1168    }
1169 }
1170
1171
1172 /**
1173  * Indirect addressing:  get reg[[arg] + offset].
1174  */
1175 static struct brw_reg deref( struct brw_vs_compile *c,
1176                              struct brw_reg arg,
1177                              GLint offset,
1178                              GLuint reg_size )
1179 {
1180    struct brw_compile *p = &c->func;
1181    struct brw_reg tmp = get_tmp(c);
1182    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1183    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1184    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1185    struct brw_reg indirect = brw_vec4_indirect(0,0);
1186    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1187
1188    /* Set the vertical stride on the register access so that the first
1189     * 4 components come from a0.0 and the second 4 from a0.1.
1190     */
1191    indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1192
1193    {
1194       brw_push_insn_state(p);
1195       brw_set_access_mode(p, BRW_ALIGN_1);
1196
1197       brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1198       brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1199
1200       brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1201       brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1202
1203       brw_MOV(p, tmp, indirect);
1204
1205       brw_pop_insn_state(p);
1206    }
1207
1208    /* NOTE: tmp not released */
1209    return tmp;
1210 }
1211
1212 static void
1213 move_to_reladdr_dst(struct brw_vs_compile *c,
1214                     const struct prog_instruction *inst,
1215                     struct brw_reg val)
1216 {
1217    struct brw_compile *p = &c->func;
1218    int reg_size = 32;
1219    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1220    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1221    struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
1222    GLuint byte_offset = base.nr * 32 + base.subnr;
1223    struct brw_reg indirect = brw_vec4_indirect(0,0);
1224    struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1225
1226    /* Because destination register indirect addressing can only use
1227     * one index, we'll write each vertex's vec4 value separately.
1228     */
1229    val.width = BRW_WIDTH_4;
1230    val.vstride = BRW_VERTICAL_STRIDE_4;
1231
1232    brw_push_insn_state(p);
1233    brw_set_access_mode(p, BRW_ALIGN_1);
1234
1235    brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1236    brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1237    brw_MOV(p, indirect, val);
1238
1239    brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1240    brw_ADD(p, brw_address_reg(0), acc,
1241            brw_imm_uw(byte_offset + reg_size / 2));
1242    brw_MOV(p, indirect, suboffset(val, 4));
1243
1244    brw_pop_insn_state(p);
1245 }
1246
1247 /**
1248  * Get brw reg corresponding to the instruction's [argIndex] src reg.
1249  * TODO: relative addressing!
1250  */
1251 static struct brw_reg
1252 get_src_reg( struct brw_vs_compile *c,
1253              const struct prog_instruction *inst,
1254              GLuint argIndex )
1255 {
1256    const GLuint file = inst->SrcReg[argIndex].File;
1257    const GLint index = inst->SrcReg[argIndex].Index;
1258    const bool relAddr = inst->SrcReg[argIndex].RelAddr;
1259
1260    if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1261       const struct prog_src_register *src = &inst->SrcReg[argIndex];
1262
1263       if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1264                                         SWIZZLE_ZERO,
1265                                         SWIZZLE_ZERO,
1266                                         SWIZZLE_ZERO)) {
1267           return brw_imm_f(0.0f);
1268       } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1269                                                SWIZZLE_ONE,
1270                                                SWIZZLE_ONE,
1271                                                SWIZZLE_ONE)) {
1272          if (src->Negate)
1273             return brw_imm_f(-1.0F);
1274          else
1275             return brw_imm_f(1.0F);
1276       } else if (src->File == PROGRAM_CONSTANT) {
1277          const struct gl_program_parameter_list *params;
1278          float f;
1279          int component = -1;
1280
1281          switch (src->Swizzle) {
1282          case SWIZZLE_XXXX:
1283             component = 0;
1284             break;
1285          case SWIZZLE_YYYY:
1286             component = 1;
1287             break;
1288          case SWIZZLE_ZZZZ:
1289             component = 2;
1290             break;
1291          case SWIZZLE_WWWW:
1292             component = 3;
1293             break;
1294          }
1295
1296          if (component >= 0) {
1297             params = c->vp->program.Base.Parameters;
1298             f = params->ParameterValues[src->Index][component].f;
1299
1300             if (src->Abs)
1301                f = fabs(f);
1302             if (src->Negate)
1303                f = -f;
1304             return brw_imm_f(f);
1305          }
1306       }
1307    }
1308
1309    switch (file) {
1310    case PROGRAM_TEMPORARY:
1311    case PROGRAM_INPUT:
1312    case PROGRAM_OUTPUT:
1313       if (relAddr) {
1314          return deref(c, c->regs[file][0], index, 32);
1315       }
1316       else {
1317          assert(c->regs[file][index].nr != 0);
1318          return c->regs[file][index];
1319       }
1320
1321    case PROGRAM_STATE_VAR:
1322    case PROGRAM_CONSTANT:
1323    case PROGRAM_UNIFORM:
1324    case PROGRAM_ENV_PARAM:
1325    case PROGRAM_LOCAL_PARAM:
1326       if (!relAddr && c->constant_map[index] != -1) {
1327          /* Take from the push constant buffer if possible. */
1328          assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1329          return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1330       } else {
1331          /* Must be in the pull constant buffer then .*/
1332          assert(c->vp->use_const_buffer);
1333          if (relAddr)
1334             return get_reladdr_constant(c, inst, argIndex);
1335          else
1336             return get_constant(c, inst, argIndex);
1337       }
1338    case PROGRAM_ADDRESS:
1339       assert(index == 0);
1340       return c->regs[file][index];
1341
1342    case PROGRAM_UNDEFINED:
1343       /* this is a normal case since we loop over all three src args */
1344       return brw_null_reg();
1345
1346    case PROGRAM_WRITE_ONLY:
1347    default:
1348       assert(0);
1349       return brw_null_reg();
1350    }
1351 }
1352
1353 /**
1354  * Return the brw reg for the given instruction's src argument.
1355  * Will return mangled results for SWZ op.  The emit_swz() function
1356  * ignores this result and recalculates taking extended swizzles into
1357  * account.
1358  */
1359 static struct brw_reg get_arg( struct brw_vs_compile *c,
1360                                const struct prog_instruction *inst,
1361                                GLuint argIndex )
1362 {
1363    const struct prog_src_register *src = &inst->SrcReg[argIndex];
1364    struct brw_reg reg;
1365
1366    if (src->File == PROGRAM_UNDEFINED)
1367       return brw_null_reg();
1368
1369    reg = get_src_reg(c, inst, argIndex);
1370
1371    /* Convert 3-bit swizzle to 2-bit.
1372     */
1373    if (reg.file != BRW_IMMEDIATE_VALUE) {
1374       reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1375                                           GET_SWZ(src->Swizzle, 1),
1376                                           GET_SWZ(src->Swizzle, 2),
1377                                           GET_SWZ(src->Swizzle, 3));
1378
1379       /* Note this is ok for non-swizzle ARB_vp instructions */
1380       reg.negate = src->Negate ? 1 : 0;
1381    }
1382
1383    return reg;
1384 }
1385
1386
1387 /**
1388  * Get brw register for the given program dest register.
1389  */
1390 static struct brw_reg get_dst( struct brw_vs_compile *c,
1391                                struct prog_dst_register dst )
1392 {
1393    struct brw_reg reg;
1394
1395    switch (dst.File) {
1396    case PROGRAM_TEMPORARY:
1397    case PROGRAM_OUTPUT:
1398       /* register-indirect addressing is only 1x1, not VxH, for
1399        * destination regs.  So, for RelAddr we'll return a temporary
1400        * for the dest and do a move of the result to the RelAddr
1401        * register after the instruction emit.
1402        */
1403       if (dst.RelAddr) {
1404          reg = get_tmp(c);
1405       } else {
1406          assert(c->regs[dst.File][dst.Index].nr != 0);
1407          reg = c->regs[dst.File][dst.Index];
1408       }
1409       break;
1410    case PROGRAM_ADDRESS:
1411       assert(dst.Index == 0);
1412       reg = c->regs[dst.File][dst.Index];
1413       break;
1414    case PROGRAM_UNDEFINED:
1415       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1416       reg = brw_null_reg();
1417       break;
1418    default:
1419       assert(0);
1420       reg = brw_null_reg();
1421    }
1422
1423    assert(reg.type != BRW_IMMEDIATE_VALUE);
1424    reg.dw1.bits.writemask = dst.WriteMask;
1425
1426    return reg;
1427 }
1428
1429
1430 static void emit_swz( struct brw_vs_compile *c,
1431                       struct brw_reg dst,
1432                       const struct prog_instruction *inst)
1433 {
1434    const GLuint argIndex = 0;
1435    const struct prog_src_register src = inst->SrcReg[argIndex];
1436    struct brw_compile *p = &c->func;
1437    GLuint zeros_mask = 0;
1438    GLuint ones_mask = 0;
1439    GLuint src_mask = 0;
1440    GLubyte src_swz[4];
1441    bool need_tmp = (src.Negate &&
1442                          dst.file != BRW_GENERAL_REGISTER_FILE);
1443    struct brw_reg tmp = dst;
1444    GLuint i;
1445
1446    if (need_tmp)
1447       tmp = get_tmp(c);
1448
1449    for (i = 0; i < 4; i++) {
1450       if (dst.dw1.bits.writemask & (1<<i)) {
1451          GLubyte s = GET_SWZ(src.Swizzle, i);
1452          switch (s) {
1453          case SWIZZLE_X:
1454          case SWIZZLE_Y:
1455          case SWIZZLE_Z:
1456          case SWIZZLE_W:
1457             src_mask |= 1<<i;
1458             src_swz[i] = s;
1459             break;
1460          case SWIZZLE_ZERO:
1461             zeros_mask |= 1<<i;
1462             break;
1463          case SWIZZLE_ONE:
1464             ones_mask |= 1<<i;
1465             break;
1466          }
1467       }
1468    }
1469
1470    /* Do src first, in case dst aliases src:
1471     */
1472    if (src_mask) {
1473       struct brw_reg arg0;
1474
1475       arg0 = get_src_reg(c, inst, argIndex);
1476
1477       arg0 = brw_swizzle(arg0,
1478                          src_swz[0], src_swz[1],
1479                          src_swz[2], src_swz[3]);
1480
1481       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1482    }
1483
1484    if (zeros_mask)
1485       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1486
1487    if (ones_mask)
1488       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1489
1490    if (src.Negate)
1491       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1492
1493    if (need_tmp) {
1494       brw_MOV(p, dst, tmp);
1495       release_tmp(c, tmp);
1496    }
1497 }
1498
1499 static int
1500 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1501 {
1502    struct intel_context *intel = &brw->intel;
1503
1504    if (intel->gen >= 6) {
1505       /* URB data written (does not include the message header reg) must
1506        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1507        * section 5.4.3.2.2: URB_INTERLEAVED.
1508        *
1509        * URB entries are allocated on a multiple of 1024 bits, so an
1510        * extra 128 bits written here to make the end align to 256 is
1511        * no problem.
1512        */
1513       if ((mlen % 2) != 1)
1514          mlen++;
1515    }
1516
1517    return mlen;
1518 }
1519
1520 /**
1521  * Post-vertex-program processing.  Send the results to the URB.
1522  */
1523 static void emit_vertex_write( struct brw_vs_compile *c)
1524 {
1525    struct brw_compile *p = &c->func;
1526    struct brw_context *brw = p->brw;
1527    struct intel_context *intel = &brw->intel;
1528    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1529    struct brw_reg ndc;
1530    int eot;
1531    GLuint len_vertex_header = 2;
1532    int i;
1533    int msg_len;
1534    int slot;
1535
1536    if (c->key.copy_edgeflag) {
1537       brw_MOV(p,
1538               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1539               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1540    }
1541
1542    if (intel->gen < 6) {
1543       /* Build ndc coords */
1544       ndc = get_tmp(c);
1545       /* ndc = 1.0 / pos.w */
1546       emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1547       /* ndc.xyz = pos * ndc */
1548       brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1549    }
1550
1551    /* Update the header for point size, user clipping flags, and -ve rhw
1552     * workaround.
1553     */
1554    if (intel->gen >= 6) {
1555       struct brw_reg m1 = brw_message_reg(1);
1556
1557       /* On gen6, m1 has each value in a separate dword, so we never
1558        * need to mess with a temporary for computing the m1 value.
1559        */
1560       brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1561       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1562          brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1563                  brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1564       }
1565
1566       /* Set the user clip distances in dword 8-15. (m3-4)*/
1567       if (c->key.userclip_active) {
1568          for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1569             struct brw_reg m;
1570             if (i < 4)
1571                m = brw_message_reg(3);
1572             else
1573                m = brw_message_reg(4);
1574
1575             brw_DP4(p, brw_writemask(m, (1 << (i & 3))),pos, c->userplane[i]);
1576          }
1577       }
1578    } else if ((c->prog_data.outputs_written &
1579                BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1580               c->key.userclip_active || brw->has_negative_rhw_bug) {
1581       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1582       GLuint i;
1583
1584       brw_MOV(p, header1, brw_imm_ud(0));
1585
1586       brw_set_access_mode(p, BRW_ALIGN_16);
1587
1588       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1589          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1590          brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1591                  brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1592          brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1593                  header1, brw_imm_ud(0x7ff<<8));
1594       }
1595
1596       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1597          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1598          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1599          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1600          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1601       }
1602
1603       /* i965 clipping workaround:
1604        * 1) Test for -ve rhw
1605        * 2) If set,
1606        *      set ndc = (0,0,0,0)
1607        *      set ucp[6] = 1
1608        *
1609        * Later, clipping will detect ucp[6] and ensure the primitive is
1610        * clipped against all fixed planes.
1611        */
1612       if (brw->has_negative_rhw_bug) {
1613          brw_CMP(p,
1614                  vec8(brw_null_reg()),
1615                  BRW_CONDITIONAL_L,
1616                  brw_swizzle1(ndc, 3),
1617                  brw_imm_f(0));
1618
1619          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1620          brw_MOV(p, ndc, brw_imm_f(0));
1621          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1622       }
1623
1624       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1625       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1626       brw_set_access_mode(p, BRW_ALIGN_16);
1627
1628       release_tmp(c, header1);
1629    }
1630    else {
1631       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1632    }
1633
1634    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1635     * of zeros followed by two sets of NDC coordinates:
1636     */
1637    brw_set_access_mode(p, BRW_ALIGN_1);
1638    brw_set_acc_write_control(p, 0);
1639
1640    /* The VUE layout is documented in Volume 2a. */
1641    if (intel->gen >= 6) {
1642       /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1643        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1644        * dword 4-7 (m2) is the 4D space position
1645        * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1646        * enabled.
1647        * m3 or 5 is the first vertex element data we fill, which is
1648        * the vertex position.
1649        */
1650       brw_MOV(p, brw_message_reg(2), pos);
1651       len_vertex_header = 1;
1652       if (c->key.userclip_active)
1653          len_vertex_header += 2;
1654    } else if (intel->gen == 5) {
1655       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1656        * dword 0-3 (m1) of the header is indices, point width, clip flags.
1657        * dword 4-7 (m2) is the ndc position (set above)
1658        * dword 8-11 (m3) of the vertex header is the 4D space position
1659        * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1660        * m6 is a pad so that the vertex element data is aligned
1661        * m7 is the first vertex data we fill, which is the vertex position.
1662        */
1663       brw_MOV(p, brw_message_reg(2), ndc);
1664       brw_MOV(p, brw_message_reg(3), pos);
1665       brw_MOV(p, brw_message_reg(7), pos);
1666       len_vertex_header = 6;
1667    } else {
1668       /* There are 8 dwords in VUE header pre-Ironlake:
1669        * dword 0-3 (m1) is indices, point width, clip flags.
1670        * dword 4-7 (m2) is ndc position (set above)
1671        *
1672        * dword 8-11 (m3) is the first vertex data, which we always have be the
1673        * vertex position.
1674        */
1675       brw_MOV(p, brw_message_reg(2), ndc);
1676       brw_MOV(p, brw_message_reg(3), pos);
1677       len_vertex_header = 2;
1678    }
1679
1680    /* Move variable-addressed, non-overflow outputs to their MRFs. */
1681    for (slot = len_vertex_header; slot < c->vue_map.num_slots; ++slot) {
1682       if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE)
1683          break;
1684
1685       int mrf = slot + 1;
1686       int vert_result = c->vue_map.slot_to_vert_result[slot];
1687       if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
1688           BRW_GENERAL_REGISTER_FILE) {
1689          brw_MOV(p, brw_message_reg(mrf),
1690                  c->regs[PROGRAM_OUTPUT][vert_result]);
1691       }
1692    }
1693
1694    eot = (slot >= c->vue_map.num_slots);
1695
1696    /* Message header, plus the (first part of the) VUE. */
1697    msg_len = 1 + slot;
1698    msg_len = align_interleaved_urb_mlen(brw, msg_len);
1699    /* Any outputs beyond BRW_MAX_MRF should be in the second URB write */
1700    assert (msg_len <= BRW_MAX_MRF - 1);
1701
1702    brw_urb_WRITE(p,
1703                  brw_null_reg(), /* dest */
1704                  0,             /* starting mrf reg nr */
1705                  c->r0,         /* src */
1706                  0,             /* allocate */
1707                  1,             /* used */
1708                  msg_len,
1709                  0,             /* response len */
1710                  eot,           /* eot */
1711                  eot,           /* writes complete */
1712                  0,             /* urb destination offset */
1713                  BRW_URB_SWIZZLE_INTERLEAVE);
1714
1715    if (slot < c->vue_map.num_slots) {
1716       /* Not all of the vertex outputs/results fit into the MRF.
1717        * Move the overflowed attributes from the GRF to the MRF and
1718        * issue another brw_urb_WRITE().
1719        */
1720       GLuint mrf = 1;
1721       for (; slot < c->vue_map.num_slots; ++slot) {
1722          int vert_result = c->vue_map.slot_to_vert_result[slot];
1723          /* move from GRF to MRF */
1724          brw_MOV(p, brw_message_reg(mrf),
1725                  c->regs[PROGRAM_OUTPUT][vert_result]);
1726          mrf++;
1727       }
1728
1729       brw_urb_WRITE(p,
1730                     brw_null_reg(), /* dest */
1731                     0,              /* starting mrf reg nr */
1732                     c->r0,          /* src */
1733                     0,              /* allocate */
1734                     1,              /* used */
1735                     align_interleaved_urb_mlen(brw, mrf),
1736                     0,              /* response len */
1737                     1,              /* eot */
1738                     1,              /* writes complete */
1739                     MAX_SLOTS_IN_FIRST_URB_WRITE / 2,  /* urb destination offset */
1740                     BRW_URB_SWIZZLE_INTERLEAVE);
1741    }
1742 }
1743
1744 static bool
1745 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1746 {
1747    struct brw_compile *p = &c->func;
1748    struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1749
1750    if (p->nr_insn == 0)
1751       return false;
1752
1753    if (val.address_mode != BRW_ADDRESS_DIRECT)
1754       return false;
1755
1756    if (val.negate || val.abs)
1757       return false;
1758
1759    switch (prev_insn->header.opcode) {
1760    case BRW_OPCODE_MOV:
1761    case BRW_OPCODE_MAC:
1762    case BRW_OPCODE_MUL:
1763       if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1764           prev_insn->header.execution_size == val.width &&
1765           prev_insn->bits1.da1.dest_reg_file == val.file &&
1766           prev_insn->bits1.da1.dest_reg_type == val.type &&
1767           prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1768           prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1769           prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1770           prev_insn->bits1.da16.dest_writemask == 0xf)
1771          return true;
1772       else
1773          return false;
1774    default:
1775       return false;
1776    }
1777 }
1778
1779 static uint32_t
1780 get_predicate(const struct prog_instruction *inst)
1781 {
1782    if (inst->DstReg.CondMask == COND_TR)
1783       return BRW_PREDICATE_NONE;
1784
1785    /* All of GLSL only produces predicates for COND_NE and one channel per
1786     * vector.  Fail badly if someone starts doing something else, as it might
1787     * mean infinite looping or something.
1788     *
1789     * We'd like to support all the condition codes, but our hardware doesn't
1790     * quite match the Mesa IR, which is modeled after the NV extensions.  For
1791     * those, the instruction may update the condition codes or not, then any
1792     * later instruction may use one of those condition codes.  For gen4, the
1793     * instruction may update the flags register based on one of the condition
1794     * codes output by the instruction, and then further instructions may
1795     * predicate on that.  We can probably support this, but it won't
1796     * necessarily be easy.
1797     */
1798    assert(inst->DstReg.CondMask == COND_NE);
1799
1800    switch (inst->DstReg.CondSwizzle) {
1801    case SWIZZLE_XXXX:
1802       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1803    case SWIZZLE_YYYY:
1804       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1805    case SWIZZLE_ZZZZ:
1806       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1807    case SWIZZLE_WWWW:
1808       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1809    default:
1810       _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1811                     inst->DstReg.CondMask);
1812       return BRW_PREDICATE_NORMAL;
1813    }
1814 }
1815
1816 static void
1817 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1818 {
1819    struct brw_compile *p = &c->func;
1820    int i;
1821
1822    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1823       if (!(c->prog_data.inputs_read & BITFIELD64_BIT(i)))
1824          continue;
1825
1826       if (c->key.gl_fixed_input_size[i] != 0) {
1827          struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1828
1829          brw_MUL(p,
1830                  brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1831                  reg, brw_imm_f(1.0 / 65536.0));
1832       }
1833    }
1834 }
1835
1836 /* Emit the vertex program instructions here.
1837  */
1838 void brw_old_vs_emit(struct brw_vs_compile *c )
1839 {
1840 #define MAX_IF_DEPTH 32
1841 #define MAX_LOOP_DEPTH 32
1842    struct brw_compile *p = &c->func;
1843    struct brw_context *brw = p->brw;
1844    struct intel_context *intel = &brw->intel;
1845    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1846    GLuint insn, loop_depth = 0;
1847    struct brw_instruction *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1848    int if_depth_in_loop[MAX_LOOP_DEPTH];
1849    const struct brw_indirect stack_index = brw_indirect(0, 0);
1850    GLuint index;
1851    GLuint file;
1852
1853    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1854       printf("vs-mesa:\n");
1855       _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1856                                true);
1857       printf("\n");
1858    }
1859
1860    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1861    brw_set_access_mode(p, BRW_ALIGN_16);
1862    if_depth_in_loop[loop_depth] = 0;
1863
1864    brw_set_acc_write_control(p, 1);
1865
1866    for (insn = 0; insn < nr_insns; insn++) {
1867        GLuint i;
1868        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1869
1870        /* Message registers can't be read, so copy the output into GRF
1871         * register if they are used in source registers
1872         */
1873        for (i = 0; i < 3; i++) {
1874            struct prog_src_register *src = &inst->SrcReg[i];
1875            GLuint index = src->Index;
1876            GLuint file = src->File;
1877            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1878                c->output_regs[index].used_in_src = true;
1879        }
1880
1881        switch (inst->Opcode) {
1882        case OPCODE_CAL:
1883        case OPCODE_RET:
1884           c->needs_stack = true;
1885           break;
1886        default:
1887           break;
1888        }
1889    }
1890
1891    /* Static register allocation
1892     */
1893    brw_vs_alloc_regs(c);
1894
1895    brw_vs_rescale_gl_fixed(c);
1896
1897    if (c->needs_stack)
1898       brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1899
1900    for (insn = 0; insn < nr_insns; insn++) {
1901
1902       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1903       struct brw_reg args[3], dst;
1904       GLuint i;
1905
1906 #if 0
1907       printf("%d: ", insn);
1908       _mesa_print_instruction(inst);
1909 #endif
1910
1911       /* Get argument regs.  SWZ is special and does this itself.
1912        */
1913       if (inst->Opcode != OPCODE_SWZ)
1914           for (i = 0; i < 3; i++) {
1915               const struct prog_src_register *src = &inst->SrcReg[i];
1916               index = src->Index;
1917               file = src->File;
1918               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
1919                  /* Can't just make get_arg "do the right thing" here because
1920                   * other callers of get_arg and get_src_reg don't expect any
1921                   * special behavior for the c->output_regs[index].used_in_src
1922                   * case.
1923                   */
1924                  args[i] = c->output_regs[index].reg;
1925                  args[i].dw1.bits.swizzle =
1926                     BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1927                                  GET_SWZ(src->Swizzle, 1),
1928                                  GET_SWZ(src->Swizzle, 2),
1929                                  GET_SWZ(src->Swizzle, 3));
1930
1931                  /* Note this is ok for non-swizzle ARB_vp instructions */
1932                  args[i].negate = src->Negate ? 1 : 0;
1933               } else
1934                   args[i] = get_arg(c, inst, i);
1935           }
1936
1937       /* Get dest regs.  Note that it is possible for a reg to be both
1938        * dst and arg, given the static allocation of registers.  So
1939        * care needs to be taken emitting multi-operation instructions.
1940        */
1941       index = inst->DstReg.Index;
1942       file = inst->DstReg.File;
1943       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1944          /* Can't just make get_dst "do the right thing" here because other
1945           * callers of get_dst don't expect any special behavior for the
1946           * c->output_regs[index].used_in_src case.
1947           */
1948          dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
1949       else
1950           dst = get_dst(c, inst->DstReg);
1951
1952       if (inst->SaturateMode != SATURATE_OFF) {
1953          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1954                        inst->SaturateMode);
1955       }
1956
1957       switch (inst->Opcode) {
1958       case OPCODE_ABS:
1959          args[0].negate = false;
1960          brw_MOV(p, dst, brw_abs(args[0]));
1961          break;
1962       case OPCODE_ADD:
1963          brw_ADD(p, dst, args[0], args[1]);
1964          break;
1965       case OPCODE_COS:
1966          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1967          break;
1968       case OPCODE_DP2:
1969          brw_DP2(p, dst, args[0], args[1]);
1970          break;
1971       case OPCODE_DP3:
1972          brw_DP3(p, dst, args[0], args[1]);
1973          break;
1974       case OPCODE_DP4:
1975          brw_DP4(p, dst, args[0], args[1]);
1976          break;
1977       case OPCODE_DPH:
1978          brw_DPH(p, dst, args[0], args[1]);
1979          break;
1980       case OPCODE_DST:
1981          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1982          break;
1983       case OPCODE_EXP:
1984          unalias1(c, dst, args[0], emit_exp_noalias);
1985          break;
1986       case OPCODE_EX2:
1987          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1988          break;
1989       case OPCODE_ARL:
1990          emit_arl(p, dst, args[0]);
1991          break;
1992       case OPCODE_FLR:
1993          brw_RNDD(p, dst, args[0]);
1994          break;
1995       case OPCODE_FRC:
1996          brw_FRC(p, dst, args[0]);
1997          break;
1998       case OPCODE_LOG:
1999          unalias1(c, dst, args[0], emit_log_noalias);
2000          break;
2001       case OPCODE_LG2:
2002          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
2003          break;
2004       case OPCODE_LIT:
2005          unalias1(c, dst, args[0], emit_lit_noalias);
2006          break;
2007       case OPCODE_LRP:
2008          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
2009          break;
2010       case OPCODE_MAD:
2011          if (!accumulator_contains(c, args[2]))
2012             brw_MOV(p, brw_acc_reg(), args[2]);
2013          brw_MAC(p, dst, args[0], args[1]);
2014          break;
2015       case OPCODE_CMP:
2016          emit_cmp(p, dst, args[0], args[1], args[2]);
2017          break;
2018       case OPCODE_MAX:
2019          emit_max(p, dst, args[0], args[1]);
2020          break;
2021       case OPCODE_MIN:
2022          emit_min(p, dst, args[0], args[1]);
2023          break;
2024       case OPCODE_MOV:
2025          brw_MOV(p, dst, args[0]);
2026          break;
2027       case OPCODE_MUL:
2028          brw_MUL(p, dst, args[0], args[1]);
2029          break;
2030       case OPCODE_POW:
2031          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
2032          break;
2033       case OPCODE_RCP:
2034          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
2035          break;
2036       case OPCODE_RSQ:
2037          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
2038          break;
2039
2040       case OPCODE_SEQ:
2041          unalias2(c, dst, args[0], args[1], emit_seq);
2042          break;
2043       case OPCODE_SIN:
2044          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
2045          break;
2046       case OPCODE_SNE:
2047          unalias2(c, dst, args[0], args[1], emit_sne);
2048          break;
2049       case OPCODE_SGE:
2050          unalias2(c, dst, args[0], args[1], emit_sge);
2051          break;
2052       case OPCODE_SGT:
2053          unalias2(c, dst, args[0], args[1], emit_sgt);
2054          break;
2055       case OPCODE_SLT:
2056          unalias2(c, dst, args[0], args[1], emit_slt);
2057          break;
2058       case OPCODE_SLE:
2059          unalias2(c, dst, args[0], args[1], emit_sle);
2060          break;
2061       case OPCODE_SSG:
2062          unalias1(c, dst, args[0], emit_sign);
2063          break;
2064       case OPCODE_SUB:
2065          brw_ADD(p, dst, args[0], negate(args[1]));
2066          break;
2067       case OPCODE_SWZ:
2068          /* The args[0] value can't be used here as it won't have
2069           * correctly encoded the full swizzle:
2070           */
2071          emit_swz(c, dst, inst);
2072          break;
2073       case OPCODE_TRUNC:
2074          /* round toward zero */
2075          brw_RNDZ(p, dst, args[0]);
2076          break;
2077       case OPCODE_XPD:
2078          emit_xpd(p, dst, args[0], args[1]);
2079          break;
2080       case OPCODE_IF: {
2081          struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
2082          /* Note that brw_IF smashes the predicate_control field. */
2083          if_inst->header.predicate_control = get_predicate(inst);
2084          if_depth_in_loop[loop_depth]++;
2085          break;
2086       }
2087       case OPCODE_ELSE:
2088          clear_current_const(c);
2089          brw_ELSE(p);
2090          break;
2091       case OPCODE_ENDIF:
2092          clear_current_const(c);
2093          brw_ENDIF(p);
2094          if_depth_in_loop[loop_depth]--;
2095          break;
2096       case OPCODE_BGNLOOP:
2097          clear_current_const(c);
2098          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2099          if_depth_in_loop[loop_depth] = 0;
2100          break;
2101       case OPCODE_BRK:
2102          brw_set_predicate_control(p, get_predicate(inst));
2103          brw_BREAK(p, if_depth_in_loop[loop_depth]);
2104          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2105          break;
2106       case OPCODE_CONT:
2107          brw_set_predicate_control(p, get_predicate(inst));
2108          if (intel->gen >= 6) {
2109             gen6_CONT(p, loop_inst[loop_depth - 1]);
2110          } else {
2111             brw_CONT(p, if_depth_in_loop[loop_depth]);
2112          }
2113          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2114          break;
2115
2116       case OPCODE_ENDLOOP: {
2117          clear_current_const(c);
2118          struct brw_instruction *inst0, *inst1;
2119          GLuint br = 1;
2120
2121          loop_depth--;
2122
2123          if (intel->gen == 5)
2124             br = 2;
2125
2126          inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2127
2128          if (intel->gen < 6) {
2129             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2130             while (inst0 > loop_inst[loop_depth]) {
2131                inst0--;
2132                if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2133                    inst0->bits3.if_else.jump_count == 0) {
2134                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2135                } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2136                           inst0->bits3.if_else.jump_count == 0) {
2137                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2138                }
2139             }
2140          }
2141       }
2142          break;
2143
2144       case OPCODE_BRA:
2145          brw_set_predicate_control(p, get_predicate(inst));
2146          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2147          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2148          break;
2149       case OPCODE_CAL:
2150          brw_set_access_mode(p, BRW_ALIGN_1);
2151          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2152          brw_set_access_mode(p, BRW_ALIGN_16);
2153          brw_ADD(p, get_addr_reg(stack_index),
2154                          get_addr_reg(stack_index), brw_imm_d(4));
2155          brw_save_call(p, inst->Comment, p->nr_insn);
2156          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2157          break;
2158       case OPCODE_RET:
2159          brw_ADD(p, get_addr_reg(stack_index),
2160                          get_addr_reg(stack_index), brw_imm_d(-4));
2161          brw_set_access_mode(p, BRW_ALIGN_1);
2162          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2163          brw_set_access_mode(p, BRW_ALIGN_16);
2164          break;
2165       case OPCODE_END:
2166          emit_vertex_write(c);
2167          break;
2168       case OPCODE_PRINT:
2169          /* no-op */
2170          break;
2171       case OPCODE_BGNSUB:
2172          brw_save_label(p, inst->Comment, p->nr_insn);
2173          break;
2174       case OPCODE_ENDSUB:
2175          /* no-op */
2176          break;
2177       default:
2178          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2179                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
2180                                     _mesa_opcode_string(inst->Opcode) :
2181                                     "unknown");
2182       }
2183
2184       /* Set the predication update on the last instruction of the native
2185        * instruction sequence.
2186        *
2187        * This would be problematic if it was set on a math instruction,
2188        * but that shouldn't be the case with the current GLSL compiler.
2189        */
2190       if (inst->CondUpdate) {
2191          struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2192
2193          assert(hw_insn->header.destreg__conditionalmod == 0);
2194          hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2195       }
2196
2197       if ((inst->DstReg.File == PROGRAM_OUTPUT)
2198           && (inst->DstReg.Index != VERT_RESULT_HPOS)
2199           && c->output_regs[inst->DstReg.Index].used_in_src) {
2200          brw_MOV(p, get_dst(c, inst->DstReg), dst);
2201       }
2202
2203       /* Result color clamping.
2204        *
2205        * When destination register is an output register and
2206        * it's primary/secondary front/back color, we have to clamp
2207        * the result to [0,1]. This is done by enabling the
2208        * saturation bit for the last instruction.
2209        *
2210        * We don't use brw_set_saturate() as it modifies
2211        * p->current->header.saturate, which affects all the subsequent
2212        * instructions. Instead, we directly modify the header
2213        * of the last (already stored) instruction.
2214        */
2215       if (inst->DstReg.File == PROGRAM_OUTPUT &&
2216           c->key.clamp_vertex_color) {
2217          if ((inst->DstReg.Index == VERT_RESULT_COL0)
2218              || (inst->DstReg.Index == VERT_RESULT_COL1)
2219              || (inst->DstReg.Index == VERT_RESULT_BFC0)
2220              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2221             p->store[p->nr_insn-1].header.saturate = 1;
2222          }
2223       }
2224
2225       if (inst->DstReg.RelAddr) {
2226          assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2227                 inst->DstReg.File == PROGRAM_OUTPUT);
2228          move_to_reladdr_dst(c, inst, dst);
2229       }
2230
2231       release_tmps(c);
2232    }
2233
2234    brw_resolve_cals(p);
2235    brw_set_uip_jip(p);
2236
2237    brw_optimize(p);
2238
2239    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2240       int i;
2241
2242       printf("vs-native:\n");
2243       for (i = 0; i < p->nr_insn; i++)
2244          brw_disasm(stdout, &p->store[i], intel->gen);
2245       printf("\n");
2246    }
2247 }