glsl: move variables in to ir_variable::data, part I
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
132 src0, src1, src2); \
133 }
134
135 ALU1(NOT)
136 ALU1(MOV)
137 ALU1(FRC)
138 ALU1(RNDD)
139 ALU1(RNDE)
140 ALU1(RNDZ)
141 ALU1(F32TO16)
142 ALU1(F16TO32)
143 ALU2(ADD)
144 ALU2(MUL)
145 ALU2(MACH)
146 ALU2(AND)
147 ALU2(OR)
148 ALU2(XOR)
149 ALU2(DP3)
150 ALU2(DP4)
151 ALU2(DPH)
152 ALU2(SHL)
153 ALU2(SHR)
154 ALU2(ASR)
155 ALU3(LRP)
156 ALU1(BFREV)
157 ALU3(BFE)
158 ALU2(BFI1)
159 ALU3(BFI2)
160 ALU1(FBH)
161 ALU1(FBL)
162 ALU1(CBIT)
163 ALU3(MAD)
164 ALU2(ADDC)
165 ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 vec4_instruction *
203 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
204 {
205 vec4_instruction *inst;
206
207 /* The original gen4 does type conversion to the destination type
208 * before comparison, producing garbage results for floating
209 * point comparisons.
210 */
211 if (brw->gen == 4) {
212 dst.type = src0.type;
213 if (dst.file == HW_REG)
214 dst.fixed_hw_reg.type = dst.type;
215 }
216
217 resolve_ud_negate(&src0);
218 resolve_ud_negate(&src1);
219
220 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
221 inst->conditional_mod = condition;
222
223 return inst;
224 }
225
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
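/* Emit a dot product: DP2, DP3 or DP4 depending on the number of
 * components (2-4) being dotted.
 */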
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
411 void
412 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
413 {
414 if (brw->gen < 7)
415 assert(!"ir_unop_pack_half_2x16 should be lowered");
416
417 assert(dst.type == BRW_REGISTER_TYPE_UD);
418 assert(src0.type == BRW_REGISTER_TYPE_F);
419
420 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
421 *
422 * Because this instruction does not have a 16-bit floating-point type,
423 * the destination data type must be Word (W).
424 *
425 * The destination must be DWord-aligned and specify a horizontal stride
426 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
427 * each destination channel and the upper word is not modified.
428 *
429 * The above restriction implies that the f32to16 instruction must use
430 * align1 mode, because only in align1 mode is it possible to specify
431 * horizontal stride. We choose here to defy the hardware docs and emit
432 * align16 instructions.
433 *
434 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
435 * instructions. I was partially successful in that the code passed all
436 * tests. However, the code was dubiously correct and fragile, and the
437 * tests were not harsh enough to probe that frailty. Not trusting the
438 * code, I chose instead to remain in align16 mode in defiance of the hw
439 * docs).
440 *
441 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
442 * simulator, emitting a f32to16 in align16 mode with UD as destination
443 * data type is safe. The behavior differs from that specified in the PRM
444 * in that the upper word of each destination channel is cleared to 0.
445 */
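/* As an illustrative example (not from the PRM): packHalf2x16(vec2(1.0, -2.0))
 * yields 0xC0003C00, since 1.0 and -2.0 are 0x3C00 and 0xC000 in half-float
 * and the y component is packed into the upper word.
 */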
446
447 dst_reg tmp_dst(this, glsl_type::uvec2_type);
448 src_reg tmp_src(tmp_dst);
449
450 #if 0
451 /* Verify the undocumented behavior on which the following instructions
452 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
453 * then the result of the bit-or instruction below will be incorrect.
454 *
455 * You should inspect the disasm output in order to verify that the MOV is
456 * not optimized away.
457 */
458 emit(MOV(tmp_dst, src_reg(0x12345678u)));
459 #endif
460
461 /* Give tmp the form below, where "." means untouched.
462 *
463 * w z y x w z y x
464 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
465 *
466 * That the upper word of each write-channel be 0 is required for the
467 * following bit-shift and bit-or instructions to work. Note that this
468 * relies on the undocumented hardware behavior mentioned above.
469 */
470 tmp_dst.writemask = WRITEMASK_XY;
471 emit(F32TO16(tmp_dst, src0));
472
473 /* Give the write-channels of dst the form:
474 * 0xhhhh0000
475 */
476 tmp_src.swizzle = SWIZZLE_Y;
477 emit(SHL(dst, tmp_src, src_reg(16u)));
478
479 /* Finally, give the write-channels of dst the form of packHalf2x16's
480 * output:
481 * 0xhhhhllll
482 */
483 tmp_src.swizzle = SWIZZLE_X;
484 emit(OR(dst, src_reg(dst), tmp_src));
485 }
486
487 void
488 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
489 {
490 if (brw->gen < 7)
491 assert(!"ir_unop_unpack_half_2x16 should be lowered");
492
493 assert(dst.type == BRW_REGISTER_TYPE_F);
494 assert(src0.type == BRW_REGISTER_TYPE_UD);
495
496 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
497 *
498 * Because this instruction does not have a 16-bit floating-point type,
499 * the source data type must be Word (W). The destination type must be
500 * F (Float).
501 *
502 * To use W as the source data type, we must adjust horizontal strides,
503 * which is only possible in align1 mode. All my [chadv] attempts at
504 * emitting align1 instructions for unpackHalf2x16 failed to pass the
505 * Piglit tests, so I gave up.
506 *
507 * I've verified that, on gen7 hardware and the simulator, it is safe to
508 * emit f16to32 in align16 mode with UD as source data type.
509 */
510
511 dst_reg tmp_dst(this, glsl_type::uvec2_type);
512 src_reg tmp_src(tmp_dst);
513
514 tmp_dst.writemask = WRITEMASK_X;
515 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
516
517 tmp_dst.writemask = WRITEMASK_Y;
518 emit(SHR(tmp_dst, src0, src_reg(16u)));
519
520 dst.writemask = WRITEMASK_XY;
521 emit(F16TO32(dst, tmp_src));
522 }
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
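/* Return the size of the given GLSL type, measured in vec4 slots (one
 * register per slot in the vec4 backend).
 */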
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of the size of the vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_VOID:
574 case GLSL_TYPE_ERROR:
575 case GLSL_TYPE_INTERFACE:
576 assert(0);
577 break;
578 }
579
580 return 0;
581 }
582
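/* Allocate a virtual GRF that is 'size' vec4 registers long, growing the
 * bookkeeping arrays if necessary, and return its index.
 */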
583 int
584 vec4_visitor::virtual_grf_alloc(int size)
585 {
586 if (virtual_grf_array_size <= virtual_grf_count) {
587 if (virtual_grf_array_size == 0)
588 virtual_grf_array_size = 16;
589 else
590 virtual_grf_array_size *= 2;
591 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
592 virtual_grf_array_size);
593 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
594 virtual_grf_array_size);
595 }
596 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
597 virtual_grf_reg_count += size;
598 virtual_grf_sizes[virtual_grf_count] = size;
599 return virtual_grf_count++;
600 }
601
602 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
603 {
604 init();
605
606 this->file = GRF;
607 this->reg = v->virtual_grf_alloc(type_size(type));
608
609 if (type->is_array() || type->is_record()) {
610 this->swizzle = BRW_SWIZZLE_NOOP;
611 } else {
612 this->swizzle = swizzle_for_size(type->vector_elements);
613 }
614
615 this->type = brw_type_for_base_type(type);
616 }
617
618 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
619 {
620 init();
621
622 this->file = GRF;
623 this->reg = v->virtual_grf_alloc(type_size(type));
624
625 if (type->is_array() || type->is_record()) {
626 this->writemask = WRITEMASK_XYZW;
627 } else {
628 this->writemask = (1 << type->vector_elements) - 1;
629 }
630
631 this->type = brw_type_for_base_type(type);
632 }
633
634 /* Our support for uniforms is piggy-backed on the struct
635 * gl_program, because that's where the values actually
636 * get stored, rather than in some global gl_shader_program uniform
637 * store.
638 */
639 void
640 vec4_visitor::setup_uniform_values(ir_variable *ir)
641 {
642 int namelen = strlen(ir->name);
643
644 /* The data for our (non-builtin) uniforms is stored in a series of
645 * gl_uniform_driver_storage structs for each subcomponent that
646 * glGetUniformLocation() could name. We know it's been set up in the same
647 * order we'd walk the type, so walk the list of storage and find anything
648 * with our name, or the prefix of a component that starts with our name.
649 */
650 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
651 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
652
653 if (strncmp(ir->name, storage->name, namelen) != 0 ||
654 (storage->name[namelen] != 0 &&
655 storage->name[namelen] != '.' &&
656 storage->name[namelen] != '[')) {
657 continue;
658 }
659
660 gl_constant_value *components = storage->storage;
661 unsigned vector_count = (MAX2(storage->array_elements, 1) *
662 storage->type->matrix_columns);
663
664 for (unsigned s = 0; s < vector_count; s++) {
665 uniform_vector_size[uniforms] = storage->type->vector_elements;
666
667 int i;
668 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
669 prog_data->param[uniforms * 4 + i] = &components->f;
670 components++;
671 }
672 for (; i < 4; i++) {
673 static float zero = 0;
674 prog_data->param[uniforms * 4 + i] = &zero;
675 }
676
677 uniforms++;
678 }
679 }
680 }
681
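/* Allocate one uniform vec4 per enabled user clip plane and point its
 * prog_data->param entries at the corresponding clip-plane components.
 */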
682 void
683 vec4_visitor::setup_uniform_clipplane_values()
684 {
685 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
686
687 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
688 this->uniform_vector_size[this->uniforms] = 4;
689 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
690 this->userplane[i].type = BRW_REGISTER_TYPE_F;
691 for (int j = 0; j < 4; ++j) {
692 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
693 }
694 ++this->uniforms;
695 }
696 }
697
698 /* Our support for builtin uniforms is even scarier than non-builtin.
699 * It sits on top of the PROG_STATE_VAR parameters that are
700 * automatically updated from GL context state.
701 */
702 void
703 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
704 {
705 const ir_state_slot *const slots = ir->state_slots;
706 assert(ir->state_slots != NULL);
707
708 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
709 /* This state reference has already been setup by ir_to_mesa,
710 * but we'll get the same index back here. We can reference
711 * ParameterValues directly, since unlike brw_fs.cpp, we never
712 * add new state references during compile.
713 */
714 int index = _mesa_add_state_reference(this->prog->Parameters,
715 (gl_state_index *)slots[i].tokens);
716 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
717
718 this->uniform_vector_size[this->uniforms] = 0;
719 /* Add each of the unique swizzled channels of the element.
720 * This will end up matching the size of the glsl_type of this field.
721 */
722 int last_swiz = -1;
723 for (unsigned int j = 0; j < 4; j++) {
724 int swiz = GET_SWZ(slots[i].swizzle, j);
725 last_swiz = swiz;
726
727 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
728 if (swiz <= last_swiz)
729 this->uniform_vector_size[this->uniforms]++;
730 }
731 this->uniforms++;
732 }
733 }
734
735 dst_reg *
736 vec4_visitor::variable_storage(ir_variable *var)
737 {
738 return (dst_reg *)hash_table_find(this->variable_ht, var);
739 }
740
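/* Emit instructions that leave the flag register set according to the given
 * boolean rvalue, and report the predicate a following instruction should use
 * to test it.
 */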
741 void
742 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
743 {
744 ir_expression *expr = ir->as_expression();
745
746 *predicate = BRW_PREDICATE_NORMAL;
747
748 if (expr) {
749 src_reg op[2];
750 vec4_instruction *inst;
751
752 assert(expr->get_num_operands() <= 2);
753 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
754 expr->operands[i]->accept(this);
755 op[i] = this->result;
756
757 resolve_ud_negate(&op[i]);
758 }
759
760 switch (expr->operation) {
761 case ir_unop_logic_not:
762 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
763 inst->conditional_mod = BRW_CONDITIONAL_Z;
764 break;
765
766 case ir_binop_logic_xor:
767 inst = emit(XOR(dst_null_d(), op[0], op[1]));
768 inst->conditional_mod = BRW_CONDITIONAL_NZ;
769 break;
770
771 case ir_binop_logic_or:
772 inst = emit(OR(dst_null_d(), op[0], op[1]));
773 inst->conditional_mod = BRW_CONDITIONAL_NZ;
774 break;
775
776 case ir_binop_logic_and:
777 inst = emit(AND(dst_null_d(), op[0], op[1]));
778 inst->conditional_mod = BRW_CONDITIONAL_NZ;
779 break;
780
781 case ir_unop_f2b:
782 if (brw->gen >= 6) {
783 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
784 } else {
785 inst = emit(MOV(dst_null_f(), op[0]));
786 inst->conditional_mod = BRW_CONDITIONAL_NZ;
787 }
788 break;
789
790 case ir_unop_i2b:
791 if (brw->gen >= 6) {
792 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
793 } else {
794 inst = emit(MOV(dst_null_d(), op[0]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 }
797 break;
798
799 case ir_binop_all_equal:
800 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
801 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
802 break;
803
804 case ir_binop_any_nequal:
805 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
806 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
807 break;
808
809 case ir_unop_any:
810 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
811 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
812 break;
813
814 case ir_binop_greater:
815 case ir_binop_gequal:
816 case ir_binop_less:
817 case ir_binop_lequal:
818 case ir_binop_equal:
819 case ir_binop_nequal:
820 emit(CMP(dst_null_d(), op[0], op[1],
821 brw_conditional_for_comparison(expr->operation)));
822 break;
823
824 default:
825 assert(!"not reached");
826 break;
827 }
828 return;
829 }
830
831 ir->accept(this);
832
833 resolve_ud_negate(&this->result);
834
835 if (brw->gen >= 6) {
836 vec4_instruction *inst = emit(AND(dst_null_d(),
837 this->result, src_reg(1)));
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 } else {
840 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
841 inst->conditional_mod = BRW_CONDITIONAL_NZ;
842 }
843 }
844
845 /**
846 * Emit a gen6 IF statement with the comparison folded into the IF
847 * instruction.
848 */
849 void
850 vec4_visitor::emit_if_gen6(ir_if *ir)
851 {
852 ir_expression *expr = ir->condition->as_expression();
853
854 if (expr) {
855 src_reg op[2];
856 dst_reg temp;
857
858 assert(expr->get_num_operands() <= 2);
859 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
860 expr->operands[i]->accept(this);
861 op[i] = this->result;
862 }
863
864 switch (expr->operation) {
865 case ir_unop_logic_not:
866 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
867 return;
868
869 case ir_binop_logic_xor:
870 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
871 return;
872
873 case ir_binop_logic_or:
874 temp = dst_reg(this, glsl_type::bool_type);
875 emit(OR(temp, op[0], op[1]));
876 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
877 return;
878
879 case ir_binop_logic_and:
880 temp = dst_reg(this, glsl_type::bool_type);
881 emit(AND(temp, op[0], op[1]));
882 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
883 return;
884
885 case ir_unop_f2b:
886 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
887 return;
888
889 case ir_unop_i2b:
890 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
891 return;
892
893 case ir_binop_greater:
894 case ir_binop_gequal:
895 case ir_binop_less:
896 case ir_binop_lequal:
897 case ir_binop_equal:
898 case ir_binop_nequal:
899 emit(IF(op[0], op[1],
900 brw_conditional_for_comparison(expr->operation)));
901 return;
902
903 case ir_binop_all_equal:
904 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
905 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
906 return;
907
908 case ir_binop_any_nequal:
909 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
910 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
911 return;
912
913 case ir_unop_any:
914 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
916 return;
917
918 default:
919 assert(!"not reached");
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922 }
923 return;
924 }
925
926 ir->condition->accept(this);
927
928 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
929 }
930
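/* Return a copy of the given dst_reg with its writemask replaced. */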
931 dst_reg
932 with_writemask(dst_reg const & r, int mask)
933 {
934 dst_reg result = r;
935 result.writemask = mask;
936 return result;
937 }
938
939
940 void
941 vec4_visitor::visit(ir_variable *ir)
942 {
943 dst_reg *reg = NULL;
944
945 if (variable_storage(ir))
946 return;
947
948 switch (ir->data.mode) {
949 case ir_var_shader_in:
950 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
951 break;
952
953 case ir_var_shader_out:
954 reg = new(mem_ctx) dst_reg(this, ir->type);
955
956 for (int i = 0; i < type_size(ir->type); i++) {
957 output_reg[ir->location + i] = *reg;
958 output_reg[ir->location + i].reg_offset = i;
959 output_reg[ir->location + i].type =
960 brw_type_for_base_type(ir->type->get_scalar_type());
961 output_reg_annotation[ir->location + i] = ir->name;
962 }
963 break;
964
965 case ir_var_auto:
966 case ir_var_temporary:
967 reg = new(mem_ctx) dst_reg(this, ir->type);
968 break;
969
970 case ir_var_uniform:
971 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
972
973 /* Thanks to the lower_ubo_reference pass, we will see only
974 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
975 * variables, so no need for them to be in variable_ht.
976 *
977 * Atomic counters take no uniform storage, no need to do
978 * anything here.
979 */
980 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
981 return;
982
983 /* Track how big the whole uniform variable is, in case we need to put a
984 * copy of its data into pull constants for array access.
985 */
986 this->uniform_size[this->uniforms] = type_size(ir->type);
987
988 if (!strncmp(ir->name, "gl_", 3)) {
989 setup_builtin_uniform_values(ir);
990 } else {
991 setup_uniform_values(ir);
992 }
993 break;
994
995 case ir_var_system_value:
996 reg = make_reg_for_system_value(ir);
997 break;
998
999 default:
1000 assert(!"not reached");
1001 }
1002
1003 reg->type = brw_type_for_base_type(ir->type);
1004 hash_table_insert(this->variable_ht, reg, ir);
1005 }
1006
1007 void
1008 vec4_visitor::visit(ir_loop *ir)
1009 {
1010 /* We don't want debugging output to print the whole body of the
1011 * loop as the annotation.
1012 */
1013 this->base_ir = NULL;
1014
1015 emit(BRW_OPCODE_DO);
1016
1017 visit_instructions(&ir->body_instructions);
1018
1019 emit(BRW_OPCODE_WHILE);
1020 }
1021
1022 void
1023 vec4_visitor::visit(ir_loop_jump *ir)
1024 {
1025 switch (ir->mode) {
1026 case ir_loop_jump::jump_break:
1027 emit(BRW_OPCODE_BREAK);
1028 break;
1029 case ir_loop_jump::jump_continue:
1030 emit(BRW_OPCODE_CONTINUE);
1031 break;
1032 }
1033 }
1034
1035
1036 void
1037 vec4_visitor::visit(ir_function_signature *ir)
1038 {
1039 assert(0);
1040 (void)ir;
1041 }
1042
1043 void
1044 vec4_visitor::visit(ir_function *ir)
1045 {
1046 /* Ignore function bodies other than main() -- we shouldn't see calls to
1047 * them since they should all be inlined.
1048 */
1049 if (strcmp(ir->name, "main") == 0) {
1050 const ir_function_signature *sig;
1051 exec_list empty;
1052
1053 sig = ir->matching_signature(NULL, &empty);
1054
1055 assert(sig);
1056
1057 visit_instructions(&sig->body);
1058 }
1059 }
1060
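/* If the expression has been marked as a candidate for saturation, emit it
 * through a saturating MOV and return true.
 */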
1061 bool
1062 vec4_visitor::try_emit_sat(ir_expression *ir)
1063 {
1064 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1065 if (!sat_src)
1066 return false;
1067
1068 sat_src->accept(this);
1069 src_reg src = this->result;
1070
1071 this->result = src_reg(this, ir->type);
1072 vec4_instruction *inst;
1073 inst = emit(MOV(dst_reg(this->result), src));
1074 inst->saturate = true;
1075
1076 return true;
1077 }
1078
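/* Try to fuse a pattern like 'a * b + c' into a single MAD instruction
 * (gen6+, float-only); return false if the operands don't match.
 */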
1079 bool
1080 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1081 {
1082 /* 3-src instructions were introduced in gen6. */
1083 if (brw->gen < 6)
1084 return false;
1085
1086 /* MAD can only handle floating-point data. */
1087 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1088 return false;
1089
1090 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1091 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1092
1093 if (!mul || mul->operation != ir_binop_mul)
1094 return false;
1095
1096 nonmul->accept(this);
1097 src_reg src0 = fix_3src_operand(this->result);
1098
1099 mul->operands[0]->accept(this);
1100 src_reg src1 = fix_3src_operand(this->result);
1101
1102 mul->operands[1]->accept(this);
1103 src_reg src2 = fix_3src_operand(this->result);
1104
1105 this->result = src_reg(this, ir->type);
1106 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1107
1108 return true;
1109 }
1110
1111 void
1112 vec4_visitor::emit_bool_comparison(unsigned int op,
1113 dst_reg dst, src_reg src0, src_reg src1)
1114 {
1115 /* original gen4 does destination conversion before comparison. */
1116 if (brw->gen < 5)
1117 dst.type = src0.type;
1118
1119 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1120
1121 dst.type = BRW_REGISTER_TYPE_D;
1122 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1123 }
1124
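/* Emit a min/max: a SEL with a conditional mod on gen6+, or a CMP followed
 * by a predicated SEL on earlier generations.
 */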
1125 void
1126 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1127 src_reg src0, src_reg src1)
1128 {
1129 vec4_instruction *inst;
1130
1131 if (brw->gen >= 6) {
1132 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1133 inst->conditional_mod = conditionalmod;
1134 } else {
1135 emit(CMP(dst, src0, src1, conditionalmod));
1136
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->predicate = BRW_PREDICATE_NORMAL;
1139 }
1140 }
1141
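/* Return true if the rvalue is an integer constant whose value fits in the
 * low 16 bits, so that a single MUL can handle it.
 */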
1142 static bool
1143 is_16bit_constant(ir_rvalue *rvalue)
1144 {
1145 ir_constant *constant = rvalue->as_constant();
1146 if (!constant)
1147 return false;
1148
1149 if (constant->type != glsl_type::int_type &&
1150 constant->type != glsl_type::uint_type)
1151 return false;
1152
1153 return constant->value.u[0] < (1 << 16);
1154 }
1155
1156 void
1157 vec4_visitor::visit(ir_expression *ir)
1158 {
1159 unsigned int operand;
1160 src_reg op[Elements(ir->operands)];
1161 src_reg result_src;
1162 dst_reg result_dst;
1163 vec4_instruction *inst;
1164
1165 if (try_emit_sat(ir))
1166 return;
1167
1168 if (ir->operation == ir_binop_add) {
1169 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1170 return;
1171 }
1172
1173 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1174 this->result.file = BAD_FILE;
1175 ir->operands[operand]->accept(this);
1176 if (this->result.file == BAD_FILE) {
1177 printf("Failed to get tree for expression operand:\n");
1178 ir->operands[operand]->print();
1179 exit(1);
1180 }
1181 op[operand] = this->result;
1182
1183 /* Matrix expression operands should have been broken down to vector
1184 * operations already.
1185 */
1186 assert(!ir->operands[operand]->type->is_matrix());
1187 }
1188
1189 int vector_elements = ir->operands[0]->type->vector_elements;
1190 if (ir->operands[1]) {
1191 vector_elements = MAX2(vector_elements,
1192 ir->operands[1]->type->vector_elements);
1193 }
1194
1195 this->result.file = BAD_FILE;
1196
1197 /* Storage for our result. Ideally for an assignment we'd be using
1198 * the actual storage for the result here, instead.
1199 */
1200 result_src = src_reg(this, ir->type);
1201 /* convenience for the emit functions below. */
1202 result_dst = dst_reg(result_src);
1203 /* If nothing special happens, this is the result. */
1204 this->result = result_src;
1205 /* Limit writes to the channels that will be used by result_src later.
1206 * This does limit this temp's use as a temporary for multi-instruction
1207 * sequences.
1208 */
1209 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1210
1211 switch (ir->operation) {
1212 case ir_unop_logic_not:
1213 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1214 * the one's complement of the whole register, not just bit 0.
1215 */
1216 emit(XOR(result_dst, op[0], src_reg(1)));
1217 break;
1218 case ir_unop_neg:
1219 op[0].negate = !op[0].negate;
1220 emit(MOV(result_dst, op[0]));
1221 break;
1222 case ir_unop_abs:
1223 op[0].abs = true;
1224 op[0].negate = false;
1225 emit(MOV(result_dst, op[0]));
1226 break;
1227
1228 case ir_unop_sign:
1229 if (ir->type->is_float()) {
1230 /* AND(val, 0x80000000) gives the sign bit.
1231 *
1232 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1233 * zero.
1234 */
1235 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1236
1237 op[0].type = BRW_REGISTER_TYPE_UD;
1238 result_dst.type = BRW_REGISTER_TYPE_UD;
1239 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1240
1241 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1242 inst->predicate = BRW_PREDICATE_NORMAL;
1243
1244 this->result.type = BRW_REGISTER_TYPE_F;
1245 } else {
1246 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1247 * -> non-negative val generates 0x00000000.
1248 * Predicated OR sets 1 if val is positive.
1249 */
1250 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1251
1252 emit(ASR(result_dst, op[0], src_reg(31)));
1253
1254 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1255 inst->predicate = BRW_PREDICATE_NORMAL;
1256 }
1257 break;
1258
1259 case ir_unop_rcp:
1260 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1261 break;
1262
1263 case ir_unop_exp2:
1264 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1265 break;
1266 case ir_unop_log2:
1267 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1268 break;
1269 case ir_unop_exp:
1270 case ir_unop_log:
1271 assert(!"not reached: should be handled by ir_explog_to_explog2");
1272 break;
1273 case ir_unop_sin:
1274 case ir_unop_sin_reduced:
1275 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1276 break;
1277 case ir_unop_cos:
1278 case ir_unop_cos_reduced:
1279 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1280 break;
1281
1282 case ir_unop_dFdx:
1283 case ir_unop_dFdy:
1284 assert(!"derivatives not valid in vertex shader");
1285 break;
1286
1287 case ir_unop_bitfield_reverse:
1288 emit(BFREV(result_dst, op[0]));
1289 break;
1290 case ir_unop_bit_count:
1291 emit(CBIT(result_dst, op[0]));
1292 break;
1293 case ir_unop_find_msb: {
1294 src_reg temp = src_reg(this, glsl_type::uint_type);
1295
1296 inst = emit(FBH(dst_reg(temp), op[0]));
1297 inst->dst.writemask = WRITEMASK_XYZW;
1298
1299 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1300 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1301 * subtract the result from 31 to convert the MSB count into an LSB count.
1302 */
1303
1304 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1305 temp.swizzle = BRW_SWIZZLE_NOOP;
1306 emit(MOV(result_dst, temp));
1307
1308 src_reg src_tmp = src_reg(result_dst);
1309 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1310
1311 src_tmp.negate = true;
1312 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314 break;
1315 }
1316 case ir_unop_find_lsb:
1317 emit(FBL(result_dst, op[0]));
1318 break;
1319
1320 case ir_unop_noise:
1321 assert(!"not reached: should be handled by lower_noise");
1322 break;
1323
1324 case ir_binop_add:
1325 emit(ADD(result_dst, op[0], op[1]));
1326 break;
1327 case ir_binop_sub:
1328 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1329 break;
1330
1331 case ir_binop_mul:
1332 if (brw->gen < 8 && ir->type->is_integer()) {
1333 /* For integer multiplication, the MUL uses the low 16 bits of one of
1334 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1335 * then adds in the contribution of the upper 16 bits of that
1336 * operand. If we can determine that one of the args is in the low
1337 * 16 bits, though, we can just emit a single MUL.
1338 */
1339 if (is_16bit_constant(ir->operands[0])) {
1340 if (brw->gen < 7)
1341 emit(MUL(result_dst, op[0], op[1]));
1342 else
1343 emit(MUL(result_dst, op[1], op[0]));
1344 } else if (is_16bit_constant(ir->operands[1])) {
1345 if (brw->gen < 7)
1346 emit(MUL(result_dst, op[1], op[0]));
1347 else
1348 emit(MUL(result_dst, op[0], op[1]));
1349 } else {
1350 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1351
1352 emit(MUL(acc, op[0], op[1]));
1353 emit(MACH(dst_null_d(), op[0], op[1]));
1354 emit(MOV(result_dst, src_reg(acc)));
1355 }
1356 } else {
1357 emit(MUL(result_dst, op[0], op[1]));
1358 }
1359 break;
1360 case ir_binop_imul_high: {
1361 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1362
1363 emit(MUL(acc, op[0], op[1]));
1364 emit(MACH(result_dst, op[0], op[1]));
1365 break;
1366 }
1367 case ir_binop_div:
1368 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1369 assert(ir->type->is_integer());
1370 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1371 break;
1372 case ir_binop_carry: {
1373 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1374
1375 emit(ADDC(dst_null_ud(), op[0], op[1]));
1376 emit(MOV(result_dst, src_reg(acc)));
1377 break;
1378 }
1379 case ir_binop_borrow: {
1380 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1381
1382 emit(SUBB(dst_null_ud(), op[0], op[1]));
1383 emit(MOV(result_dst, src_reg(acc)));
1384 break;
1385 }
1386 case ir_binop_mod:
1387 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1388 assert(ir->type->is_integer());
1389 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1390 break;
1391
1392 case ir_binop_less:
1393 case ir_binop_greater:
1394 case ir_binop_lequal:
1395 case ir_binop_gequal:
1396 case ir_binop_equal:
1397 case ir_binop_nequal: {
1398 emit(CMP(result_dst, op[0], op[1],
1399 brw_conditional_for_comparison(ir->operation)));
1400 emit(AND(result_dst, result_src, src_reg(0x1)));
1401 break;
1402 }
1403
1404 case ir_binop_all_equal:
1405 /* "==" operator producing a scalar boolean. */
1406 if (ir->operands[0]->type->is_vector() ||
1407 ir->operands[1]->type->is_vector()) {
1408 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1409 emit(MOV(result_dst, src_reg(0)));
1410 inst = emit(MOV(result_dst, src_reg(1)));
1411 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1412 } else {
1413 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1414 emit(AND(result_dst, result_src, src_reg(0x1)));
1415 }
1416 break;
1417 case ir_binop_any_nequal:
1418 /* "!=" operator producing a scalar boolean. */
1419 if (ir->operands[0]->type->is_vector() ||
1420 ir->operands[1]->type->is_vector()) {
1421 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1422
1423 emit(MOV(result_dst, src_reg(0)));
1424 inst = emit(MOV(result_dst, src_reg(1)));
1425 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1426 } else {
1427 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1428 emit(AND(result_dst, result_src, src_reg(0x1)));
1429 }
1430 break;
1431
1432 case ir_unop_any:
1433 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1434 emit(MOV(result_dst, src_reg(0)));
1435
1436 inst = emit(MOV(result_dst, src_reg(1)));
1437 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1438 break;
1439
1440 case ir_binop_logic_xor:
1441 emit(XOR(result_dst, op[0], op[1]));
1442 break;
1443
1444 case ir_binop_logic_or:
1445 emit(OR(result_dst, op[0], op[1]));
1446 break;
1447
1448 case ir_binop_logic_and:
1449 emit(AND(result_dst, op[0], op[1]));
1450 break;
1451
1452 case ir_binop_dot:
1453 assert(ir->operands[0]->type->is_vector());
1454 assert(ir->operands[0]->type == ir->operands[1]->type);
1455 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1456 break;
1457
1458 case ir_unop_sqrt:
1459 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1460 break;
1461 case ir_unop_rsq:
1462 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1463 break;
1464
1465 case ir_unop_bitcast_i2f:
1466 case ir_unop_bitcast_u2f:
1467 this->result = op[0];
1468 this->result.type = BRW_REGISTER_TYPE_F;
1469 break;
1470
1471 case ir_unop_bitcast_f2i:
1472 this->result = op[0];
1473 this->result.type = BRW_REGISTER_TYPE_D;
1474 break;
1475
1476 case ir_unop_bitcast_f2u:
1477 this->result = op[0];
1478 this->result.type = BRW_REGISTER_TYPE_UD;
1479 break;
1480
1481 case ir_unop_i2f:
1482 case ir_unop_i2u:
1483 case ir_unop_u2i:
1484 case ir_unop_u2f:
1485 case ir_unop_b2f:
1486 case ir_unop_b2i:
1487 case ir_unop_f2i:
1488 case ir_unop_f2u:
1489 emit(MOV(result_dst, op[0]));
1490 break;
1491 case ir_unop_f2b:
1492 case ir_unop_i2b: {
1493 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1494 emit(AND(result_dst, result_src, src_reg(1)));
1495 break;
1496 }
1497
1498 case ir_unop_trunc:
1499 emit(RNDZ(result_dst, op[0]));
1500 break;
1501 case ir_unop_ceil:
1502 op[0].negate = !op[0].negate;
1503 inst = emit(RNDD(result_dst, op[0]));
1504 this->result.negate = true;
1505 break;
1506 case ir_unop_floor:
1507 inst = emit(RNDD(result_dst, op[0]));
1508 break;
1509 case ir_unop_fract:
1510 inst = emit(FRC(result_dst, op[0]));
1511 break;
1512 case ir_unop_round_even:
1513 emit(RNDE(result_dst, op[0]));
1514 break;
1515
1516 case ir_binop_min:
1517 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1518 break;
1519 case ir_binop_max:
1520 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1521 break;
1522
1523 case ir_binop_pow:
1524 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1525 break;
1526
1527 case ir_unop_bit_not:
1528 inst = emit(NOT(result_dst, op[0]));
1529 break;
1530 case ir_binop_bit_and:
1531 inst = emit(AND(result_dst, op[0], op[1]));
1532 break;
1533 case ir_binop_bit_xor:
1534 inst = emit(XOR(result_dst, op[0], op[1]));
1535 break;
1536 case ir_binop_bit_or:
1537 inst = emit(OR(result_dst, op[0], op[1]));
1538 break;
1539
1540 case ir_binop_lshift:
1541 inst = emit(SHL(result_dst, op[0], op[1]));
1542 break;
1543
1544 case ir_binop_rshift:
1545 if (ir->type->base_type == GLSL_TYPE_INT)
1546 inst = emit(ASR(result_dst, op[0], op[1]));
1547 else
1548 inst = emit(SHR(result_dst, op[0], op[1]));
1549 break;
1550
1551 case ir_binop_bfm:
1552 emit(BFI1(result_dst, op[0], op[1]));
1553 break;
1554
1555 case ir_binop_ubo_load: {
1556 ir_constant *uniform_block = ir->operands[0]->as_constant();
1557 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1558 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1559 src_reg offset;
1560
1561 /* Now, load the vector from that offset. */
1562 assert(ir->type->is_vector() || ir->type->is_scalar());
1563
1564 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1565 packed_consts.type = result.type;
1566 src_reg surf_index =
1567 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1568 if (const_offset_ir) {
1569 if (brw->gen >= 8) {
1570 /* Store the offset in a GRF so we can send-from-GRF. */
1571 offset = src_reg(this, glsl_type::int_type);
1572 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1573 } else {
1574 /* Immediates are fine on older generations since they'll be moved
1575 * to a (potentially fake) MRF at the generator level.
1576 */
1577 offset = src_reg(const_offset / 16);
1578 }
1579 } else {
1580 offset = src_reg(this, glsl_type::uint_type);
1581 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1582 }
1583
1584 vec4_instruction *pull =
1585 emit(new(mem_ctx) vec4_instruction(this,
1586 VS_OPCODE_PULL_CONSTANT_LOAD,
1587 dst_reg(packed_consts),
1588 surf_index,
1589 offset));
1590 pull->base_mrf = 14;
1591 pull->mlen = 1;
1592
1593 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1594 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1595 const_offset % 16 / 4,
1596 const_offset % 16 / 4,
1597 const_offset % 16 / 4);
1598
1599 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1600 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1601 emit(CMP(result_dst, packed_consts, src_reg(0u),
1602 BRW_CONDITIONAL_NZ));
1603 emit(AND(result_dst, result, src_reg(0x1)));
1604 } else {
1605 emit(MOV(result_dst, packed_consts));
1606 }
1607 break;
1608 }
1609
1610 case ir_binop_vector_extract:
1611 assert(!"should have been lowered by vec_index_to_cond_assign");
1612 break;
1613
1614 case ir_triop_fma:
1615 op[0] = fix_3src_operand(op[0]);
1616 op[1] = fix_3src_operand(op[1]);
1617 op[2] = fix_3src_operand(op[2]);
1618 /* Note that the instruction's argument order is reversed from GLSL
1619 * and the IR.
1620 */
1621 emit(MAD(result_dst, op[2], op[1], op[0]));
1622 break;
1623
1624 case ir_triop_lrp:
1625 op[0] = fix_3src_operand(op[0]);
1626 op[1] = fix_3src_operand(op[1]);
1627 op[2] = fix_3src_operand(op[2]);
1628 /* Note that the instruction's argument order is reversed from GLSL
1629 * and the IR.
1630 */
1631 emit(LRP(result_dst, op[2], op[1], op[0]));
1632 break;
1633
1634 case ir_triop_csel:
1635 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1636 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1637 inst->predicate = BRW_PREDICATE_NORMAL;
1638 break;
1639
1640 case ir_triop_bfi:
1641 op[0] = fix_3src_operand(op[0]);
1642 op[1] = fix_3src_operand(op[1]);
1643 op[2] = fix_3src_operand(op[2]);
1644 emit(BFI2(result_dst, op[0], op[1], op[2]));
1645 break;
1646
1647 case ir_triop_bitfield_extract:
1648 op[0] = fix_3src_operand(op[0]);
1649 op[1] = fix_3src_operand(op[1]);
1650 op[2] = fix_3src_operand(op[2]);
1651 /* Note that the instruction's argument order is reversed from GLSL
1652 * and the IR.
1653 */
1654 emit(BFE(result_dst, op[2], op[1], op[0]));
1655 break;
1656
1657 case ir_triop_vector_insert:
1658 assert(!"should have been lowered by lower_vector_insert");
1659 break;
1660
1661 case ir_quadop_bitfield_insert:
1662 assert(!"not reached: should be handled by "
1663 "bitfield_insert_to_bfm_bfi\n");
1664 break;
1665
1666 case ir_quadop_vector:
1667 assert(!"not reached: should be handled by lower_quadop_vector");
1668 break;
1669
1670 case ir_unop_pack_half_2x16:
1671 emit_pack_half_2x16(result_dst, op[0]);
1672 break;
1673 case ir_unop_unpack_half_2x16:
1674 emit_unpack_half_2x16(result_dst, op[0]);
1675 break;
1676 case ir_unop_pack_snorm_2x16:
1677 case ir_unop_pack_snorm_4x8:
1678 case ir_unop_pack_unorm_2x16:
1679 case ir_unop_pack_unorm_4x8:
1680 case ir_unop_unpack_snorm_2x16:
1681 case ir_unop_unpack_snorm_4x8:
1682 case ir_unop_unpack_unorm_2x16:
1683 case ir_unop_unpack_unorm_4x8:
1684 assert(!"not reached: should be handled by lower_packing_builtins");
1685 break;
1686 case ir_unop_unpack_half_2x16_split_x:
1687 case ir_unop_unpack_half_2x16_split_y:
1688 case ir_binop_pack_half_2x16_split:
1689 assert(!"not reached: should not occur in vertex shader");
1690 break;
1691 case ir_binop_ldexp:
1692 assert(!"not reached: should be handled by ldexp_to_arith()");
1693 break;
1694 }
1695 }
1696
1697
1698 void
1699 vec4_visitor::visit(ir_swizzle *ir)
1700 {
1701 src_reg src;
1702 int i = 0;
1703 int swizzle[4];
1704
1705 /* Note that this is only swizzles in expressions, not those on the left
1706 * hand side of an assignment, which do write masking. See ir_assignment
1707 * for that.
1708 */
1709
1710 ir->val->accept(this);
1711 src = this->result;
1712 assert(src.file != BAD_FILE);
1713
1714 for (i = 0; i < ir->type->vector_elements; i++) {
1715 switch (i) {
1716 case 0:
1717 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1718 break;
1719 case 1:
1720 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1721 break;
1722 case 2:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1724 break;
1725 case 3:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1727 break;
1728 }
1729 }
1730 for (; i < 4; i++) {
1731 /* Replicate the last channel out. */
1732 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1733 }
1734
1735 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1736
1737 this->result = src;
1738 }
1739
1740 void
1741 vec4_visitor::visit(ir_dereference_variable *ir)
1742 {
1743 const struct glsl_type *type = ir->type;
1744 dst_reg *reg = variable_storage(ir->var);
1745
1746 if (!reg) {
1747 fail("Failed to find variable storage for %s\n", ir->var->name);
1748 this->result = src_reg(brw_null_reg());
1749 return;
1750 }
1751
1752 this->result = src_reg(*reg);
1753
1754 /* System values get their swizzle from the dst_reg writemask */
1755 if (ir->var->data.mode == ir_var_system_value)
1756 return;
1757
1758 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1759 this->result.swizzle = swizzle_for_size(type->vector_elements);
1760 }
1761
1762
1763 int
1764 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1765 {
1766 /* Under normal circumstances array elements are stored consecutively, so
1767 * the stride is equal to the size of the array element.
1768 */
1769 return type_size(ir->type);
1770 }
1771
1772
1773 void
1774 vec4_visitor::visit(ir_dereference_array *ir)
1775 {
1776 ir_constant *constant_index;
1777 src_reg src;
1778 int array_stride = compute_array_stride(ir);
1779
1780 constant_index = ir->array_index->constant_expression_value();
1781
1782 ir->array->accept(this);
1783 src = this->result;
1784
1785 if (constant_index) {
1786 src.reg_offset += constant_index->value.i[0] * array_stride;
1787 } else {
1788 /* Variable index array dereference. It eats the "vec4" of the
1789 * base of the array and an index that offsets the Mesa register
1790 * index.
1791 */
1792 ir->array_index->accept(this);
1793
1794 src_reg index_reg;
1795
1796 if (array_stride == 1) {
1797 index_reg = this->result;
1798 } else {
1799 index_reg = src_reg(this, glsl_type::int_type);
1800
1801 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1802 }
1803
1804 if (src.reladdr) {
1805 src_reg temp = src_reg(this, glsl_type::int_type);
1806
1807 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1808
1809 index_reg = temp;
1810 }
1811
1812 src.reladdr = ralloc(mem_ctx, src_reg);
1813 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1814 }
1815
1816 /* If the type is smaller than a vec4, replicate the last channel out. */
1817 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1818 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1819 else
1820 src.swizzle = BRW_SWIZZLE_NOOP;
1821 src.type = brw_type_for_base_type(ir->type);
1822
1823 this->result = src;
1824 }
1825
1826 void
1827 vec4_visitor::visit(ir_dereference_record *ir)
1828 {
1829 unsigned int i;
1830 const glsl_type *struct_type = ir->record->type;
1831 int offset = 0;
1832
1833 ir->record->accept(this);
1834
1835 for (i = 0; i < struct_type->length; i++) {
1836 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1837 break;
1838 offset += type_size(struct_type->fields.structure[i].type);
1839 }
1840
1841 /* If the type is smaller than a vec4, replicate the last channel out. */
1842 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1843 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1844 else
1845 this->result.swizzle = BRW_SWIZZLE_NOOP;
1846 this->result.type = brw_type_for_base_type(ir->type);
1847
1848 this->result.reg_offset += offset;
1849 }
1850
1851 /**
1852 * We want to be careful in assignment setup to hit the actual storage
1853 * instead of potentially using a temporary like we might with the
1854 * ir_dereference handler.
1855 */
1856 static dst_reg
1857 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1858 {
1859 /* The LHS must be a dereference. If the LHS is a variable-indexed array
1860 * access of a vector, it must be separated into a series of conditional moves
1861 * before reaching this point (see ir_vec_index_to_cond_assign).
1862 */
1863 assert(ir->as_dereference());
1864 ir_dereference_array *deref_array = ir->as_dereference_array();
1865 if (deref_array) {
1866 assert(!deref_array->array->type->is_vector());
1867 }
1868
1869 /* Use the rvalue deref handler for the most part. We'll ignore
1870 * swizzles in it and write swizzles using writemask, though.
1871 */
1872 ir->accept(v);
1873 return dst_reg(v->result);
1874 }
1875
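/* Copy a whole value of the given type from src to dst one vec4 at a time,
 * recursing through structs, arrays and matrices and advancing the register
 * offsets as it goes.
 */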
1876 void
1877 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1878 const struct glsl_type *type, uint32_t predicate)
1879 {
1880 if (type->base_type == GLSL_TYPE_STRUCT) {
1881 for (unsigned int i = 0; i < type->length; i++) {
1882 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1883 }
1884 return;
1885 }
1886
1887 if (type->is_array()) {
1888 for (unsigned int i = 0; i < type->length; i++) {
1889 emit_block_move(dst, src, type->fields.array, predicate);
1890 }
1891 return;
1892 }
1893
1894 if (type->is_matrix()) {
1895 const struct glsl_type *vec_type;
1896
1897 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1898 type->vector_elements, 1);
1899
1900 for (int i = 0; i < type->matrix_columns; i++) {
1901 emit_block_move(dst, src, vec_type, predicate);
1902 }
1903 return;
1904 }
1905
1906 assert(type->is_scalar() || type->is_vector());
1907
1908 dst->type = brw_type_for_base_type(type);
1909 src->type = dst->type;
1910
1911 dst->writemask = (1 << type->vector_elements) - 1;
1912
1913 src->swizzle = swizzle_for_size(type->vector_elements);
1914
1915 vec4_instruction *inst = emit(MOV(*dst, *src));
1916 inst->predicate = predicate;
1917
1918 dst->reg_offset++;
1919 src->reg_offset++;
1920 }
1921
1922
1923 /* If the RHS processing resulted in an instruction generating a
1924 * temporary value, and it would be easy to rewrite the instruction to
1925 * generate its result right into the LHS instead, do so. This ends
1926 * up reliably removing instructions where it can be tricky to do so
1927 * later without real UD chain information.
1928 */
1929 bool
1930 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1931 dst_reg dst,
1932 src_reg src,
1933 vec4_instruction *pre_rhs_inst,
1934 vec4_instruction *last_rhs_inst)
1935 {
1936 /* This could be supported, but it would take more smarts. */
1937 if (ir->condition)
1938 return false;
1939
1940 if (pre_rhs_inst == last_rhs_inst)
1941 return false; /* No instructions generated to work with. */
1942
1943 /* Make sure the last instruction generated our source reg. */
1944 if (src.file != GRF ||
1945 src.file != last_rhs_inst->dst.file ||
1946 src.reg != last_rhs_inst->dst.reg ||
1947 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1948 src.reladdr ||
1949 src.abs ||
1950 src.negate ||
1951 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1952 return false;
1953
1954 /* Check that the last instruction fully initialized the channels
1955 * we want to use, in the order we want to use them. We could
1956 * potentially reswizzle the operands of many instructions so that
1957 * we could handle out of order channels, but don't yet.
1958 */
1959
1960 for (unsigned i = 0; i < 4; i++) {
1961 if (dst.writemask & (1 << i)) {
1962 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1963 return false;
1964
1965 if (BRW_GET_SWZ(src.swizzle, i) != i)
1966 return false;
1967 }
1968 }
1969
1970 /* Success! Rewrite the instruction. */
1971 last_rhs_inst->dst.file = dst.file;
1972 last_rhs_inst->dst.reg = dst.reg;
1973 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1974 last_rhs_inst->dst.reladdr = dst.reladdr;
1975 last_rhs_inst->dst.writemask &= dst.writemask;
1976
1977 return true;
1978 }
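/* Illustrative case (register names hypothetical): if the RHS produced
 * "add vgrf4.xyz, a, b" and the assignment would then emit
 * "mov dst.xyz, vgrf4.xyzz", try_rewrite_rhs_to_dst() turns the ADD into
 * "add dst.xyz, a, b" and the caller skips emitting the copy MOV.
 */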
1979
1980 void
1981 vec4_visitor::visit(ir_assignment *ir)
1982 {
1983 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1984 uint32_t predicate = BRW_PREDICATE_NONE;
1985
1986 if (!ir->lhs->type->is_scalar() &&
1987 !ir->lhs->type->is_vector()) {
1988 ir->rhs->accept(this);
1989 src_reg src = this->result;
1990
1991 if (ir->condition) {
1992 emit_bool_to_cond_code(ir->condition, &predicate);
1993 }
1994
1995 /* emit_block_move doesn't account for swizzles in the source register.
1996 * This should be OK, since the source is a structure, an array, or a
1997 * matrix, none of which can be swizzled in GLSL. But double-check to be sure.
1998 */
1999 assert(src.swizzle ==
2000 (ir->rhs->type->is_matrix()
2001 ? swizzle_for_size(ir->rhs->type->vector_elements)
2002 : BRW_SWIZZLE_NOOP));
2003
2004 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2005 return;
2006 }
2007
2008 /* Now we're down to just a scalar/vector with writemasks. */
2009 int i;
2010
2011 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2012 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2013
2014 ir->rhs->accept(this);
2015
2016 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2017
2018 src_reg src = this->result;
2019
2020 int swizzles[4];
2021 int first_enabled_chan = 0;
2022 int src_chan = 0;
2023
2024 assert(ir->lhs->type->is_vector() ||
2025 ir->lhs->type->is_scalar());
2026 dst.writemask = ir->write_mask;
2027
2028 for (int i = 0; i < 4; i++) {
2029 if (dst.writemask & (1 << i)) {
2030 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2031 break;
2032 }
2033 }
2034
2035 /* Swizzle a small RHS vector into the channels being written.
2036 *
2037 * GLSL IR treats write_mask as dictating how many channels are
2038 * present on the RHS, while in our instructions we need to make
2039 * those channels appear in the slots of the vec4 they're written to.
2040 */
2041 for (int i = 0; i < 4; i++) {
2042 if (dst.writemask & (1 << i))
2043 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2044 else
2045 swizzles[i] = first_enabled_chan;
2046 }
2047 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2048 swizzles[2], swizzles[3]);
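/* E.g. for "v.yz = some_vec2" (illustrative GLSL): write_mask is YZ and the
 * vec2 result typically carries the .xyyy size swizzle, so the remap yields
 * .yxyy: RHS channel x lands in dst.y and channel y in dst.z, while the
 * unwritten channels are masked off anyway.
 */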
2049
2050 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2051 return;
2052 }
2053
2054 if (ir->condition) {
2055 emit_bool_to_cond_code(ir->condition, &predicate);
2056 }
2057
2058 for (i = 0; i < type_size(ir->lhs->type); i++) {
2059 vec4_instruction *inst = emit(MOV(dst, src));
2060 inst->predicate = predicate;
2061
2062 dst.reg_offset++;
2063 src.reg_offset++;
2064 }
2065 }
2066
2067 void
2068 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2069 {
2070 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2071 foreach_list(node, &ir->components) {
2072 ir_constant *field_value = (ir_constant *)node;
2073
2074 emit_constant_values(dst, field_value);
2075 }
2076 return;
2077 }
2078
2079 if (ir->type->is_array()) {
2080 for (unsigned int i = 0; i < ir->type->length; i++) {
2081 emit_constant_values(dst, ir->array_elements[i]);
2082 }
2083 return;
2084 }
2085
2086 if (ir->type->is_matrix()) {
2087 for (int i = 0; i < ir->type->matrix_columns; i++) {
2088 float *vec = &ir->value.f[i * ir->type->vector_elements];
2089
2090 for (int j = 0; j < ir->type->vector_elements; j++) {
2091 dst->writemask = 1 << j;
2092 dst->type = BRW_REGISTER_TYPE_F;
2093
2094 emit(MOV(*dst, src_reg(vec[j])));
2095 }
2096 dst->reg_offset++;
2097 }
2098 return;
2099 }
2100
2101 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2102
2103 for (int i = 0; i < ir->type->vector_elements; i++) {
2104 if (!(remaining_writemask & (1 << i)))
2105 continue;
2106
2107 dst->writemask = 1 << i;
2108 dst->type = brw_type_for_base_type(ir->type);
2109
2110 /* Find other components that match the one we're about to
2111 * write. Emits fewer instructions for things like vec4(0.5,
2112 * 1.5, 1.5, 1.5).
2113 */
2114 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2115 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2116 if (ir->value.b[i] == ir->value.b[j])
2117 dst->writemask |= (1 << j);
2118 } else {
2119 /* u, i, and f storage all line up, so no need for a
2120 * switch case for comparing each type.
2121 */
2122 if (ir->value.u[i] == ir->value.u[j])
2123 dst->writemask |= (1 << j);
2124 }
2125 }
2126
2127 switch (ir->type->base_type) {
2128 case GLSL_TYPE_FLOAT:
2129 emit(MOV(*dst, src_reg(ir->value.f[i])));
2130 break;
2131 case GLSL_TYPE_INT:
2132 emit(MOV(*dst, src_reg(ir->value.i[i])));
2133 break;
2134 case GLSL_TYPE_UINT:
2135 emit(MOV(*dst, src_reg(ir->value.u[i])));
2136 break;
2137 case GLSL_TYPE_BOOL:
2138 emit(MOV(*dst, src_reg(ir->value.b[i])));
2139 break;
2140 default:
2141 assert(!"Non-float/uint/int/bool constant");
2142 break;
2143 }
2144
2145 remaining_writemask &= ~dst->writemask;
2146 }
2147 dst->reg_offset++;
2148 }
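/* For instance, emit_constant_values() turns vec4(0.5, 1.5, 1.5, 1.5) into
 * two MOVs (writemask .x for 0.5 and .yzw for 1.5) instead of four.
 */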
2149
2150 void
2151 vec4_visitor::visit(ir_constant *ir)
2152 {
2153 dst_reg dst = dst_reg(this, ir->type);
2154 this->result = src_reg(dst);
2155
2156 emit_constant_values(&dst, ir);
2157 }
2158
2159 void
2160 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2161 {
2162 ir_dereference *deref = static_cast<ir_dereference *>(
2163 ir->actual_parameters.get_head());
2164 ir_variable *location = deref->variable_referenced();
2165 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2166 location->atomic.buffer_index);
2167
2168 /* Calculate the surface offset */
2169 src_reg offset(this, glsl_type::uint_type);
2170 ir_dereference_array *deref_array = deref->as_dereference_array();
2171 if (deref_array) {
2172 deref_array->array_index->accept(this);
2173
2174 src_reg tmp(this, glsl_type::uint_type);
2175 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2176 emit(ADD(dst_reg(offset), tmp, location->atomic.offset));
2177 } else {
2178 offset = location->atomic.offset;
2179 }
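/* i.e. offset = array_index * ATOMIC_COUNTER_SIZE + location->atomic.offset
 * for an indexed counter, or just the counter's declared offset otherwise.
 */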
2180
2181 /* Emit the appropriate machine instruction */
2182 const char *callee = ir->callee->function_name();
2183 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2184
2185 if (!strcmp("__intrinsic_atomic_read", callee)) {
2186 emit_untyped_surface_read(surf_index, dst, offset);
2187
2188 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2189 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2190 src_reg(), src_reg());
2191
2192 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2193 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2194 src_reg(), src_reg());
2195 }
2196 }
2197
2198 void
2199 vec4_visitor::visit(ir_call *ir)
2200 {
2201 const char *callee = ir->callee->function_name();
2202
2203 if (!strcmp("__intrinsic_atomic_read", callee) ||
2204 !strcmp("__intrinsic_atomic_increment", callee) ||
2205 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2206 visit_atomic_counter_intrinsic(ir);
2207 } else {
2208 assert(!"Unsupported intrinsic.");
2209 }
2210 }
2211
2212 src_reg
2213 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2214 {
2215 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2216 inst->base_mrf = 2;
2217 inst->mlen = 1;
2218 inst->sampler = sampler;
2219 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2220 inst->dst.writemask = WRITEMASK_XYZW;
2221
2222 /* The parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2223 int param_base = inst->base_mrf;
2224 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2225 int zero_mask = 0xf & ~coord_mask;
2226
2227 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2228 coordinate));
2229
2230 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2231 src_reg(0)));
2232
2233 emit(inst);
2234 return src_reg(inst->dst);
2235 }
2236
2237 void
2238 vec4_visitor::visit(ir_texture *ir)
2239 {
2240 int sampler =
2241 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2242
2243 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2244 * emitting anything other than setting up the constant result.
2245 */
2246 if (ir->op == ir_tg4) {
2247 ir_constant *chan = ir->lod_info.component->as_constant();
2248 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2249 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2250 dst_reg result(this, ir->type);
2251 this->result = src_reg(result);
2252 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2253 return;
2254 }
2255 }
2256
2257 /* Should be lowered by do_lower_texture_projection */
2258 assert(!ir->projector);
2259
2260 /* Offset arrays (from textureGatherOffsets) should already be lowered. */
2261 assert(!ir->offset || !ir->offset->type->is_array());
2262
2263 /* Generate code to compute all the subexpression trees. This has to be
2264 * done before loading any values into MRFs for the sampler message since
2265 * generating these values may involve SEND messages that need the MRFs.
2266 */
2267 src_reg coordinate;
2268 if (ir->coordinate) {
2269 ir->coordinate->accept(this);
2270 coordinate = this->result;
2271 }
2272
2273 src_reg shadow_comparitor;
2274 if (ir->shadow_comparitor) {
2275 ir->shadow_comparitor->accept(this);
2276 shadow_comparitor = this->result;
2277 }
2278
2279 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2280 src_reg offset_value;
2281 if (has_nonconstant_offset) {
2282 ir->offset->accept(this);
2283 offset_value = src_reg(this->result);
2284 }
2285
2286 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2287 src_reg lod, dPdx, dPdy, sample_index, mcs;
2288 switch (ir->op) {
2289 case ir_tex:
2290 lod = src_reg(0.0f);
2291 lod_type = glsl_type::float_type;
2292 break;
2293 case ir_txf:
2294 case ir_txl:
2295 case ir_txs:
2296 ir->lod_info.lod->accept(this);
2297 lod = this->result;
2298 lod_type = ir->lod_info.lod->type;
2299 break;
2300 case ir_query_levels:
2301 lod = src_reg(0);
2302 lod_type = glsl_type::int_type;
2303 break;
2304 case ir_txf_ms:
2305 ir->lod_info.sample_index->accept(this);
2306 sample_index = this->result;
2307 sample_index_type = ir->lod_info.sample_index->type;
2308
2309 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2310 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2311 else
2312 mcs = src_reg(0u);
2313 break;
2314 case ir_txd:
2315 ir->lod_info.grad.dPdx->accept(this);
2316 dPdx = this->result;
2317
2318 ir->lod_info.grad.dPdy->accept(this);
2319 dPdy = this->result;
2320
2321 lod_type = ir->lod_info.grad.dPdx->type;
2322 break;
2323 case ir_txb:
2324 case ir_lod:
2325 case ir_tg4:
2326 break;
2327 }
2328
2329 vec4_instruction *inst = NULL;
2330 switch (ir->op) {
2331 case ir_tex:
2332 case ir_txl:
2333 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2334 break;
2335 case ir_txd:
2336 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2337 break;
2338 case ir_txf:
2339 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2340 break;
2341 case ir_txf_ms:
2342 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2343 break;
2344 case ir_txs:
2345 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2346 break;
2347 case ir_tg4:
2348 if (has_nonconstant_offset)
2349 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2350 else
2351 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2352 break;
2353 case ir_query_levels:
2354 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2355 break;
2356 case ir_txb:
2357 assert(!"TXB is not valid for vertex shaders.");
2358 break;
2359 case ir_lod:
2360 assert(!"LOD is not valid for vertex shaders.");
2361 break;
2362 default:
2363 assert(!"Unrecognized tex op");
2364 }
2365
2366 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2367
2368 /* Texel offsets and the tg4 channel select go in the message header; Gen4 always needs a header. */
2369 inst->header_present = use_texture_offset || brw->gen < 5 || ir->op == ir_tg4;
2370 inst->base_mrf = 2;
2371 inst->mlen = inst->header_present + 1; /* always at least one */
2372 inst->sampler = sampler;
2373 inst->dst = dst_reg(this, ir->type);
2374 inst->dst.writemask = WRITEMASK_XYZW;
2375 inst->shadow_compare = ir->shadow_comparitor != NULL;
2376
2377 if (use_texture_offset)
2378 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2379
2380 /* Stuff the channel select bits in the top of the texture offset */
2381 if (ir->op == ir_tg4)
2382 inst->texture_offset |= gather_channel(ir, sampler)<<16;
2383
2384 /* MRF for the first parameter */
2385 int param_base = inst->base_mrf + inst->header_present;
2386
2387 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2388 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2389 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2390 } else {
2391 /* Load the coordinate */
2392 /* FINISHME: gl_clamp_mask and saturate */
2393 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2394 int zero_mask = 0xf & ~coord_mask;
2395
2396 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2397 coordinate));
2398
2399 if (zero_mask != 0) {
2400 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2401 src_reg(0)));
2402 }
2403 /* Load the shadow comparitor */
2404 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2405 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2406 WRITEMASK_X),
2407 shadow_comparitor));
2408 inst->mlen++;
2409 }
2410
2411 /* Load the LOD info */
2412 if (ir->op == ir_tex || ir->op == ir_txl) {
2413 int mrf, writemask;
2414 if (brw->gen >= 5) {
2415 mrf = param_base + 1;
2416 if (ir->shadow_comparitor) {
2417 writemask = WRITEMASK_Y;
2418 /* mlen already incremented */
2419 } else {
2420 writemask = WRITEMASK_X;
2421 inst->mlen++;
2422 }
2423 } else /* brw->gen == 4 */ {
2424 mrf = param_base;
2425 writemask = WRITEMASK_W;
2426 }
2427 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2428 } else if (ir->op == ir_txf) {
2429 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2430 } else if (ir->op == ir_txf_ms) {
2431 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2432 sample_index));
2433 if (brw->gen >= 7)
2434 /* MCS data is in the first channel of `mcs`, but we need to get it into
2435 * the .y channel of the second vec4 of params, so replicate .x across
2436 * the whole vec4 and then mask off everything except .y
2437 */
2438 mcs.swizzle = BRW_SWIZZLE_XXXX;
2439 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2440 mcs));
2441 inst->mlen++;
2442 } else if (ir->op == ir_txd) {
2443 const glsl_type *type = lod_type;
2444
2445 if (brw->gen >= 5) {
2446 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2447 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2448 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2449 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2450 inst->mlen++;
2451
2452 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2453 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2454 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2455 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2456 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2457 inst->mlen++;
2458
2459 if (ir->shadow_comparitor) {
2460 emit(MOV(dst_reg(MRF, param_base + 2,
2461 ir->shadow_comparitor->type, WRITEMASK_Z),
2462 shadow_comparitor));
2463 }
2464 }
2465 } else /* brw->gen == 4 */ {
2466 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2467 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2468 inst->mlen += 2;
2469 }
2470 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2471 if (ir->shadow_comparitor) {
2472 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2473 shadow_comparitor));
2474 }
2475
2476 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2477 offset_value));
2478 inst->mlen++;
2479 }
2480 }
2481
2482 emit(inst);
2483
2484 /* Fix up the number of layers (.z) for cube map arrays: the hardware
2485 * returns faces * layers; the spec requires just the layer count.
2486 */
2487 if (ir->op == ir_txs) {
2488 glsl_type const *type = ir->sampler->type;
2489 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2490 type->sampler_array) {
2491 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2492 with_writemask(inst->dst, WRITEMASK_Z),
2493 src_reg(inst->dst), src_reg(6));
2494 }
2495 }
2496
2497 swizzle_result(ir, src_reg(inst->dst), sampler);
2498 }
2499
2500 /**
2501 * Set up the gather channel based on the swizzle, for gather4.
2502 */
2503 uint32_t
2504 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2505 {
2506 ir_constant *chan = ir->lod_info.component->as_constant();
2507 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2508 switch (swiz) {
2509 case SWIZZLE_X: return 0;
2510 case SWIZZLE_Y:
2511 /* gather4 sampler is broken for green channel on RG32F --
2512 * we must ask for blue instead.
2513 */
2514 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2515 return 2;
2516 return 1;
2517 case SWIZZLE_Z: return 2;
2518 case SWIZZLE_W: return 3;
2519 default:
2520 assert(!"Not reached"); /* zero, one swizzles handled already */
2521 return 0;
2522 }
2523 }
2524
2525 void
2526 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2527 {
2528 int s = key->tex.swizzles[sampler];
2529
2530 this->result = src_reg(this, ir->type);
2531 dst_reg swizzled_result(this->result);
2532
2533 if (ir->op == ir_query_levels) {
2534 /* # levels is in .w */
2535 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2536 emit(MOV(swizzled_result, orig_val));
2537 return;
2538 }
2539
2540 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2541 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2542 emit(MOV(swizzled_result, orig_val));
2543 return;
2544 }
2545
2546
2547 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2548 int swizzle[4] = {0};
2549
2550 for (int i = 0; i < 4; i++) {
2551 switch (GET_SWZ(s, i)) {
2552 case SWIZZLE_ZERO:
2553 zero_mask |= (1 << i);
2554 break;
2555 case SWIZZLE_ONE:
2556 one_mask |= (1 << i);
2557 break;
2558 default:
2559 copy_mask |= (1 << i);
2560 swizzle[i] = GET_SWZ(s, i);
2561 break;
2562 }
2563 }
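/* For example, a texture swizzle of (R, G, ZERO, ONE) produces
 * copy_mask = .xy (with an .xyxx source swizzle), zero_mask = .z and
 * one_mask = .w, so the code below emits three MOVs.
 */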
2564
2565 if (copy_mask) {
2566 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2567 swizzled_result.writemask = copy_mask;
2568 emit(MOV(swizzled_result, orig_val));
2569 }
2570
2571 if (zero_mask) {
2572 swizzled_result.writemask = zero_mask;
2573 emit(MOV(swizzled_result, src_reg(0.0f)));
2574 }
2575
2576 if (one_mask) {
2577 swizzled_result.writemask = one_mask;
2578 emit(MOV(swizzled_result, src_reg(1.0f)));
2579 }
2580 }
2581
2582 void
2583 vec4_visitor::visit(ir_return *ir)
2584 {
2585 assert(!"not reached");
2586 }
2587
2588 void
2589 vec4_visitor::visit(ir_discard *ir)
2590 {
2591 assert(!"not reached");
2592 }
2593
2594 void
2595 vec4_visitor::visit(ir_if *ir)
2596 {
2597 /* Don't point the annotation at the if statement itself, because then the
2598 * whole statement, including the then and else blocks, would get printed.
2599 */
2600 this->base_ir = ir->condition;
2601
2602 if (brw->gen == 6) {
2603 emit_if_gen6(ir);
2604 } else {
2605 uint32_t predicate;
2606 emit_bool_to_cond_code(ir->condition, &predicate);
2607 emit(IF(predicate));
2608 }
2609
2610 visit_instructions(&ir->then_instructions);
2611
2612 if (!ir->else_instructions.is_empty()) {
2613 this->base_ir = ir->condition;
2614 emit(BRW_OPCODE_ELSE);
2615
2616 visit_instructions(&ir->else_instructions);
2617 }
2618
2619 this->base_ir = ir->condition;
2620 emit(BRW_OPCODE_ENDIF);
2621 }
2622
2623 void
2624 vec4_visitor::visit(ir_emit_vertex *)
2625 {
2626 assert(!"not reached");
2627 }
2628
2629 void
2630 vec4_visitor::visit(ir_end_primitive *)
2631 {
2632 assert(!"not reached");
2633 }
2634
2635 void
2636 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2637 dst_reg dst, src_reg offset,
2638 src_reg src0, src_reg src1)
2639 {
2640 unsigned mlen = 0;
2641
2642 /* Set the atomic operation offset. */
2643 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2644 mlen++;
2645
2646 /* Set the atomic operation arguments. */
2647 if (src0.file != BAD_FILE) {
2648 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2649 mlen++;
2650 }
2651
2652 if (src1.file != BAD_FILE) {
2653 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2654 mlen++;
2655 }
2656
2657 /* Emit the instruction. Note that this maps to the normal SIMD8
2658 * untyped atomic message on Ivy Bridge, but that's OK because
2659 * unused channels will be masked out.
2660 */
2661 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2662 src_reg(atomic_op), src_reg(surf_index));
2663 inst->base_mrf = 0;
2664 inst->mlen = mlen;
2665 }
2666
2667 void
2668 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2669 src_reg offset)
2670 {
2671 /* Set the surface read offset. */
2672 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2673
2674 /* Emit the instruction. Note that this maps to the normal SIMD8
2675 * untyped surface read message, but that's OK because unused
2676 * channels will be masked out.
2677 */
2678 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2679 dst, src_reg(surf_index));
2680 inst->base_mrf = 0;
2681 inst->mlen = 1;
2682 }
2683
2684 void
2685 vec4_visitor::emit_ndc_computation()
2686 {
2687 /* Get the position */
2688 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2689
2690 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2691 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2692 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2693
2694 current_annotation = "NDC";
2695 dst_reg ndc_w = ndc;
2696 ndc_w.writemask = WRITEMASK_W;
2697 src_reg pos_w = pos;
2698 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2699 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2700
2701 dst_reg ndc_xyz = ndc;
2702 ndc_xyz.writemask = WRITEMASK_XYZ;
2703
2704 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2705 }
2706
2707 void
2708 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2709 {
2710 if (brw->gen < 6 &&
2711 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2712 key->userclip_active || brw->has_negative_rhw_bug)) {
2713 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2714 dst_reg header1_w = header1;
2715 header1_w.writemask = WRITEMASK_W;
2716
2717 emit(MOV(header1, 0u));
2718
2719 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2720 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2721
2722 current_annotation = "Point size";
2723 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2724 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2725 }
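/* The MUL/AND above packs the point size into an 11-bit field at bits
 * 8..18 of header1.w: scaling by 1 << 11 and keeping 0x7ff << 8 stores the
 * size in eighths (roughly a U8.3 fixed-point value).
 */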
2726
2727 if (key->userclip_active) {
2728 current_annotation = "Clipping flags";
2729 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2730 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2731
2732 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2733 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2734 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2735
2736 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2737 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2738 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2739 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2740 }
2741
2742 /* i965 clipping workaround:
2743 * 1) Test for a negative RHW (1/W).
2744 * 2) If it is negative,
2745 * set ndc = (0,0,0,0)
2746 * set ucp[6] = 1
2747 *
2748 * Later, clipping will detect ucp[6] and ensure the primitive is
2749 * clipped against all fixed planes.
2750 */
2751 if (brw->has_negative_rhw_bug) {
2752 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2753 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2754 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2755 vec4_instruction *inst;
2756 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2757 inst->predicate = BRW_PREDICATE_NORMAL;
2758 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2759 inst->predicate = BRW_PREDICATE_NORMAL;
2760 }
2761
2762 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2763 } else if (brw->gen < 6) {
2764 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2765 } else {
2766 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2767 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2768 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2769 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2770 }
2771 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2772 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2773 src_reg(output_reg[VARYING_SLOT_LAYER])));
2774 }
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2780 {
2781 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2782 *
2783 * "If a linked set of shaders forming the vertex stage contains no
2784 * static write to gl_ClipVertex or gl_ClipDistance, but the
2785 * application has requested clipping against user clip planes through
2786 * the API, then the coordinate written to gl_Position is used for
2787 * comparison against the user clip planes."
2788 *
2789 * This function is only called if the shader didn't write to
2790 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2791 * if the user wrote to it; otherwise we use gl_Position.
2792 */
2793 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2794 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2795 clip_vertex = VARYING_SLOT_POS;
2796 }
2797
2798 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2799 ++i) {
2800 reg.writemask = 1 << i;
2801 emit(DP4(reg,
2802 src_reg(output_reg[clip_vertex]),
2803 src_reg(this->userplane[i + offset])));
2804 }
2805 }
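/* emit_clip_distances() thus computes each enabled clip distance as the
 * 4-component dot product of the clip vertex (or gl_Position) with the
 * matching user plane, i.e. roughly
 * gl_ClipDistance[offset + i] = dot(clip_vertex, userplane[offset + i]).
 */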
2806
2807 void
2808 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2809 {
2810 assert (varying < VARYING_SLOT_MAX);
2811 reg.type = output_reg[varying].type;
2812 current_annotation = output_reg_annotation[varying];
2813 /* Copy the register, saturating if necessary */
2814 vec4_instruction *inst = emit(MOV(reg,
2815 src_reg(output_reg[varying])));
2816 if ((varying == VARYING_SLOT_COL0 ||
2817 varying == VARYING_SLOT_COL1 ||
2818 varying == VARYING_SLOT_BFC0 ||
2819 varying == VARYING_SLOT_BFC1) &&
2820 key->clamp_vertex_color) {
2821 inst->saturate = true;
2822 }
2823 }
2824
2825 void
2826 vec4_visitor::emit_urb_slot(int mrf, int varying)
2827 {
2828 struct brw_reg hw_reg = brw_message_reg(mrf);
2829 dst_reg reg = dst_reg(MRF, mrf);
2830 reg.type = BRW_REGISTER_TYPE_F;
2831
2832 switch (varying) {
2833 case VARYING_SLOT_PSIZ:
2834 /* PSIZ is always in slot 0, and is coupled with other flags. */
2835 current_annotation = "indices, point width, clip flags";
2836 emit_psiz_and_flags(hw_reg);
2837 break;
2838 case BRW_VARYING_SLOT_NDC:
2839 current_annotation = "NDC";
2840 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2841 break;
2842 case VARYING_SLOT_POS:
2843 current_annotation = "gl_Position";
2844 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2845 break;
2846 case VARYING_SLOT_EDGE:
2847 /* This slot is present when drawing unfilled polygons. We're supposed to
2848 * copy the edge flag from the user-provided vertex array
2849 * (glEdgeFlagPointer) if one is set, and otherwise from the current value
2850 * of that attribute (which starts as 1.0f). Clipping then uses this to
2851 * determine which edges should be drawn as wireframe.
2852 */
2853 current_annotation = "edge flag";
2854 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2855 glsl_type::float_type, WRITEMASK_XYZW))));
2856 break;
2857 case BRW_VARYING_SLOT_PAD:
2858 /* No need to write to this slot */
2859 break;
2860 default:
2861 emit_generic_urb_slot(reg, varying);
2862 break;
2863 }
2864 }
2865
2866 static int
2867 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2868 {
2869 if (brw->gen >= 6) {
2870 /* URB data written (does not include the message header reg) must
2871 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2872 * section 5.4.3.2.2: URB_INTERLEAVED.
2873 *
2874 * URB entries are allocated on a multiple of 1024 bits, so an
2875 * extra 128 bits written here to make the end align to 256 is
2876 * no problem.
2877 */
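/* mlen includes the message header register, so the data length is
 * mlen - 1; bumping an even mlen to odd keeps that data length even.
 */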
2878 if ((mlen % 2) != 1)
2879 mlen++;
2880 }
2881
2882 return mlen;
2883 }
2884
2885
2886 /**
2887 * Generates the VUE payload plus the necessary URB write instructions to
2888 * output it.
2889 *
2890 * The VUE layout is documented in Volume 2a.
2891 */
2892 void
2893 vec4_visitor::emit_vertex()
2894 {
2895 /* MRF 0 is reserved for the debugger, so start with message header
2896 * in MRF 1.
2897 */
2898 int base_mrf = 1;
2899 int mrf = base_mrf;
2900 /* In the process of generating our URB write message contents, we
2901 * may need to unspill a register or load from an array. Those
2902 * reads would use MRFs 14-15.
2903 */
2904 int max_usable_mrf = 13;
2905
2906 /* The following assertion verifies that max_usable_mrf causes an
2907 * even-numbered amount of URB write data, which will meet gen6's
2908 * requirements for length alignment.
2909 */
2910 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2911
2912 /* First mrf is the g0-based message header containing URB handles and
2913 * such.
2914 */
2915 emit_urb_write_header(mrf++);
2916
2917 if (brw->gen < 6) {
2918 emit_ndc_computation();
2919 }
2920
2921 /* Lower legacy ff and ClipVertex clipping to clip distances */
2922 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2923 current_annotation = "user clip distances";
2924
2925 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2926 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2927
2928 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2929 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2930 }
2931
2932 /* We may need to split this up into several URB writes, so do them in a
2933 * loop.
2934 */
2935 int slot = 0;
2936 bool complete = false;
2937 do {
2938 /* URB offset is in URB row increments, and each of our MRFs is half of
2939 * one of those, since we're doing interleaved writes.
2940 */
2941 int offset = slot / 2;
2942
2943 mrf = base_mrf + 1;
2944 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2945 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2946
2947 /* If this was max_usable_mrf, we can't fit anything more into this
2948 * URB WRITE.
2949 */
2950 if (mrf > max_usable_mrf) {
2951 slot++;
2952 break;
2953 }
2954 }
2955
2956 complete = slot >= prog_data->vue_map.num_slots;
2957 current_annotation = "URB write";
2958 vec4_instruction *inst = emit_urb_write_opcode(complete);
2959 inst->base_mrf = base_mrf;
2960 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2961 inst->offset += offset;
2962 } while(!complete);
2963 }
2964
2965
2966 src_reg
2967 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2968 src_reg *reladdr, int reg_offset)
2969 {
2970 /* Because we store the values to scratch interleaved like our
2971 * vertex data, we need to scale the vec4 index by 2.
2972 */
2973 int message_header_scale = 2;
2974
2975 /* Pre-gen6, the message header uses byte offsets instead of vec4
2976 * (16-byte) offset units.
2977 */
2978 if (brw->gen < 6)
2979 message_header_scale *= 16;
2980
2981 if (reladdr) {
2982 src_reg index = src_reg(this, glsl_type::int_type);
2983
2984 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2985 emit_before(inst, MUL(dst_reg(index),
2986 index, src_reg(message_header_scale)));
2987
2988 return index;
2989 } else {
2990 return src_reg(reg_offset * message_header_scale);
2991 }
2992 }
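/* get_scratch_offset() thus yields index = (reladdr + reg_offset) * 2 vec4
 * rows (times a further 16 to reach byte units before Gen6) when a reladdr
 * is present, and the equivalent scaled constant otherwise.
 */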
2993
2994 src_reg
2995 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2996 src_reg *reladdr, int reg_offset)
2997 {
2998 if (reladdr) {
2999 src_reg index = src_reg(this, glsl_type::int_type);
3000
3001 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3002
3003 /* Pre-gen6, the message header uses byte offsets instead of vec4
3004 * (16-byte) offset units.
3005 */
3006 if (brw->gen < 6) {
3007 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3008 }
3009
3010 return index;
3011 } else if (brw->gen >= 8) {
3012 /* Store the offset in a GRF so we can send-from-GRF. */
3013 src_reg offset = src_reg(this, glsl_type::int_type);
3014 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3015 return offset;
3016 } else {
3017 int message_header_scale = brw->gen < 6 ? 16 : 1;
3018 return src_reg(reg_offset * message_header_scale);
3019 }
3020 }
3021
3022 /**
3023 * Emits an instruction before @inst to load the value named by @orig_src
3024 * from scratch space at @base_offset to @temp.
3025 *
3026 * @base_offset is measured in 32-byte units (the size of a register).
3027 */
3028 void
3029 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3030 dst_reg temp, src_reg orig_src,
3031 int base_offset)
3032 {
3033 int reg_offset = base_offset + orig_src.reg_offset;
3034 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3035
3036 emit_before(inst, SCRATCH_READ(temp, index));
3037 }
3038
3039 /**
3040 * Emits an instruction after @inst to store the value to be written
3041 * to @orig_dst to scratch space at @base_offset, from @temp.
3042 *
3043 * @base_offset is measured in 32-byte units (the size of a register).
3044 */
3045 void
3046 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3047 {
3048 int reg_offset = base_offset + inst->dst.reg_offset;
3049 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3050
3051 /* Create a temporary register to store *inst's result in.
3052 *
3053 * We have to be careful in MOVing from our temporary result register in
3054 * the scratch write. If we swizzle from channels of the temporary that
3055 * weren't initialized, it will confuse live interval analysis, which will
3056 * make spilling fail to make progress.
3057 */
3058 src_reg temp = src_reg(this, glsl_type::vec4_type);
3059 temp.type = inst->dst.type;
3060 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3061 int swizzles[4];
3062 for (int i = 0; i < 4; i++)
3063 if (inst->dst.writemask & (1 << i))
3064 swizzles[i] = i;
3065 else
3066 swizzles[i] = first_writemask_chan;
3067 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3068 swizzles[2], swizzles[3]);
3069
3070 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3071 inst->dst.writemask));
3072 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3073 write->predicate = inst->predicate;
3074 write->ir = inst->ir;
3075 write->annotation = inst->annotation;
3076 inst->insert_after(write);
3077
3078 inst->dst.file = temp.file;
3079 inst->dst.reg = temp.reg;
3080 inst->dst.reg_offset = temp.reg_offset;
3081 inst->dst.reladdr = NULL;
3082 }
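/* For instance, when emit_scratch_write() rewrites an instruction that only
 * wrote .y, the temporary picks up a .yyyy swizzle, so the scratch write
 * never reads its uninitialized .xzw channels.
 */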
3083
3084 /**
3085 * We can't generally support array access in GRF space, because a
3086 * single instruction's destination can only span 2 contiguous
3087 * registers. So, we send all GRF arrays that get variable index
3088 * access to scratch space.
3089 */
3090 void
3091 vec4_visitor::move_grf_array_access_to_scratch()
3092 {
3093 int scratch_loc[this->virtual_grf_count];
3094
3095 for (int i = 0; i < this->virtual_grf_count; i++) {
3096 scratch_loc[i] = -1;
3097 }
3098
3099 /* First, calculate the set of virtual GRFs that need to be punted
3100 * to scratch due to having any array access on them, and where in
3101 * scratch.
3102 */
3103 foreach_list(node, &this->instructions) {
3104 vec4_instruction *inst = (vec4_instruction *)node;
3105
3106 if (inst->dst.file == GRF && inst->dst.reladdr &&
3107 scratch_loc[inst->dst.reg] == -1) {
3108 scratch_loc[inst->dst.reg] = c->last_scratch;
3109 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3110 }
3111
3112 for (int i = 0 ; i < 3; i++) {
3113 src_reg *src = &inst->src[i];
3114
3115 if (src->file == GRF && src->reladdr &&
3116 scratch_loc[src->reg] == -1) {
3117 scratch_loc[src->reg] = c->last_scratch;
3118 c->last_scratch += this->virtual_grf_sizes[src->reg];
3119 }
3120 }
3121 }
3122
3123 /* Now, for anything that will be accessed through scratch, rewrite
3124 * it to load/store. Note that this is a _safe list walk, because
3125 * we may generate a new scratch_write instruction after the one
3126 * we're processing.
3127 */
3128 foreach_list_safe(node, &this->instructions) {
3129 vec4_instruction *inst = (vec4_instruction *)node;
3130
3131 /* Set up the annotation tracking for new generated instructions. */
3132 base_ir = inst->ir;
3133 current_annotation = inst->annotation;
3134
3135 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3136 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3137 }
3138
3139 for (int i = 0 ; i < 3; i++) {
3140 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3141 continue;
3142
3143 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3144
3145 emit_scratch_read(inst, temp, inst->src[i],
3146 scratch_loc[inst->src[i].reg]);
3147
3148 inst->src[i].file = temp.file;
3149 inst->src[i].reg = temp.reg;
3150 inst->src[i].reg_offset = temp.reg_offset;
3151 inst->src[i].reladdr = NULL;
3152 }
3153 }
3154 }
3155
3156 /**
3157 * Emits an instruction before @inst to load the value named by @orig_src
3158 * from the pull constant buffer (surface) at @base_offset to @temp.
3159 */
3160 void
3161 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3162 dst_reg temp, src_reg orig_src,
3163 int base_offset)
3164 {
3165 int reg_offset = base_offset + orig_src.reg_offset;
3166 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3167 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3168 vec4_instruction *load;
3169
3170 if (brw->gen >= 7) {
3171 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3172 grf_offset.type = offset.type;
3173 emit_before(inst, MOV(grf_offset, offset));
3174
3175 load = new(mem_ctx) vec4_instruction(this,
3176 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3177 temp, index, src_reg(grf_offset));
3178 } else {
3179 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3180 temp, index, offset);
3181 load->base_mrf = 14;
3182 load->mlen = 1;
3183 }
3184 emit_before(inst, load);
3185 }
3186
3187 /**
3188 * Implements array access of uniforms by inserting a
3189 * PULL_CONSTANT_LOAD instruction.
3190 *
3191 * Unlike temporary GRF array access (where we don't support it due to
3192 * the difficulty of doing relative addressing on instruction
3193 * destinations), we could potentially do array access of uniforms
3194 * that were loaded in GRF space as push constants. In real-world
3195 * usage we've seen, though, the arrays being accessed are always larger
3196 * than we could load as push constants, so we simply move all uniform
3197 * array access out to the pull constant buffer.
3198 */
3199 void
3200 vec4_visitor::move_uniform_array_access_to_pull_constants()
3201 {
3202 int pull_constant_loc[this->uniforms];
3203
3204 for (int i = 0; i < this->uniforms; i++) {
3205 pull_constant_loc[i] = -1;
3206 }
3207
3208 /* Walk through and find array access of uniforms. Put a copy of that
3209 * uniform in the pull constant buffer.
3210 *
3211 * Note that we don't move constant-indexed accesses to arrays. No
3212 * testing has been done of the performance impact of this choice.
3213 */
3214 foreach_list_safe(node, &this->instructions) {
3215 vec4_instruction *inst = (vec4_instruction *)node;
3216
3217 for (int i = 0 ; i < 3; i++) {
3218 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3219 continue;
3220
3221 int uniform = inst->src[i].reg;
3222
3223 /* If this array isn't already present in the pull constant buffer,
3224 * add it.
3225 */
3226 if (pull_constant_loc[uniform] == -1) {
3227 const float **values = &prog_data->param[uniform * 4];
3228
3229 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3230
3231 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3232 prog_data->pull_param[prog_data->nr_pull_params++]
3233 = values[j];
3234 }
3235 }
3236
3237 /* Set up the annotation tracking for new generated instructions. */
3238 base_ir = inst->ir;
3239 current_annotation = inst->annotation;
3240
3241 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3242
3243 emit_pull_constant_load(inst, temp, inst->src[i],
3244 pull_constant_loc[uniform]);
3245
3246 inst->src[i].file = temp.file;
3247 inst->src[i].reg = temp.reg;
3248 inst->src[i].reg_offset = temp.reg_offset;
3249 inst->src[i].reladdr = NULL;
3250 }
3251 }
3252
3253 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3254 * no need to track them as larger-than-vec4 objects. This will be
3255 * relied on in cutting out unused uniform vectors from push
3256 * constants.
3257 */
3258 split_uniform_registers();
3259 }
3260
3261 void
3262 vec4_visitor::resolve_ud_negate(src_reg *reg)
3263 {
3264 if (reg->type != BRW_REGISTER_TYPE_UD ||
3265 !reg->negate)
3266 return;
3267
3268 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3269 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3270 *reg = temp;
3271 }
3272
3273 vec4_visitor::vec4_visitor(struct brw_context *brw,
3274 struct brw_vec4_compile *c,
3275 struct gl_program *prog,
3276 const struct brw_vec4_prog_key *key,
3277 struct brw_vec4_prog_data *prog_data,
3278 struct gl_shader_program *shader_prog,
3279 struct brw_shader *shader,
3280 void *mem_ctx,
3281 bool debug_flag,
3282 bool no_spills)
3283 : sanity_param_count(0),
3284 fail_msg(NULL),
3285 first_non_payload_grf(0),
3286 need_all_constants_in_pull_buffer(false),
3287 debug_flag(debug_flag),
3288 no_spills(no_spills)
3289 {
3290 this->brw = brw;
3291 this->ctx = &brw->ctx;
3292 this->shader_prog = shader_prog;
3293 this->shader = shader;
3294
3295 this->mem_ctx = mem_ctx;
3296 this->failed = false;
3297
3298 this->base_ir = NULL;
3299 this->current_annotation = NULL;
3300 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3301
3302 this->c = c;
3303 this->prog = prog;
3304 this->key = key;
3305 this->prog_data = prog_data;
3306 this->stage_prog_data = &prog_data->base;
3307
3308 this->variable_ht = hash_table_ctor(0,
3309 hash_table_pointer_hash,
3310 hash_table_pointer_compare);
3311
3312 this->virtual_grf_start = NULL;
3313 this->virtual_grf_end = NULL;
3314 this->virtual_grf_sizes = NULL;
3315 this->virtual_grf_count = 0;
3316 this->virtual_grf_reg_map = NULL;
3317 this->virtual_grf_reg_count = 0;
3318 this->virtual_grf_array_size = 0;
3319 this->live_intervals_valid = false;
3320
3321 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3322
3323 this->uniforms = 0;
3324 }
3325
3326 vec4_visitor::~vec4_visitor()
3327 {
3328 hash_table_dtor(this->variable_ht);
3329 }
3330
3331
3332 void
3333 vec4_visitor::fail(const char *format, ...)
3334 {
3335 va_list va;
3336 char *msg;
3337
3338 if (failed)
3339 return;
3340
3341 failed = true;
3342
3343 va_start(va, format);
3344 msg = ralloc_vasprintf(mem_ctx, format, va);
3345 va_end(va);
3346 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3347
3348 this->fail_msg = msg;
3349
3350 if (debug_flag) {
3351 fprintf(stderr, "%s", msg);
3352 }
3353 }
3354
3355 } /* namespace brw */