i965/vec4: Emit shader w/a for Gen6 gather
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
/* Construct a vec4 IR instruction.  All bookkeeping fields are reset to
 * benign defaults; the IR node and annotation are copied from the visitor's
 * current state so debug output can attribute this instruction to its
 * source.
 */
vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, dst_reg dst,
                                   src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->sampler = 0;
   this->texture_offset = 0;
   this->target = 0;
   this->shadow_compare = false;
   /* Inherit debug/annotation context from whatever the visitor is
    * currently translating.
    */
   this->ir = v->base_ir;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_present = false;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->annotation = v->current_annotation;
}
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
/* Helper macros that define vec4_visitor factory methods for simple one-,
 * two-, and three-source ALU opcodes.  Note that these construct the
 * instruction but do NOT emit it; callers pass the result to emit().
 */
#define ALU1(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0)				\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0);			\
   }

#define ALU2(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0, src1);			\
   }

#define ALU3(op)							\
   vec4_instruction *							\
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
   {									\
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
					   src0, src1, src2);		\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
{
   vec4_instruction *inst;

   /* Original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.  Retype the destination to the source type so the
    * comparison happens in the right type.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
	 dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
225
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
/* Lower ir_unop_pack_half_2x16 into F32TO16 plus shift/or, relying on the
 * (undocumented) gen7 behavior described below.
 */
void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   /* NOTE(review): this guard is assert-only; in a release (NDEBUG) build
    * a pre-gen7 caller would fall through — presumably unreachable because
    * the operation is lowered earlier for those parts; confirm.
    */
   if (brw->gen < 7)
      assert(!"ir_unop_pack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty.  Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, src_reg(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z         y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = SWIZZLE_Y;
   emit(SHL(dst, tmp_src, src_reg(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = SWIZZLE_X;
   emit(OR(dst, src_reg(dst), tmp_src));
}
486
/* Lower ir_unop_unpack_half_2x16 into two masked/shifted halves followed by
 * an F16TO32 conversion in align16 mode.
 */
void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   /* NOTE(review): assert-only guard; in NDEBUG builds a pre-gen7 caller
    * would fall through — presumably unreachable because the operation is
    * lowered earlier for those parts; confirm.
    */
   if (brw->gen < 7)
      assert(!"ir_unop_unpack_half_2x16 should be lowered");

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   /* tmp.x = low half, tmp.y = high half of the packed input. */
   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, src_reg(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, src_reg(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of size of vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_VOID:
574 case GLSL_TYPE_ERROR:
575 case GLSL_TYPE_INTERFACE:
576 assert(0);
577 break;
578 }
579
580 return 0;
581 }
582
583 int
584 vec4_visitor::virtual_grf_alloc(int size)
585 {
586 if (virtual_grf_array_size <= virtual_grf_count) {
587 if (virtual_grf_array_size == 0)
588 virtual_grf_array_size = 16;
589 else
590 virtual_grf_array_size *= 2;
591 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
592 virtual_grf_array_size);
593 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
594 virtual_grf_array_size);
595 }
596 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
597 virtual_grf_reg_count += size;
598 virtual_grf_sizes[virtual_grf_count] = size;
599 return virtual_grf_count++;
600 }
601
602 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
603 {
604 init();
605
606 this->file = GRF;
607 this->reg = v->virtual_grf_alloc(type_size(type));
608
609 if (type->is_array() || type->is_record()) {
610 this->swizzle = BRW_SWIZZLE_NOOP;
611 } else {
612 this->swizzle = swizzle_for_size(type->vector_elements);
613 }
614
615 this->type = brw_type_for_base_type(type);
616 }
617
618 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
619 {
620 init();
621
622 this->file = GRF;
623 this->reg = v->virtual_grf_alloc(type_size(type));
624
625 if (type->is_array() || type->is_record()) {
626 this->writemask = WRITEMASK_XYZW;
627 } else {
628 this->writemask = (1 << type->vector_elements) - 1;
629 }
630
631 this->type = brw_type_for_base_type(type);
632 }
633
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
vec4_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name" exactly, or "name." / "name[" prefixes (struct fields
       * and array elements of this uniform); skip everything else.
       */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
	  (storage->name[namelen] != 0 &&
	   storage->name[namelen] != '.' &&
	   storage->name[namelen] != '[')) {
	 continue;
      }

      gl_constant_value *components = storage->storage;
      /* One vec4 slot per array element per matrix column. */
      unsigned vector_count = (MAX2(storage->array_elements, 1) *
			       storage->type->matrix_columns);

      for (unsigned s = 0; s < vector_count; s++) {
	 uniform_vector_size[uniforms] = storage->type->vector_elements;

	 int i;
	 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
	    prog_data->param[uniforms * 4 + i] = &components->f;
	    components++;
	 }
	 /* Pad unused trailing channels with a pointer to a shared zero so
	  * every slot of param[] is valid to dereference.
	  */
	 for (; i < 4; i++) {
	    static float zero = 0;
	    prog_data->param[uniforms * 4 + i] = &zero;
	 }

	 uniforms++;
      }
   }
}
681
682 void
683 vec4_visitor::setup_uniform_clipplane_values()
684 {
685 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
686
687 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
688 this->uniform_vector_size[this->uniforms] = 4;
689 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
690 this->userplane[i].type = BRW_REGISTER_TYPE_F;
691 for (int j = 0; j < 4; ++j) {
692 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
693 }
694 ++this->uniforms;
695 }
696 }
697
698 /* Our support for builtin uniforms is even scarier than non-builtin.
699 * It sits on top of the PROG_STATE_VAR parameters that are
700 * automatically updated from GL context state.
701 */
702 void
703 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
704 {
705 const ir_state_slot *const slots = ir->state_slots;
706 assert(ir->state_slots != NULL);
707
708 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
709 /* This state reference has already been setup by ir_to_mesa,
710 * but we'll get the same index back here. We can reference
711 * ParameterValues directly, since unlike brw_fs.cpp, we never
712 * add new state references during compile.
713 */
714 int index = _mesa_add_state_reference(this->prog->Parameters,
715 (gl_state_index *)slots[i].tokens);
716 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
717
718 this->uniform_vector_size[this->uniforms] = 0;
719 /* Add each of the unique swizzled channels of the element.
720 * This will end up matching the size of the glsl_type of this field.
721 */
722 int last_swiz = -1;
723 for (unsigned int j = 0; j < 4; j++) {
724 int swiz = GET_SWZ(slots[i].swizzle, j);
725 last_swiz = swiz;
726
727 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
728 if (swiz <= last_swiz)
729 this->uniform_vector_size[this->uniforms]++;
730 }
731 this->uniforms++;
732 }
733 }
734
735 dst_reg *
736 vec4_visitor::variable_storage(ir_variable *var)
737 {
738 return (dst_reg *)hash_table_find(this->variable_ht, var);
739 }
740
/* Emit instructions that evaluate the boolean rvalue @ir into the flag
 * register, and store in *predicate how a subsequent predicated instruction
 * should interpret those flags (NORMAL, or an ALL4H/ANY4H reduction).
 */
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   /* If the condition is an expression we recognize, fold the comparison
    * into the flag-setting instruction directly.
    */
   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 expr->operands[i]->accept(this);
	 op[i] = this->result;

	 resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 /* Test the low bit and predicate on it being zero. */
	 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(XOR(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(OR(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(AND(dst_null_d(), op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (brw->gen >= 6) {
	    emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
	 } else {
	    inst = emit(MOV(dst_null_f(), op[0]));
	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 }
	 break;

      case ir_unop_i2b:
	 if (brw->gen >= 6) {
	    emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 } else {
	    inst = emit(MOV(dst_null_d(), op[0]));
	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 }
	 break;

      case ir_binop_all_equal:
	 /* All four channels must compare equal. */
	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
	 break;

      case ir_binop_any_nequal:
	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
	 break;

      case ir_unop_any:
	 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
	 emit(CMP(dst_null_d(), op[0], op[1],
		  brw_conditional_for_comparison(expr->operation)));
	 break;

      default:
	 assert(!"not reached");
	 break;
      }
      return;
   }

   /* Fallback: evaluate the rvalue and test its low bit (or whole value on
    * pre-gen6, where a MOV with conditional mod is used instead).
    */
   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (brw->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
					this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
844
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   /* When the condition is a recognized expression, fold it directly into
    * the IF's embedded comparison instead of materializing a boolean.
    */
   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
	 return;

      case ir_binop_logic_xor:
	 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_logic_or:
	 /* OR/AND need an explicit combine first; IF only takes two
	  * sources.
	  */
	 temp = dst_reg(this, glsl_type::bool_type);
	 emit(OR(temp, op[0], op[1]));
	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_logic_and:
	 temp = dst_reg(this, glsl_type::bool_type);
	 emit(AND(temp, op[0], op[1]));
	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_unop_f2b:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_unop_i2b:
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
	 emit(IF(op[0], op[1],
		 brw_conditional_for_comparison(expr->operation)));
	 return;

      case ir_binop_all_equal:
	 /* Vector comparisons need a CMP plus a predicated IF reducing the
	  * per-channel flags (ALL4H = all channels, ANY4H = any channel).
	  */
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
	 return;

      case ir_binop_any_nequal:
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
	 return;

      case ir_unop_any:
	 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
	 return;

      default:
	 assert(!"not reached");
	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
	 return;
      }
      return;
   }

   /* Otherwise evaluate the condition to a value and test it for nonzero. */
   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}
930
931 dst_reg
932 with_writemask(dst_reg const & r, int mask)
933 {
934 dst_reg result = r;
935 result.writemask = mask;
936 return result;
937 }
938
939
/* Allocate storage for an IR variable according to its mode and record the
 * mapping so later dereferences can find it via variable_storage().
 */
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   /* Already assigned storage on a previous visit. */
   if (variable_storage(ir))
      return;

   switch (ir->data.mode) {
   case ir_var_shader_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
      break;

   case ir_var_shader_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      /* Record one output_reg entry per vec4 slot the type occupies, all
       * referring into the same GRF at successive offsets.
       */
      for (int i = 0; i < type_size(ir->type); i++) {
	 output_reg[ir->data.location + i] = *reg;
	 output_reg[ir->data.location + i].reg_offset = i;
	 output_reg[ir->data.location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
	 output_reg_annotation[ir->data.location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      this->uniform_size[this->uniforms] = type_size(ir->type);

      /* "gl_"-prefixed uniforms are driver builtins backed by GL state. */
      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir);
      }
      break;

   case ir_var_system_value:
      reg = make_reg_for_system_value(ir);
      break;

   default:
      assert(!"not reached");
   }

   /* Remember the storage so variable_storage(ir) finds it later. */
   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}
1006
/* Translate a loop into the EU's structured DO/WHILE control flow,
 * visiting the body instructions in between.
 */
void
vec4_visitor::visit(ir_loop *ir)
{
   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   emit(BRW_OPCODE_DO);

   visit_instructions(&ir->body_instructions);

   emit(BRW_OPCODE_WHILE);
}
1021
1022 void
1023 vec4_visitor::visit(ir_loop_jump *ir)
1024 {
1025 switch (ir->mode) {
1026 case ir_loop_jump::jump_break:
1027 emit(BRW_OPCODE_BREAK);
1028 break;
1029 case ir_loop_jump::jump_continue:
1030 emit(BRW_OPCODE_CONTINUE);
1031 break;
1032 }
1033 }
1034
1035
1036 void
1037 vec4_visitor::visit(ir_function_signature *ir)
1038 {
1039 assert(0);
1040 (void)ir;
1041 }
1042
1043 void
1044 vec4_visitor::visit(ir_function *ir)
1045 {
1046 /* Ignore function bodies other than main() -- we shouldn't see calls to
1047 * them since they should all be inlined.
1048 */
1049 if (strcmp(ir->name, "main") == 0) {
1050 const ir_function_signature *sig;
1051 exec_list empty;
1052
1053 sig = ir->matching_signature(NULL, &empty);
1054
1055 assert(sig);
1056
1057 visit_instructions(&sig->body);
1058 }
1059 }
1060
1061 bool
1062 vec4_visitor::try_emit_sat(ir_expression *ir)
1063 {
1064 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1065 if (!sat_src)
1066 return false;
1067
1068 sat_src->accept(this);
1069 src_reg src = this->result;
1070
1071 this->result = src_reg(this, ir->type);
1072 vec4_instruction *inst;
1073 inst = emit(MOV(dst_reg(this->result), src));
1074 inst->saturate = true;
1075
1076 return true;
1077 }
1078
1079 bool
1080 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1081 {
1082 /* 3-src instructions were introduced in gen6. */
1083 if (brw->gen < 6)
1084 return false;
1085
1086 /* MAD can only handle floating-point data. */
1087 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1088 return false;
1089
1090 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1091 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1092
1093 if (!mul || mul->operation != ir_binop_mul)
1094 return false;
1095
1096 nonmul->accept(this);
1097 src_reg src0 = fix_3src_operand(this->result);
1098
1099 mul->operands[0]->accept(this);
1100 src_reg src1 = fix_3src_operand(this->result);
1101
1102 mul->operands[1]->accept(this);
1103 src_reg src2 = fix_3src_operand(this->result);
1104
1105 this->result = src_reg(this, ir->type);
1106 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1107
1108 return true;
1109 }
1110
1111 void
1112 vec4_visitor::emit_bool_comparison(unsigned int op,
1113 dst_reg dst, src_reg src0, src_reg src1)
1114 {
1115 /* original gen4 does destination conversion before comparison. */
1116 if (brw->gen < 5)
1117 dst.type = src0.type;
1118
1119 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1120
1121 dst.type = BRW_REGISTER_TYPE_D;
1122 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1123 }
1124
1125 void
1126 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1127 src_reg src0, src_reg src1)
1128 {
1129 vec4_instruction *inst;
1130
1131 if (brw->gen >= 6) {
1132 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1133 inst->conditional_mod = conditionalmod;
1134 } else {
1135 emit(CMP(dst, src0, src1, conditionalmod));
1136
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->predicate = BRW_PREDICATE_NORMAL;
1139 }
1140 }
1141
1142 static bool
1143 is_16bit_constant(ir_rvalue *rvalue)
1144 {
1145 ir_constant *constant = rvalue->as_constant();
1146 if (!constant)
1147 return false;
1148
1149 if (constant->type != glsl_type::int_type &&
1150 constant->type != glsl_type::uint_type)
1151 return false;
1152
1153 return constant->value.u[0] < (1 << 16);
1154 }
1155
/**
 * Emit vec4 instructions for a GLSL IR expression tree node.
 *
 * Each operand is evaluated by recursive accept() and captured from
 * this->result; the big switch then lowers the operation to one or more
 * hardware instructions, writing into a freshly allocated temporary whose
 * writemask is limited to the result's channel count.  Operations that the
 * GLSL IR lowering passes should have removed trip asserts.
 */
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   /* Fold a saturate() wrapper into the inner expression if possible. */
   if (try_emit_sat(ir))
      return;

   /* Fuse mul+add into MAD when one operand of the add is a multiply. */
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
	 return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->print();
	 exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   /* NOTE(review): vector_elements is computed here but never read later in
    * this function — looks vestigial; confirm before removing.
    */
   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(result_dst, op[0], src_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(result_dst, op[0]));
      break;

   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x80000000u)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          *  Predicated OR sets 1 if val is positive.
          */
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(result_dst, op[0], src_reg(31)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdy:
      assert(!"derivatives not valid in vertex shader");
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(result_dst, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(result_dst, op[0]));
      break;
   case ir_unop_find_msb: {
      src_reg temp = src_reg(this, glsl_type::uint_type);

      inst = emit(FBH(dst_reg(temp), op[0]));
      inst->dst.writemask = WRITEMASK_XYZW;

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      temp.swizzle = BRW_SWIZZLE_NOOP;
      emit(MOV(result_dst, temp));

      src_reg src_tmp = src_reg(result_dst);
      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));

      /* Predicated 31 - result, skipped when FBH returned the error value. */
      src_tmp.negate = true;
      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
   case ir_unop_find_lsb:
      emit(FBL(result_dst, op[0]));
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
	 /* For integer multiplication, the MUL uses the low 16 bits of one of
	  * the operands (src0 through SNB, src1 on IVB and later).  The MACH
	  * accumulates in the contribution of the upper 16 bits of that
	  * operand. If we can determine that one of the args is in the low
	  * 16 bits, though, we can just emit a single MUL.
	  */
	 if (is_16bit_constant(ir->operands[0])) {
	    if (brw->gen < 7)
	       emit(MUL(result_dst, op[0], op[1]));
	    else
	       emit(MUL(result_dst, op[1], op[0]));
	 } else if (is_16bit_constant(ir->operands[1])) {
	    if (brw->gen < 7)
	       emit(MUL(result_dst, op[1], op[0]));
	    else
	       emit(MUL(result_dst, op[0], op[1]));
	 } else {
	    /* Full 32-bit multiply: MUL+MACH pair through the accumulator. */
	    struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

	    emit(MUL(acc, op[0], op[1]));
	    emit(MACH(dst_null_d(), op[0], op[1]));
	    emit(MOV(result_dst, src_reg(acc)));
	 }
      } else {
	 emit(MUL(result_dst, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(result_dst, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_carry: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      /* CMP writes all-ones on true; AND with 1 canonicalizes to 0/1. */
      emit(CMP(result_dst, op[0], op[1],
	       brw_conditional_for_comparison(ir->operation)));
      emit(AND(result_dst, result_src, src_reg(0x1)));
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
	  ir->operands[1]->type->is_vector()) {
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(MOV(result_dst, src_reg(0)));
	 inst = emit(MOV(result_dst, src_reg(1)));
	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
	 emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
	  ir->operands[1]->type->is_vector()) {
	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

	 emit(MOV(result_dst, src_reg(0)));
	 inst = emit(MOV(result_dst, src_reg(1)));
	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
	 emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(1)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   /* Bitcasts are free: just retype the source and forward it. */
   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_bitcast_f2i:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_D;
      break;

   case ir_unop_bitcast_f2u:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_UD;
      break;

   /* Value conversions are handled by the type-converting MOV. */
   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b: {
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      emit(AND(result_dst, result_src, src_reg(1)));
      break;
   }

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      /* ceil(x) = -floor(-x): negate in, round down, negate the result. */
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
      break;
   case ir_binop_max:
      emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(SHL(result_dst, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
	 inst = emit(ASR(result_dst, op[0], op[1]));
      else
	 inst = emit(SHR(result_dst, op[0], op[1]));
      break;

   case ir_binop_bfm:
      emit(BFI1(result_dst, op[0], op[1]));
      break;

   case ir_binop_ubo_load: {
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
      src_reg offset;

      /* Now, load the vector from that offset. */
      assert(ir->type->is_vector() || ir->type->is_scalar());

      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
      packed_consts.type = result.type;
      src_reg surf_index =
         src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
      if (const_offset_ir) {
         if (brw->gen >= 8) {
            /* Store the offset in a GRF so we can send-from-GRF. */
            offset = src_reg(this, glsl_type::int_type);
            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
         } else {
            /* Immediates are fine on older generations since they'll be moved
             * to a (potentially fake) MRF at the generator level.
             */
            offset = src_reg(const_offset / 16);
         }
      } else {
         offset = src_reg(this, glsl_type::uint_type);
         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
      }

      vec4_instruction *pull =
	 emit(new(mem_ctx) vec4_instruction(this,
					    VS_OPCODE_PULL_CONSTANT_LOAD,
					    dst_reg(packed_consts),
					    surf_index,
					    offset));
      pull->base_mrf = 14;
      pull->mlen = 1;

      /* Pick the dwords of the fetched vec4 the scalar/vector lives in. */
      packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4);

      /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
      if (ir->type->base_type == GLSL_TYPE_BOOL) {
	 emit(CMP(result_dst, packed_consts, src_reg(0u),
		  BRW_CONDITIONAL_NZ));
	 emit(AND(result_dst, result, src_reg(0x1)));
      } else {
	 emit(MOV(result_dst, packed_consts));
      }
      break;
   }

   case ir_binop_vector_extract:
      assert(!"should have been lowered by vec_index_to_cond_assign");
      break;

   case ir_triop_fma:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(LRP(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_csel:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_triop_bfi:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      emit(BFI2(result_dst, op[0], op[1], op[2]));
      break;

   case ir_triop_bitfield_extract:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_vector_insert:
      assert(!"should have been lowered by lower_vector_insert");
      break;

   case ir_quadop_bitfield_insert:
      assert(!"not reached: should be handled by "
              "bitfield_insert_to_bfm_bfi\n");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_pack_half_2x16:
      emit_pack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_unpack_half_2x16:
      emit_unpack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
      assert(!"not reached: should be handled by lower_packing_builtins");
      break;
   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
      assert(!"not reached: should not occur in vertex shader");
      break;
   case ir_binop_ldexp:
      assert(!"not reached: should be handled by ldexp_to_arith()");
      break;
   }
}
1696
1697
1698 void
1699 vec4_visitor::visit(ir_swizzle *ir)
1700 {
1701 src_reg src;
1702 int i = 0;
1703 int swizzle[4];
1704
1705 /* Note that this is only swizzles in expressions, not those on the left
1706 * hand side of an assignment, which do write masking. See ir_assignment
1707 * for that.
1708 */
1709
1710 ir->val->accept(this);
1711 src = this->result;
1712 assert(src.file != BAD_FILE);
1713
1714 for (i = 0; i < ir->type->vector_elements; i++) {
1715 switch (i) {
1716 case 0:
1717 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1718 break;
1719 case 1:
1720 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1721 break;
1722 case 2:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1724 break;
1725 case 3:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1727 break;
1728 }
1729 }
1730 for (; i < 4; i++) {
1731 /* Replicate the last channel out. */
1732 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1733 }
1734
1735 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1736
1737 this->result = src;
1738 }
1739
1740 void
1741 vec4_visitor::visit(ir_dereference_variable *ir)
1742 {
1743 const struct glsl_type *type = ir->type;
1744 dst_reg *reg = variable_storage(ir->var);
1745
1746 if (!reg) {
1747 fail("Failed to find variable storage for %s\n", ir->var->name);
1748 this->result = src_reg(brw_null_reg());
1749 return;
1750 }
1751
1752 this->result = src_reg(*reg);
1753
1754 /* System values get their swizzle from the dst_reg writemask */
1755 if (ir->var->data.mode == ir_var_system_value)
1756 return;
1757
1758 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1759 this->result.swizzle = swizzle_for_size(type->vector_elements);
1760 }
1761
1762
1763 int
1764 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1765 {
1766 /* Under normal circumstances array elements are stored consecutively, so
1767 * the stride is equal to the size of the array element.
1768 */
1769 return type_size(ir->type);
1770 }
1771
1772
/**
 * Visit an array dereference in rvalue position.
 *
 * A constant index is folded directly into reg_offset; a variable index is
 * materialized as a reladdr register, chained additively onto any reladdr
 * already present on the base.
 */
void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int array_stride = compute_array_stride(ir);

   /* Try to fold the index to a compile-time constant first. */
   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * array_stride;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (array_stride == 1) {
	 index_reg = this->result;
      } else {
	 /* Scale the element index up to a register offset. */
	 index_reg = src_reg(this, glsl_type::int_type);

	 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
      }

      /* If the base already carried a relative offset, add ours to it. */
      if (src.reladdr) {
	 src_reg temp = src_reg(this, glsl_type::int_type);

	 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

	 index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}
1825
1826 void
1827 vec4_visitor::visit(ir_dereference_record *ir)
1828 {
1829 unsigned int i;
1830 const glsl_type *struct_type = ir->record->type;
1831 int offset = 0;
1832
1833 ir->record->accept(this);
1834
1835 for (i = 0; i < struct_type->length; i++) {
1836 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1837 break;
1838 offset += type_size(struct_type->fields.structure[i].type);
1839 }
1840
1841 /* If the type is smaller than a vec4, replicate the last channel out. */
1842 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1843 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1844 else
1845 this->result.swizzle = BRW_SWIZZLE_NOOP;
1846 this->result.type = brw_type_for_base_type(ir->type);
1847
1848 this->result.reg_offset += offset;
1849 }
1850
1851 /**
1852 * We want to be careful in assignment setup to hit the actual storage
1853 * instead of potentially using a temporary like we might with the
1854 * ir_dereference handler.
1855 */
1856 static dst_reg
1857 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1858 {
1859 /* The LHS must be a dereference. If the LHS is a variable indexed array
1860 * access of a vector, it must be separated into a series conditional moves
1861 * before reaching this point (see ir_vec_index_to_cond_assign).
1862 */
1863 assert(ir->as_dereference());
1864 ir_dereference_array *deref_array = ir->as_dereference_array();
1865 if (deref_array) {
1866 assert(!deref_array->array->type->is_vector());
1867 }
1868
1869 /* Use the rvalue deref handler for the most part. We'll ignore
1870 * swizzles in it and write swizzles using writemask, though.
1871 */
1872 ir->accept(v);
1873 return dst_reg(v->result);
1874 }
1875
/**
 * Emit MOVs copying a whole aggregate value from *src to *dst.
 *
 * Recurses through structs, arrays, and matrices down to scalar/vector
 * leaves, emitting one (optionally predicated) MOV per vec4-sized chunk.
 * Advances dst->reg_offset and src->reg_offset as it goes, so on return
 * the caller's registers point just past the moved data.
 */
void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
			      const struct glsl_type *type, uint32_t predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      /* Move a matrix one float column-vector at a time. */
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
					 type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
	 emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   /* Only write the channels this leaf type actually has. */
   dst->writemask = (1 << type->vector_elements) - 1;

   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
1921
1922
/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
/**
 * \param ir            the assignment being lowered
 * \param dst           resolved LHS storage (writemask already set)
 * \param src           the RHS result, with its final swizzle applied
 * \param pre_rhs_inst  last instruction emitted before the RHS was visited
 * \param last_rhs_inst last instruction emitted while visiting the RHS
 * \return true if last_rhs_inst was retargeted at dst (no MOV needed)
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
				     dst_reg dst,
				     src_reg src,
				     vec4_instruction *pre_rhs_inst,
				     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that that last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */

   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
	    return false;

	 if (BRW_GET_SWZ(src.swizzle, i) != i)
	    return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   /* Narrow the write to only the channels the assignment touches. */
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
1979
/**
 * Lower an assignment: aggregates go through emit_block_move, while
 * scalars/vectors get their RHS swizzled into the written channels and
 * either folded into the last RHS instruction or copied with predicated
 * MOVs.
 */
void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   uint32_t predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      /* Aggregate (struct/array/matrix) assignment: block move. */
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
	 emit_bool_to_cond_code(ir->condition, &predicate);
      }

      /* emit_block_move doesn't account for swizzles in the source register.
       * This should be ok, since the source register is a structure or an
       * array, and those can't be swizzled.  But double-check to be sure.
       */
      assert(src.swizzle ==
             (ir->rhs->type->is_matrix()
              ? swizzle_for_size(ir->rhs->type->vector_elements)
              : BRW_SWIZZLE_NOOP));

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   /* Bracket the RHS visit so we can tell which instructions it emitted. */
   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
	  ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   /* Remember the source channel feeding the first written channel, to
    * fill the unwritten slots of the swizzle below.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
	 break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
	 swizzles[i] = first_enabled_chan;
   }
   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
			      swizzles[2], swizzles[3]);

   /* If the RHS result can be written straight into dst, we're done. */
   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}
2066
/**
 * Emit immediate MOVs that materialize the constant *ir into *dst.
 *
 * Recurses through structs, arrays, and matrices; for scalar/vector leaves
 * it coalesces channels that hold the same value into a single writemasked
 * MOV.  Advances dst->reg_offset past each vec4-sized chunk written.
 */
void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_list(node, &ir->components) {
	 ir_constant *field_value = (ir_constant *)node;

	 emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
	 emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      /* Matrices are always float; write one column per register. */
      for (int i = 0; i < ir->type->matrix_columns; i++) {
	 float *vec = &ir->value.f[i * ir->type->vector_elements];

	 for (int j = 0; j < ir->type->vector_elements; j++) {
	    dst->writemask = 1 << j;
	    dst->type = BRW_REGISTER_TYPE_F;

	    emit(MOV(*dst, src_reg(vec[j])));
	 }
	 dst->reg_offset++;
      }
      return;
   }

   int remaining_writemask = (1 << ir->type->vector_elements) - 1;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      /* Skip channels already covered by an earlier coalesced MOV. */
      if (!(remaining_writemask & (1 << i)))
	 continue;

      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      /* Find other components that match the one we're about to
       * write.  Emits fewer instructions for things like vec4(0.5,
       * 1.5, 1.5, 1.5).
       */
      for (int j = i + 1; j < ir->type->vector_elements; j++) {
	 if (ir->type->base_type == GLSL_TYPE_BOOL) {
	    if (ir->value.b[i] == ir->value.b[j])
	       dst->writemask |= (1 << j);
	 } else {
	    /* u, i, and f storage all line up, so no need for a
	     * switch case for comparing each type.
	     */
	    if (ir->value.u[i] == ir->value.u[j])
	       dst->writemask |= (1 << j);
	 }
      }

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
	 emit(MOV(*dst, src_reg(ir->value.f[i])));
	 break;
      case GLSL_TYPE_INT:
	 emit(MOV(*dst, src_reg(ir->value.i[i])));
	 break;
      case GLSL_TYPE_UINT:
	 emit(MOV(*dst, src_reg(ir->value.u[i])));
	 break;
      case GLSL_TYPE_BOOL:
	 emit(MOV(*dst, src_reg(ir->value.b[i])));
	 break;
      default:
	 assert(!"Non-float/uint/int/bool constant");
	 break;
      }

      remaining_writemask &= ~dst->writemask;
   }
   dst->reg_offset++;
}
2149
2150 void
2151 vec4_visitor::visit(ir_constant *ir)
2152 {
2153 dst_reg dst = dst_reg(this, ir->type);
2154 this->result = src_reg(dst);
2155
2156 emit_constant_values(&dst, ir);
2157 }
2158
/**
 * Lower one of the atomic-counter built-in calls to surface messages.
 *
 * The first actual parameter is the counter dereference; its variable
 * carries the binding-table buffer index and byte offset for the counter.
 */
void
vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
   ir_dereference *deref = static_cast<ir_dereference *>(
      ir->actual_parameters.get_head());
   ir_variable *location = deref->variable_referenced();
   unsigned surf_index = (prog_data->base.binding_table.abo_start +
                          location->data.atomic.buffer_index);

   /* Calculate the surface offset */
   src_reg offset(this, glsl_type::uint_type);
   ir_dereference_array *deref_array = deref->as_dereference_array();
   if (deref_array) {
      /* Array of counters: offset = index * counter size + base offset. */
      deref_array->array_index->accept(this);

      src_reg tmp(this, glsl_type::uint_type);
      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
   } else {
      offset = location->data.atomic.offset;
   }

   /* Emit the appropriate machine instruction */
   const char *callee = ir->callee->function_name();
   dst_reg dst = get_assignment_lhs(ir->return_deref, this);

   if (!strcmp("__intrinsic_atomic_read", callee)) {
      emit_untyped_surface_read(surf_index, dst, offset);

   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
                          src_reg(), src_reg());

   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                          src_reg(), src_reg());
   }
}
2197
2198 void
2199 vec4_visitor::visit(ir_call *ir)
2200 {
2201 const char *callee = ir->callee->function_name();
2202
2203 if (!strcmp("__intrinsic_atomic_read", callee) ||
2204 !strcmp("__intrinsic_atomic_increment", callee) ||
2205 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2206 visit_atomic_counter_intrinsic(ir);
2207 } else {
2208 assert(!"Unsupported intrinsic.");
2209 }
2210 }
2211
2212 src_reg
2213 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2214 {
2215 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2216 inst->base_mrf = 2;
2217 inst->mlen = 1;
2218 inst->sampler = sampler;
2219 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2220 inst->dst.writemask = WRITEMASK_XYZW;
2221
2222 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2223 int param_base = inst->base_mrf;
2224 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2225 int zero_mask = 0xf & ~coord_mask;
2226
2227 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2228 coordinate));
2229
2230 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2231 src_reg(0)));
2232
2233 emit(inst);
2234 return src_reg(inst->dst);
2235 }
2236
/* Translate an ir_texture expression into a sampler SEND message: evaluate
 * all operand subtrees first, then lay out the message payload in MRFs
 * according to the operation and hardware generation, and finally apply any
 * post-sample fixups (cube-array txs, Gen6 gather w/a, texture swizzle).
 */
void
vec4_visitor::visit(ir_texture *ir)
{
   int sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
    * emitting anything other than setting up the constant result.
    */
   if (ir->op == ir_tg4) {
      ir_constant *chan = ir->lod_info.component->as_constant();
      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
         dst_reg result(this, ir->type);
         this->result = src_reg(result);
         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
         return;
      }
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Should be lowered */
   assert(!ir->offset || !ir->offset->type->is_array());

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   /* A constant offset is baked into the message header below; only a
    * non-constant offset needs a live register (tg4 w/ offsets).
    */
   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   src_reg offset_value;
   if (has_nonconstant_offset) {
      ir->offset->accept(this);
      offset_value = src_reg(this->result);
   }

   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index, mcs;
   switch (ir->op) {
   case ir_tex:
      /* VS has no implicit derivatives, so plain tex samples LOD 0. */
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_query_levels:
      lod = src_reg(0);
      lod_type = glsl_type::int_type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;

      /* Fetch MCS data only when the surface actually uses CMS layout;
       * otherwise a zero placeholder keeps the payload layout uniform.
       */
      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
         mcs = emit_mcs_fetch(ir, coordinate, sampler);
      else
         mcs = src_reg(0u);
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   case ir_txb:
   case ir_lod:
   case ir_tg4:
      break;
   }

   /* Select the hardware opcode for the operation. */
   vec4_instruction *inst = NULL;
   switch (ir->op) {
   case ir_tex:
   case ir_txl:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
      break;
   case ir_txd:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
      break;
   case ir_txf:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
      break;
   case ir_txf_ms:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
      break;
   case ir_txs:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_tg4:
      if (has_nonconstant_offset)
         inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
      else
         inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
      break;
   case ir_query_levels:
      /* Level count is returned in the .w channel of a TXS result. */
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_txb:
      assert(!"TXB is not valid for vertex shaders.");
      break;
   case ir_lod:
      assert(!"LOD is not valid for vertex shaders.");
      break;
   default:
      assert(!"Unrecognized tex op");
   }

   /* txf takes its offset in the coordinate itself, not the header. */
   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());

   /* Stuff the channel select bits in the top of the texture offset */
   if (ir->op == ir_tg4)
      inst->texture_offset |= gather_channel(ir, sampler) << 16;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    */
   inst->header_present =
      brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
      sampler >= 16;
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->sampler = sampler;
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs || ir->op == ir_query_levels) {
      /* txs only sends the LOD; its position in the message differs by gen. */
      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
               coordinate));

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor */
      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
			  WRITEMASK_X),
		  shadow_comparitor));
	 inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
	 int mrf, writemask;
	 if (brw->gen >= 5) {
	    mrf = param_base + 1;
	    if (ir->shadow_comparitor) {
	       writemask = WRITEMASK_Y;
	       /* mlen already incremented */
	    } else {
	       writemask = WRITEMASK_X;
	       inst->mlen++;
	    }
	 } else /* brw->gen == 4 */ {
	    mrf = param_base;
	    writemask = WRITEMASK_W;
	 }
	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         /* NOTE: the braceless if below guards ONLY the swizzle assignment;
          * the MOV and mlen++ run unconditionally.  On gen < 7 `mcs` is the
          * immediate 0u set above, so writing it into .y is harmless.
          */
         if (brw->gen >= 7)
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                  mcs));
         inst->mlen++;
      } else if (ir->op == ir_txd) {
	 const glsl_type *type = lod_type;

	 if (brw->gen >= 5) {
            /* Gen5+ packs the gradients interleaved: (dx.x, dy.x, dx.y, dy.y)
             * in one MRF, with the .z pair in the next MRF when needed.
             */
	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
	    inst->mlen++;

	    if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
	       inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
	    }
	 } else /* brw->gen == 4 */ {
	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
	    inst->mlen += 2;
	 }
      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
         if (ir->shadow_comparitor) {
            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   with_writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   /* Gen6 gather of integer formats returns mangled results; fix up in the
    * shader (see emit_gen6_gather_wa).
    */
   if (brw->gen == 6 && ir->op == ir_tg4) {
      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}
2508
2509 /**
2510 * Apply workarounds for Gen6 gather with UINT/SINT
2511 */
2512 void
2513 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2514 {
2515 if (!wa)
2516 return;
2517
2518 int width = (wa & WA_8BIT) ? 8 : 16;
2519 dst_reg dst_f = dst;
2520 dst_f.type = BRW_REGISTER_TYPE_F;
2521
2522 /* Convert from UNORM to UINT */
2523 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2524 emit(MOV(dst, src_reg(dst_f)));
2525
2526 if (wa & WA_SIGN) {
2527 /* Reinterpret the UINT value as a signed INT value by
2528 * shifting the sign bit into place, then shifting back
2529 * preserving sign.
2530 */
2531 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2532 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2533 }
2534 }
2535
2536 /**
2537 * Set up the gather channel based on the swizzle, for gather4.
2538 */
2539 uint32_t
2540 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2541 {
2542 ir_constant *chan = ir->lod_info.component->as_constant();
2543 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2544 switch (swiz) {
2545 case SWIZZLE_X: return 0;
2546 case SWIZZLE_Y:
2547 /* gather4 sampler is broken for green channel on RG32F --
2548 * we must ask for blue instead.
2549 */
2550 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2551 return 2;
2552 return 1;
2553 case SWIZZLE_Z: return 2;
2554 case SWIZZLE_W: return 3;
2555 default:
2556 assert(!"Not reached"); /* zero, one swizzles handled already */
2557 return 0;
2558 }
2559 }
2560
2561 void
2562 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2563 {
2564 int s = key->tex.swizzles[sampler];
2565
2566 this->result = src_reg(this, ir->type);
2567 dst_reg swizzled_result(this->result);
2568
2569 if (ir->op == ir_query_levels) {
2570 /* # levels is in .w */
2571 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2572 emit(MOV(swizzled_result, orig_val));
2573 return;
2574 }
2575
2576 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2577 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2578 emit(MOV(swizzled_result, orig_val));
2579 return;
2580 }
2581
2582
2583 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2584 int swizzle[4] = {0};
2585
2586 for (int i = 0; i < 4; i++) {
2587 switch (GET_SWZ(s, i)) {
2588 case SWIZZLE_ZERO:
2589 zero_mask |= (1 << i);
2590 break;
2591 case SWIZZLE_ONE:
2592 one_mask |= (1 << i);
2593 break;
2594 default:
2595 copy_mask |= (1 << i);
2596 swizzle[i] = GET_SWZ(s, i);
2597 break;
2598 }
2599 }
2600
2601 if (copy_mask) {
2602 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2603 swizzled_result.writemask = copy_mask;
2604 emit(MOV(swizzled_result, orig_val));
2605 }
2606
2607 if (zero_mask) {
2608 swizzled_result.writemask = zero_mask;
2609 emit(MOV(swizzled_result, src_reg(0.0f)));
2610 }
2611
2612 if (one_mask) {
2613 swizzled_result.writemask = one_mask;
2614 emit(MOV(swizzled_result, src_reg(1.0f)));
2615 }
2616 }
2617
void
vec4_visitor::visit(ir_return *ir)
{
   /* Returns are expected to be lowered out of the IR before this visitor
    * runs, so hitting one here is a lowering bug.
    */
   assert(!"not reached");
}
2623
void
vec4_visitor::visit(ir_discard *ir)
{
   /* discard is a fragment-shader-only construct; it can never appear in
    * the IR handled by this visitor.
    */
   assert(!"not reached");
}
2629
/* Emit structured control flow (IF/ELSE/ENDIF) for an ir_if node, visiting
 * the then/else instruction lists in between.
 */
void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (brw->gen == 6) {
      /* Gen6 has a dedicated IF-with-embedded-compare form. */
      emit_if_gen6(ir);
   } else {
      /* Elsewhere, evaluate the condition into a flag and predicate the IF. */
      uint32_t predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}
2658
void
vec4_visitor::visit(ir_emit_vertex *)
{
   /* EmitVertex() is not supported by this visitor; presumably handled by a
    * geometry-shader subclass override -- confirm against the class hierarchy.
    */
   assert(!"not reached");
}
2664
void
vec4_visitor::visit(ir_end_primitive *)
{
   /* EndPrimitive() is not supported by this visitor; presumably handled by a
    * geometry-shader subclass override -- confirm against the class hierarchy.
    */
   assert(!"not reached");
}
2670
/* Emit an untyped atomic operation on the surface at @surf_index.  The
 * payload is built incrementally in MRFs starting at 0: the offset first,
 * then up to two operands (skipped when their file is BAD_FILE).
 */
void
vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                  dst_reg dst, src_reg offset,
                                  src_reg src0, src_reg src1)
{
   unsigned mlen = 0;

   /* Set the atomic operation offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
   mlen++;

   /* Set the atomic operation arguments. */
   if (src0.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
      mlen++;
   }

   if (src1.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
      mlen++;
   }

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped atomic message on Ivy Bridge, but that's OK because
    * unused channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
                                 src_reg(atomic_op), src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = mlen;
}
2702
2703 void
2704 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2705 src_reg offset)
2706 {
2707 /* Set the surface read offset. */
2708 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2709
2710 /* Emit the instruction. Note that this maps to the normal SIMD8
2711 * untyped surface read message, but that's OK because unused
2712 * channels will be masked out.
2713 */
2714 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2715 dst, src_reg(surf_index));
2716 inst->base_mrf = 0;
2717 inst->mlen = 1;
2718 }
2719
/* Compute the normalized device coordinates from the clip-space position
 * and store them in output_reg[BRW_VARYING_SLOT_NDC]:
 * ndc = (x/w, y/w, z/w, 1/w).
 */
void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC] = ndc;

   current_annotation = "NDC";
   /* ndc.w = 1 / pos.w */
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   /* ndc.xyz = pos.xyz * (1 / pos.w), reusing the reciprocal just written
    * to ndc.w.
    */
   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
2742
/* Fill in the first VUE header register: point size, clip flags, and (on
 * Gen6+) layer/viewport indices.  The layout differs sharply between
 * pre-Gen6 (packed flags word) and Gen6+ (dedicated channels).
 */
void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (brw->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        key->userclip_active || brw->has_negative_rhw_bug)) {
      /* Pre-Gen6: build the packed header dword in a temporary, then copy
       * it out once at the end.
       */
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, 0u));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);

	 current_annotation = "Point size";
         /* Convert the float point size to the header's fixed-point field:
          * scale by 2^11 and mask to bits [18:8].
          */
	 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
	 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      if (key->userclip_active) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         /* Set a flag bit for each clip distance that came out negative;
          * distances 4..7 are shifted up past the first four.
          */
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (brw->gen < 6) {
      /* Pre-Gen6 with nothing to report: zero the header dword. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      /* Gen6+: each field lives in its own channel of the header reg. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VARYING_SLOT_PSIZ])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_LAYER])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
      }
   }
}
2817
/* Compute up to four user clip distances into the channels of @reg, using
 * DP4 of the clip vertex against each enabled user plane.  @offset selects
 * which group of four planes (0 or 4) this call covers.
 */
void
vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
{
   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
      clip_vertex = VARYING_SLOT_POS;
   }

   /* One DP4 per enabled plane; each result lands in its own channel. */
   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
        ++i) {
      reg.writemask = 1 << i;
      emit(DP4(reg,
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}
2846
2847 void
2848 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2849 {
2850 assert (varying < VARYING_SLOT_MAX);
2851 reg.type = output_reg[varying].type;
2852 current_annotation = output_reg_annotation[varying];
2853 /* Copy the register, saturating if necessary */
2854 vec4_instruction *inst = emit(MOV(reg,
2855 src_reg(output_reg[varying])));
2856 if ((varying == VARYING_SLOT_COL0 ||
2857 varying == VARYING_SLOT_COL1 ||
2858 varying == VARYING_SLOT_BFC0 ||
2859 varying == VARYING_SLOT_BFC1) &&
2860 key->clamp_vertex_color) {
2861 inst->saturate = true;
2862 }
2863 }
2864
/* Write one VUE slot into MRF @mrf.  A few slots have special encodings
 * (PSIZ header, NDC, position, edge flag, padding); everything else goes
 * through emit_generic_urb_slot().
 */
void
vec4_visitor::emit_urb_slot(int mrf, int varying)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, varying);
      break;
   }
}
2905
2906 static int
2907 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2908 {
2909 if (brw->gen >= 6) {
2910 /* URB data written (does not include the message header reg) must
2911 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2912 * section 5.4.3.2.2: URB_INTERLEAVED.
2913 *
2914 * URB entries are allocated on a multiple of 1024 bits, so an
2915 * extra 128 bits written here to make the end align to 256 is
2916 * no problem.
2917 */
2918 if ((mlen % 2) != 1)
2919 mlen++;
2920 }
2921
2922 return mlen;
2923 }
2924
2925
2926 /**
2927 * Generates the VUE payload plus the necessary URB write instructions to
2928 * output it.
2929 *
2930 * The VUE layout is documented in Volume 2a.
2931 */
2932 void
2933 vec4_visitor::emit_vertex()
2934 {
2935 /* MRF 0 is reserved for the debugger, so start with message header
2936 * in MRF 1.
2937 */
2938 int base_mrf = 1;
2939 int mrf = base_mrf;
2940 /* In the process of generating our URB write message contents, we
2941 * may need to unspill a register or load from an array. Those
2942 * reads would use MRFs 14-15.
2943 */
2944 int max_usable_mrf = 13;
2945
2946 /* The following assertion verifies that max_usable_mrf causes an
2947 * even-numbered amount of URB write data, which will meet gen6's
2948 * requirements for length alignment.
2949 */
2950 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2951
2952 /* First mrf is the g0-based message header containing URB handles and
2953 * such.
2954 */
2955 emit_urb_write_header(mrf++);
2956
2957 if (brw->gen < 6) {
2958 emit_ndc_computation();
2959 }
2960
2961 /* Lower legacy ff and ClipVertex clipping to clip distances */
2962 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2963 current_annotation = "user clip distances";
2964
2965 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2966 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2967
2968 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2969 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2970 }
2971
2972 /* We may need to split this up into several URB writes, so do them in a
2973 * loop.
2974 */
2975 int slot = 0;
2976 bool complete = false;
2977 do {
2978 /* URB offset is in URB row increments, and each of our MRFs is half of
2979 * one of those, since we're doing interleaved writes.
2980 */
2981 int offset = slot / 2;
2982
2983 mrf = base_mrf + 1;
2984 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2985 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2986
2987 /* If this was max_usable_mrf, we can't fit anything more into this
2988 * URB WRITE.
2989 */
2990 if (mrf > max_usable_mrf) {
2991 slot++;
2992 break;
2993 }
2994 }
2995
2996 complete = slot >= prog_data->vue_map.num_slots;
2997 current_annotation = "URB write";
2998 vec4_instruction *inst = emit_urb_write_opcode(complete);
2999 inst->base_mrf = base_mrf;
3000 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3001 inst->offset += offset;
3002 } while(!complete);
3003 }
3004
3005
3006 src_reg
3007 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3008 src_reg *reladdr, int reg_offset)
3009 {
3010 /* Because we store the values to scratch interleaved like our
3011 * vertex data, we need to scale the vec4 index by 2.
3012 */
3013 int message_header_scale = 2;
3014
3015 /* Pre-gen6, the message header uses byte offsets instead of vec4
3016 * (16-byte) offset units.
3017 */
3018 if (brw->gen < 6)
3019 message_header_scale *= 16;
3020
3021 if (reladdr) {
3022 src_reg index = src_reg(this, glsl_type::int_type);
3023
3024 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3025 emit_before(inst, MUL(dst_reg(index),
3026 index, src_reg(message_header_scale)));
3027
3028 return index;
3029 } else {
3030 return src_reg(reg_offset * message_header_scale);
3031 }
3032 }
3033
/* Return the offset operand for a pull-constant load of @reg_offset
 * (optionally indexed by *@reladdr), in the units the message expects for
 * this hardware generation.
 */
src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (brw->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else if (brw->gen >= 8) {
      /* Store the offset in a GRF so we can send-from-GRF. */
      src_reg offset = src_reg(this, glsl_type::int_type);
      emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
      return offset;
   } else {
      /* Constant offset: same byte-vs-vec4 unit scaling as above. */
      int message_header_scale = brw->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}
3061
3062 /**
3063 * Emits an instruction before @inst to load the value named by @orig_src
3064 * from scratch space at @base_offset to @temp.
3065 *
3066 * @base_offset is measured in 32-byte units (the size of a register).
3067 */
3068 void
3069 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3070 dst_reg temp, src_reg orig_src,
3071 int base_offset)
3072 {
3073 int reg_offset = base_offset + orig_src.reg_offset;
3074 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3075
3076 emit_before(inst, SCRATCH_READ(temp, index));
3077 }
3078
3079 /**
3080 * Emits an instruction after @inst to store the value to be written
3081 * to @orig_dst to scratch space at @base_offset, from @temp.
3082 *
3083 * @base_offset is measured in 32-byte units (the size of a register).
3084 */
3085 void
3086 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3087 {
3088 int reg_offset = base_offset + inst->dst.reg_offset;
3089 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3090
3091 /* Create a temporary register to store *inst's result in.
3092 *
3093 * We have to be careful in MOVing from our temporary result register in
3094 * the scratch write. If we swizzle from channels of the temporary that
3095 * weren't initialized, it will confuse live interval analysis, which will
3096 * make spilling fail to make progress.
3097 */
3098 src_reg temp = src_reg(this, glsl_type::vec4_type);
3099 temp.type = inst->dst.type;
3100 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3101 int swizzles[4];
3102 for (int i = 0; i < 4; i++)
3103 if (inst->dst.writemask & (1 << i))
3104 swizzles[i] = i;
3105 else
3106 swizzles[i] = first_writemask_chan;
3107 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3108 swizzles[2], swizzles[3]);
3109
3110 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3111 inst->dst.writemask));
3112 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3113 write->predicate = inst->predicate;
3114 write->ir = inst->ir;
3115 write->annotation = inst->annotation;
3116 inst->insert_after(write);
3117
3118 inst->dst.file = temp.file;
3119 inst->dst.reg = temp.reg;
3120 inst->dst.reg_offset = temp.reg_offset;
3121 inst->dst.reladdr = NULL;
3122 }
3123
3124 /**
3125 * We can't generally support array access in GRF space, because a
3126 * single instruction's destination can only span 2 contiguous
3127 * registers. So, we send all GRF arrays that get variable index
3128 * access to scratch space.
3129 */
3130 void
3131 vec4_visitor::move_grf_array_access_to_scratch()
3132 {
3133 int scratch_loc[this->virtual_grf_count];
3134
3135 for (int i = 0; i < this->virtual_grf_count; i++) {
3136 scratch_loc[i] = -1;
3137 }
3138
3139 /* First, calculate the set of virtual GRFs that need to be punted
3140 * to scratch due to having any array access on them, and where in
3141 * scratch.
3142 */
3143 foreach_list(node, &this->instructions) {
3144 vec4_instruction *inst = (vec4_instruction *)node;
3145
3146 if (inst->dst.file == GRF && inst->dst.reladdr &&
3147 scratch_loc[inst->dst.reg] == -1) {
3148 scratch_loc[inst->dst.reg] = c->last_scratch;
3149 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3150 }
3151
3152 for (int i = 0 ; i < 3; i++) {
3153 src_reg *src = &inst->src[i];
3154
3155 if (src->file == GRF && src->reladdr &&
3156 scratch_loc[src->reg] == -1) {
3157 scratch_loc[src->reg] = c->last_scratch;
3158 c->last_scratch += this->virtual_grf_sizes[src->reg];
3159 }
3160 }
3161 }
3162
3163 /* Now, for anything that will be accessed through scratch, rewrite
3164 * it to load/store. Note that this is a _safe list walk, because
3165 * we may generate a new scratch_write instruction after the one
3166 * we're processing.
3167 */
3168 foreach_list_safe(node, &this->instructions) {
3169 vec4_instruction *inst = (vec4_instruction *)node;
3170
3171 /* Set up the annotation tracking for new generated instructions. */
3172 base_ir = inst->ir;
3173 current_annotation = inst->annotation;
3174
3175 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3176 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3177 }
3178
3179 for (int i = 0 ; i < 3; i++) {
3180 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3181 continue;
3182
3183 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3184
3185 emit_scratch_read(inst, temp, inst->src[i],
3186 scratch_loc[inst->src[i].reg]);
3187
3188 inst->src[i].file = temp.file;
3189 inst->src[i].reg = temp.reg;
3190 inst->src[i].reg_offset = temp.reg_offset;
3191 inst->src[i].reladdr = NULL;
3192 }
3193 }
3194 }
3195
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
				      dst_reg temp, src_reg orig_src,
				      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   /* Binding table index of the pull constant surface. */
   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
   /* Offset source; includes the reladdr computation when present. */
   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   if (brw->gen >= 7) {
      /* The Gen7 opcode takes its offset as a register operand, so
       * materialize it into a GRF first (typed to match the offset).
       */
      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
      grf_offset.type = offset.type;
      emit_before(inst, MOV(grf_offset, offset));

      load = new(mem_ctx) vec4_instruction(this,
                                           VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           temp, index, src_reg(grf_offset));
   } else {
      /* Pre-Gen7 sends the offset through an MRF message payload. */
      load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                           temp, index, offset);
      load->base_mrf = 14;
      load->mlen = 1;
   }
   emit_before(inst, load);
}
3226
/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Pull-constant slot (in vec4 units) for each uniform, or -1 if it
    * hasn't been copied to the pull buffer yet.
    */
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            /* Uniform params are stored 4 float pointers per vec4 slot. */
            const float **values = &prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;

            /* Copy the whole (possibly multi-vec4) uniform array into the
             * pull parameter list.
             */
            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++]
                  = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         /* Load the value before @inst, then rewrite the source to read
          * the loaded temporary with no reladdr.
          */
         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}
3300
3301 void
3302 vec4_visitor::resolve_ud_negate(src_reg *reg)
3303 {
3304 if (reg->type != BRW_REGISTER_TYPE_UD ||
3305 !reg->negate)
3306 return;
3307
3308 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3309 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3310 *reg = temp;
3311 }
3312
vec4_visitor::vec4_visitor(struct brw_context *brw,
                           struct brw_vec4_compile *c,
                           struct gl_program *prog,
                           const struct brw_vec4_prog_key *key,
                           struct brw_vec4_prog_data *prog_data,
                           struct gl_shader_program *shader_prog,
                           struct brw_shader *shader,
                           void *mem_ctx,
                           bool debug_flag,
                           bool no_spills,
                           shader_time_shader_type st_base,
                           shader_time_shader_type st_written,
                           shader_time_shader_type st_reset)
   : sanity_param_count(0),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     debug_flag(debug_flag),
     no_spills(no_spills),
     st_base(st_base),
     st_written(st_written),
     st_reset(st_reset)
{
   this->brw = brw;
   this->ctx = &brw->ctx;
   this->shader_prog = shader_prog;
   this->shader = shader;

   this->mem_ctx = mem_ctx;
   this->failed = false;

   /* Annotation state used to label instructions emitted while visiting
    * the IR (see base_ir/current_annotation updates in the passes above).
    */
   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   this->c = c;
   this->prog = prog;
   this->key = key;
   this->prog_data = prog_data;
   this->stage_prog_data = &prog_data->base;

   /* Pointer-keyed hash table used to track variables; freed in the
    * destructor (it is not ralloc'd off mem_ctx).
    */
   this->variable_ht = hash_table_ctor(0,
				       hash_table_pointer_hash,
				       hash_table_pointer_compare);

   /* Virtual GRF bookkeeping starts empty and is grown as registers are
    * allocated.
    */
   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   /* Gen7+ reserves the top of the GRF file for MRF use, so fewer GRFs
    * are available.  NOTE(review): inferred from the GEN7_MRF_HACK_START
    * name -- confirm against its definition.
    */
   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}
3371
vec4_visitor::~vec4_visitor()
{
   /* Free the variable hash table created in the constructor; everything
    * else is ralloc'd and owned by mem_ctx.
    */
   hash_table_dtor(this->variable_ht);
}
3376
3377
3378 void
3379 vec4_visitor::fail(const char *format, ...)
3380 {
3381 va_list va;
3382 char *msg;
3383
3384 if (failed)
3385 return;
3386
3387 failed = true;
3388
3389 va_start(va, format);
3390 msg = ralloc_vasprintf(mem_ctx, format, va);
3391 va_end(va);
3392 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3393
3394 this->fail_msg = msg;
3395
3396 if (debug_flag) {
3397 fprintf(stderr, "%s", msg);
3398 }
3399 }
3400
3401 } /* namespace brw */