i965: Move up duplicated fields from stage-specific prog_data to brw_stage_prog_data.
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
132 src0, src1, src2); \
133 }
134
135 ALU1(NOT)
136 ALU1(MOV)
137 ALU1(FRC)
138 ALU1(RNDD)
139 ALU1(RNDE)
140 ALU1(RNDZ)
141 ALU1(F32TO16)
142 ALU1(F16TO32)
143 ALU2(ADD)
144 ALU2(MUL)
145 ALU2(MACH)
146 ALU2(AND)
147 ALU2(OR)
148 ALU2(XOR)
149 ALU2(DP3)
150 ALU2(DP4)
151 ALU2(DPH)
152 ALU2(SHL)
153 ALU2(SHR)
154 ALU2(ASR)
155 ALU3(LRP)
156 ALU1(BFREV)
157 ALU3(BFE)
158 ALU2(BFI1)
159 ALU3(BFI2)
160 ALU1(FBH)
161 ALU1(FBL)
162 ALU1(CBIT)
163 ALU3(MAD)
164 ALU2(ADDC)
165 ALU2(SUBB)
166
167 /** Gen4 predicated IF. */
168 vec4_instruction *
169 vec4_visitor::IF(uint32_t predicate)
170 {
171 vec4_instruction *inst;
172
173 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
174 inst->predicate = predicate;
175
176 return inst;
177 }
178
179 /** Gen6 IF with embedded comparison. */
180 vec4_instruction *
181 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
182 {
183 assert(brw->gen == 6);
184
185 vec4_instruction *inst;
186
187 resolve_ud_negate(&src0);
188 resolve_ud_negate(&src1);
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
191 src0, src1);
192 inst->conditional_mod = condition;
193
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 vec4_instruction *
203 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
204 {
205 vec4_instruction *inst;
206
207 /* original gen4 does type conversion to the destination type
208 * before comparison, producing garbage results for floating
209 * point comparisons.
210 */
211 if (brw->gen == 4) {
212 dst.type = src0.type;
213 if (dst.file == HW_REG)
214 dst.fixed_hw_reg.type = dst.type;
215 }
216
217 resolve_ud_negate(&src0);
218 resolve_ud_negate(&src1);
219
220 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
221 inst->conditional_mod = condition;
222
223 return inst;
224 }
225
226 vec4_instruction *
227 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
228 {
229 vec4_instruction *inst;
230
231 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
232 dst, index);
233 inst->base_mrf = 14;
234 inst->mlen = 2;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
245 dst, src, index);
246 inst->base_mrf = 13;
247 inst->mlen = 3;
248
249 return inst;
250 }
251
252 void
253 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
254 {
255 static enum opcode dot_opcodes[] = {
256 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
257 };
258
259 emit(dot_opcodes[elements - 2], dst, src0, src1);
260 }
261
262 src_reg
263 vec4_visitor::fix_3src_operand(src_reg src)
264 {
265 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
266 * able to use vertical stride of zero to replicate the vec4 uniform, like
267 *
268 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
269 *
270 * But you can't, since vertical stride is always four in three-source
271 * instructions. Instead, insert a MOV instruction to do the replication so
272 * that the three-source instruction can consume it.
273 */
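 /* For illustration (register names invented for the example), a MAD with a
  * vec4 uniform operand effectively becomes roughly
  *
  *     mov tmp.xyzw, u0.xyzw
  *     mad dst, src0, tmp, src2
  *
  * so the three-source instruction only ever reads GRF sources, which do
  * have the vertical stride the hardware requires.
  */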
274
275 /* The MOV is only needed if the source is a uniform or immediate. */
276 if (src.file != UNIFORM && src.file != IMM)
277 return src;
278
279 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
280 expanded.type = src.type;
281 emit(MOV(expanded, src));
282 return src_reg(expanded);
283 }
284
285 src_reg
286 vec4_visitor::fix_math_operand(src_reg src)
287 {
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description.
291 *
292 * Rather than trying to enumerate all these cases, *always* expand the
293 * operand to a temp GRF for gen6.
294 *
295 * For gen7, keep the operand as-is, except if immediate, which gen7 still
296 * can't use.
297 */
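 /* For example, on gen6 something like "math rsq dst, -u0.wzyx" would
  * silently lose the negate and swizzle, so the operand is first copied
  * (register names invented for the example):
  *
  *     mov tmp, -u0.wzyx
  *     math rsq dst, tmp
  *
  * On gen7, per the code below, only immediates need this copy.
  */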
298
299 if (brw->gen == 7 && src.file != IMM)
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
307
308 void
309 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
310 {
311 src = fix_math_operand(src);
312
313 if (dst.writemask != WRITEMASK_XYZW) {
314 /* The gen6 math instruction must be align1, so we can't do
315 * writemasks.
316 */
317 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
318
319 emit(opcode, temp_dst, src);
320
321 emit(MOV(dst, src_reg(temp_dst)));
322 } else {
323 emit(opcode, dst, src);
324 }
325 }
326
327 void
328 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
329 {
330 vec4_instruction *inst = emit(opcode, dst, src);
331 inst->base_mrf = 1;
332 inst->mlen = 1;
333 }
334
335 void
336 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
337 {
338 switch (opcode) {
339 case SHADER_OPCODE_RCP:
340 case SHADER_OPCODE_RSQ:
341 case SHADER_OPCODE_SQRT:
342 case SHADER_OPCODE_EXP2:
343 case SHADER_OPCODE_LOG2:
344 case SHADER_OPCODE_SIN:
345 case SHADER_OPCODE_COS:
346 break;
347 default:
348 assert(!"not reached: bad math opcode");
349 return;
350 }
351
352 if (brw->gen >= 6) {
353 return emit_math1_gen6(opcode, dst, src);
354 } else {
355 return emit_math1_gen4(opcode, dst, src);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen6(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 src0 = fix_math_operand(src0);
364 src1 = fix_math_operand(src1);
365
366 if (dst.writemask != WRITEMASK_XYZW) {
367 /* The gen6 math instruction must be align1, so we can't do
368 * writemasks.
369 */
370 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
371 temp_dst.type = dst.type;
372
373 emit(opcode, temp_dst, src0, src1);
374
375 emit(MOV(dst, src_reg(temp_dst)));
376 } else {
377 emit(opcode, dst, src0, src1);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen4(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 vec4_instruction *inst = emit(opcode, dst, src0, src1);
386 inst->base_mrf = 1;
387 inst->mlen = 2;
388 }
389
390 void
391 vec4_visitor::emit_math(enum opcode opcode,
392 dst_reg dst, src_reg src0, src_reg src1)
393 {
394 switch (opcode) {
395 case SHADER_OPCODE_POW:
396 case SHADER_OPCODE_INT_QUOTIENT:
397 case SHADER_OPCODE_INT_REMAINDER:
398 break;
399 default:
400 assert(!"not reached: unsupported binary math opcode");
401 return;
402 }
403
404 if (brw->gen >= 6) {
405 return emit_math2_gen6(opcode, dst, src0, src1);
406 } else {
407 return emit_math2_gen4(opcode, dst, src0, src1);
408 }
409 }
410
411 void
412 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
413 {
414 if (brw->gen < 7)
415 assert(!"ir_unop_pack_half_2x16 should be lowered");
416
417 assert(dst.type == BRW_REGISTER_TYPE_UD);
418 assert(src0.type == BRW_REGISTER_TYPE_F);
419
420 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
421 *
422 * Because this instruction does not have a 16-bit floating-point type,
423 * the destination data type must be Word (W).
424 *
425 * The destination must be DWord-aligned and specify a horizontal stride
426 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
427 * each destination channel and the upper word is not modified.
428 *
429 * The above restriction implies that the f32to16 instruction must use
430 * align1 mode, because only in align1 mode is it possible to specify
431 * horizontal stride. We choose here to defy the hardware docs and emit
432 * align16 instructions.
433 *
434 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
435 * instructions. I was partially successful in that the code passed all
436 * tests. However, the code was dubiously correct and fragile, and the
437 * tests were not harsh enough to probe that frailty. Not trusting the
438 * code, I chose instead to remain in align16 mode in defiance of the hw
439 * docs).
440 *
441 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
442 * simulator, emitting a f32to16 in align16 mode with UD as destination
443 * data type is safe. The behavior differs from that specified in the PRM
444 * in that the upper word of each destination channel is cleared to 0.
445 */
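 /* Worked example of the sequence below (values chosen for illustration):
  * packing vec2(1.0, -2.0) gives half-float bit patterns 0x3c00 and 0xc000,
  * so after the F32TO16 tmp.xy = {0x00003c00, 0x0000c000}. Shifting the Y
  * channel left by 16 yields 0xc0000000, and ORing in the X channel gives
  * 0xc0003c00, which is packHalf2x16(vec2(1.0, -2.0)).
  */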
446
447 dst_reg tmp_dst(this, glsl_type::uvec2_type);
448 src_reg tmp_src(tmp_dst);
449
450 #if 0
451 /* Verify the undocumented behavior on which the following instructions
452 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
453 * then the result of the bit-or instruction below will be incorrect.
454 *
455 * You should inspect the disasm output in order to verify that the MOV is
456 * not optimized away.
457 */
458 emit(MOV(tmp_dst, src_reg(0x12345678u)));
459 #endif
460
461 /* Give tmp the form below, where "." means untouched.
462 *
463 * w z y x w z y x
464 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
465 *
466 * That the upper word of each write-channel be 0 is required for the
467 * following bit-shift and bit-or instructions to work. Note that this
468 * relies on the undocumented hardware behavior mentioned above.
469 */
470 tmp_dst.writemask = WRITEMASK_XY;
471 emit(F32TO16(tmp_dst, src0));
472
473 /* Give the write-channels of dst the form:
474 * 0xhhhh0000
475 */
476 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
477 emit(SHL(dst, tmp_src, src_reg(16u)));
478
479 /* Finally, give the write-channels of dst the form of packHalf2x16's
480 * output:
481 * 0xhhhhllll
482 */
483 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
484 emit(OR(dst, src_reg(dst), tmp_src));
485 }
486
487 void
488 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
489 {
490 if (brw->gen < 7)
491 assert(!"ir_unop_unpack_half_2x16 should be lowered");
492
493 assert(dst.type == BRW_REGISTER_TYPE_F);
494 assert(src0.type == BRW_REGISTER_TYPE_UD);
495
496 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
497 *
498 * Because this instruction does not have a 16-bit floating-point type,
499 * the source data type must be Word (W). The destination type must be
500 * F (Float).
501 *
502 * To use W as the source data type, we must adjust horizontal strides,
503 * which is only possible in align1 mode. All my [chadv] attempts at
504 * emitting align1 instructions for unpackHalf2x16 failed to pass the
505 * Piglit tests, so I gave up.
506 *
507 * I've verified that, on gen7 hardware and the simulator, it is safe to
508 * emit f16to32 in align16 mode with UD as source data type.
509 */
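 /* Worked example of the sequence below (value chosen for illustration):
  * for src0 = 0xc0003c00, the AND leaves 0x00003c00 in tmp.x, the SHR
  * leaves 0x0000c000 in tmp.y, and F16TO32 then produces dst.xy =
  * (1.0, -2.0), matching unpackHalf2x16(0xc0003c00).
  */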
510
511 dst_reg tmp_dst(this, glsl_type::uvec2_type);
512 src_reg tmp_src(tmp_dst);
513
514 tmp_dst.writemask = WRITEMASK_X;
515 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
516
517 tmp_dst.writemask = WRITEMASK_Y;
518 emit(SHR(tmp_dst, src0, src_reg(16u)));
519
520 dst.writemask = WRITEMASK_XY;
521 emit(F16TO32(dst, tmp_src));
522 }
523
524 void
525 vec4_visitor::visit_instructions(const exec_list *list)
526 {
527 foreach_list(node, list) {
528 ir_instruction *ir = (ir_instruction *)node;
529
530 base_ir = ir;
531 ir->accept(this);
532 }
533 }
534
535
536 static int
537 type_size(const struct glsl_type *type)
538 {
539 unsigned int i;
540 int size;
541
542 switch (type->base_type) {
543 case GLSL_TYPE_UINT:
544 case GLSL_TYPE_INT:
545 case GLSL_TYPE_FLOAT:
546 case GLSL_TYPE_BOOL:
547 if (type->is_matrix()) {
548 return type->matrix_columns;
549 } else {
550 /* Regardless of the size of the vector, it gets a vec4. This is bad
551 * packing for things like floats, but otherwise arrays become a
552 * mess. Hopefully a later pass over the code can pack scalars
553 * down if appropriate.
554 */
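 /* For instance, under this packing a float and a vec3 each take one vec4
  * slot, a mat3 takes 3 (one per column), float[4] takes 4, and
  * struct { vec3 a; float b; } takes 2.
  */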
555 return 1;
556 }
557 case GLSL_TYPE_ARRAY:
558 assert(type->length > 0);
559 return type_size(type->fields.array) * type->length;
560 case GLSL_TYPE_STRUCT:
561 size = 0;
562 for (i = 0; i < type->length; i++) {
563 size += type_size(type->fields.structure[i].type);
564 }
565 return size;
566 case GLSL_TYPE_SAMPLER:
567 /* Samplers take up one slot in UNIFORMS[], but they're baked in
568 * at link time.
569 */
570 return 1;
571 case GLSL_TYPE_ATOMIC_UINT:
572 return 0;
573 case GLSL_TYPE_IMAGE:
574 case GLSL_TYPE_VOID:
575 case GLSL_TYPE_ERROR:
576 case GLSL_TYPE_INTERFACE:
577 assert(0);
578 break;
579 }
580
581 return 0;
582 }
583
584 int
585 vec4_visitor::virtual_grf_alloc(int size)
586 {
587 if (virtual_grf_array_size <= virtual_grf_count) {
588 if (virtual_grf_array_size == 0)
589 virtual_grf_array_size = 16;
590 else
591 virtual_grf_array_size *= 2;
592 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
593 virtual_grf_array_size);
594 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
595 virtual_grf_array_size);
596 }
597 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
598 virtual_grf_reg_count += size;
599 virtual_grf_sizes[virtual_grf_count] = size;
600 return virtual_grf_count++;
601 }
602
603 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
604 {
605 init();
606
607 this->file = GRF;
608 this->reg = v->virtual_grf_alloc(type_size(type));
609
610 if (type->is_array() || type->is_record()) {
611 this->swizzle = BRW_SWIZZLE_NOOP;
612 } else {
613 this->swizzle = swizzle_for_size(type->vector_elements);
614 }
615
616 this->type = brw_type_for_base_type(type);
617 }
618
619 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->writemask = WRITEMASK_XYZW;
628 } else {
629 this->writemask = (1 << type->vector_elements) - 1;
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 /* Our support for uniforms is piggy-backed on the struct
636 * gl_program for this shader stage, because that's where the values actually
637 * get stored, rather than in some global gl_shader_program uniform
638 * store.
639 */
640 void
641 vec4_visitor::setup_uniform_values(ir_variable *ir)
642 {
643 int namelen = strlen(ir->name);
644
645 /* The data for our (non-builtin) uniforms is stored in a series of
646 * gl_uniform_driver_storage structs for each subcomponent that
647 * glGetUniformLocation() could name. We know it's been set up in the same
648 * order we'd walk the type, so walk the list of storage and find anything
649 * with our name, or the prefix of a component that starts with our name.
650 */
651 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
652 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
653
654 if (strncmp(ir->name, storage->name, namelen) != 0 ||
655 (storage->name[namelen] != 0 &&
656 storage->name[namelen] != '.' &&
657 storage->name[namelen] != '[')) {
658 continue;
659 }
660
661 gl_constant_value *components = storage->storage;
662 unsigned vector_count = (MAX2(storage->array_elements, 1) *
663 storage->type->matrix_columns);
664
665 for (unsigned s = 0; s < vector_count; s++) {
666 uniform_vector_size[uniforms] = storage->type->vector_elements;
667
668 int i;
669 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
670 stage_prog_data->param[uniforms * 4 + i] = &components->f;
671 components++;
672 }
673 for (; i < 4; i++) {
674 static float zero = 0;
675 stage_prog_data->param[uniforms * 4 + i] = &zero;
676 }
677
678 uniforms++;
679 }
680 }
681 }
682
683 void
684 vec4_visitor::setup_uniform_clipplane_values()
685 {
686 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
687
688 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
689 this->uniform_vector_size[this->uniforms] = 4;
690 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
691 this->userplane[i].type = BRW_REGISTER_TYPE_F;
692 for (int j = 0; j < 4; ++j) {
693 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
694 }
695 ++this->uniforms;
696 }
697 }
698
699 /* Our support for builtin uniforms is even scarier than non-builtin.
700 * It sits on top of the PROG_STATE_VAR parameters that are
701 * automatically updated from GL context state.
702 */
703 void
704 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
705 {
706 const ir_state_slot *const slots = ir->state_slots;
707 assert(ir->state_slots != NULL);
708
709 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
710 /* This state reference has already been setup by ir_to_mesa,
711 * but we'll get the same index back here. We can reference
712 * ParameterValues directly, since unlike brw_fs.cpp, we never
713 * add new state references during compile.
714 */
715 int index = _mesa_add_state_reference(this->prog->Parameters,
716 (gl_state_index *)slots[i].tokens);
717 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
718
719 this->uniform_vector_size[this->uniforms] = 0;
720 /* Add each of the unique swizzled channels of the element.
721 * This will end up matching the size of the glsl_type of this field.
722 */
723 int last_swiz = -1;
724 for (unsigned int j = 0; j < 4; j++) {
725 int swiz = GET_SWZ(slots[i].swizzle, j);
726 last_swiz = swiz;
727
728 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
729 if (swiz <= last_swiz)
730 this->uniform_vector_size[this->uniforms]++;
731 }
732 this->uniforms++;
733 }
734 }
735
736 dst_reg *
737 vec4_visitor::variable_storage(ir_variable *var)
738 {
739 return (dst_reg *)hash_table_find(this->variable_ht, var);
740 }
741
742 void
743 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
744 {
745 ir_expression *expr = ir->as_expression();
746
747 *predicate = BRW_PREDICATE_NORMAL;
748
749 if (expr) {
750 src_reg op[2];
751 vec4_instruction *inst;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757
758 resolve_ud_negate(&op[i]);
759 }
760
761 switch (expr->operation) {
762 case ir_unop_logic_not:
763 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
764 inst->conditional_mod = BRW_CONDITIONAL_Z;
765 break;
766
767 case ir_binop_logic_xor:
768 inst = emit(XOR(dst_null_d(), op[0], op[1]));
769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
770 break;
771
772 case ir_binop_logic_or:
773 inst = emit(OR(dst_null_d(), op[0], op[1]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 break;
776
777 case ir_binop_logic_and:
778 inst = emit(AND(dst_null_d(), op[0], op[1]));
779 inst->conditional_mod = BRW_CONDITIONAL_NZ;
780 break;
781
782 case ir_unop_f2b:
783 if (brw->gen >= 6) {
784 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
785 } else {
786 inst = emit(MOV(dst_null_f(), op[0]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 }
789 break;
790
791 case ir_unop_i2b:
792 if (brw->gen >= 6) {
793 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
794 } else {
795 inst = emit(MOV(dst_null_d(), op[0]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 }
798 break;
799
800 case ir_binop_all_equal:
801 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
802 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
803 break;
804
805 case ir_binop_any_nequal:
806 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
807 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
808 break;
809
810 case ir_unop_any:
811 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
812 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
813 break;
814
815 case ir_binop_greater:
816 case ir_binop_gequal:
817 case ir_binop_less:
818 case ir_binop_lequal:
819 case ir_binop_equal:
820 case ir_binop_nequal:
821 emit(CMP(dst_null_d(), op[0], op[1],
822 brw_conditional_for_comparison(expr->operation)));
823 break;
824
825 default:
826 assert(!"not reached");
827 break;
828 }
829 return;
830 }
831
832 ir->accept(this);
833
834 resolve_ud_negate(&this->result);
835
836 if (brw->gen >= 6) {
837 vec4_instruction *inst = emit(AND(dst_null_d(),
838 this->result, src_reg(1)));
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 } else {
841 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 }
845
846 /**
847 * Emit a gen6 IF statement with the comparison folded into the IF
848 * instruction.
849 */
850 void
851 vec4_visitor::emit_if_gen6(ir_if *ir)
852 {
853 ir_expression *expr = ir->condition->as_expression();
854
855 if (expr) {
856 src_reg op[2];
857 dst_reg temp;
858
859 assert(expr->get_num_operands() <= 2);
860 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
861 expr->operands[i]->accept(this);
862 op[i] = this->result;
863 }
864
865 switch (expr->operation) {
866 case ir_unop_logic_not:
867 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
868 return;
869
870 case ir_binop_logic_xor:
871 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
872 return;
873
874 case ir_binop_logic_or:
875 temp = dst_reg(this, glsl_type::bool_type);
876 emit(OR(temp, op[0], op[1]));
877 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
878 return;
879
880 case ir_binop_logic_and:
881 temp = dst_reg(this, glsl_type::bool_type);
882 emit(AND(temp, op[0], op[1]));
883 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_f2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_unop_i2b:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_greater:
895 case ir_binop_gequal:
896 case ir_binop_less:
897 case ir_binop_lequal:
898 case ir_binop_equal:
899 case ir_binop_nequal:
900 emit(IF(op[0], op[1],
901 brw_conditional_for_comparison(expr->operation)));
902 return;
903
904 case ir_binop_all_equal:
905 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
906 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
907 return;
908
909 case ir_binop_any_nequal:
910 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
911 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
912 return;
913
914 case ir_unop_any:
915 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
917 return;
918
919 default:
920 assert(!"not reached");
921 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
922 return;
923 }
924 return;
925 }
926
927 ir->condition->accept(this);
928
929 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
930 }
931
932 dst_reg
933 with_writemask(dst_reg const & r, int mask)
934 {
935 dst_reg result = r;
936 result.writemask = mask;
937 return result;
938 }
939
940
941 void
942 vec4_visitor::visit(ir_variable *ir)
943 {
944 dst_reg *reg = NULL;
945
946 if (variable_storage(ir))
947 return;
948
949 switch (ir->data.mode) {
950 case ir_var_shader_in:
951 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
952 break;
953
954 case ir_var_shader_out:
955 reg = new(mem_ctx) dst_reg(this, ir->type);
956
957 for (int i = 0; i < type_size(ir->type); i++) {
958 output_reg[ir->data.location + i] = *reg;
959 output_reg[ir->data.location + i].reg_offset = i;
960 output_reg[ir->data.location + i].type =
961 brw_type_for_base_type(ir->type->get_scalar_type());
962 output_reg_annotation[ir->data.location + i] = ir->name;
963 }
964 break;
965
966 case ir_var_auto:
967 case ir_var_temporary:
968 reg = new(mem_ctx) dst_reg(this, ir->type);
969 break;
970
971 case ir_var_uniform:
972 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
973
974 /* Thanks to the lower_ubo_reference pass, we will see only
975 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
976 * variables, so no need for them to be in variable_ht.
977 *
978 * Atomic counters take no uniform storage, no need to do
979 * anything here.
980 */
981 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
982 return;
983
984 /* Track how big the whole uniform variable is, in case we need to put a
985 * copy of its data into pull constants for array access.
986 */
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 static bool
1144 is_16bit_constant(ir_rvalue *rvalue)
1145 {
1146 ir_constant *constant = rvalue->as_constant();
1147 if (!constant)
1148 return false;
1149
1150 if (constant->type != glsl_type::int_type &&
1151 constant->type != glsl_type::uint_type)
1152 return false;
1153
1154 return constant->value.u[0] < (1 << 16);
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_expression *ir)
1159 {
1160 unsigned int operand;
1161 src_reg op[Elements(ir->operands)];
1162 src_reg result_src;
1163 dst_reg result_dst;
1164 vec4_instruction *inst;
1165
1166 if (try_emit_sat(ir))
1167 return;
1168
1169 if (ir->operation == ir_binop_add) {
1170 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1171 return;
1172 }
1173
1174 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1175 this->result.file = BAD_FILE;
1176 ir->operands[operand]->accept(this);
1177 if (this->result.file == BAD_FILE) {
1178 printf("Failed to get tree for expression operand:\n");
1179 ir->operands[operand]->print();
1180 exit(1);
1181 }
1182 op[operand] = this->result;
1183
1184 /* Matrix expression operands should have been broken down to vector
1185 * operations already.
1186 */
1187 assert(!ir->operands[operand]->type->is_matrix());
1188 }
1189
1190 int vector_elements = ir->operands[0]->type->vector_elements;
1191 if (ir->operands[1]) {
1192 vector_elements = MAX2(vector_elements,
1193 ir->operands[1]->type->vector_elements);
1194 }
1195
1196 this->result.file = BAD_FILE;
1197
1198 /* Storage for our result. Ideally for an assignment we'd be using
1199 * the actual storage for the result here, instead.
1200 */
1201 result_src = src_reg(this, ir->type);
1202 /* convenience for the emit functions below. */
1203 result_dst = dst_reg(result_src);
1204 /* If nothing special happens, this is the result. */
1205 this->result = result_src;
1206 /* Limit writes to the channels that will be used by result_src later.
1207 * This does limit this temp's use as a temporary for multi-instruction
1208 * sequences.
1209 */
1210 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1211
1212 switch (ir->operation) {
1213 case ir_unop_logic_not:
1214 /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
1215 * the one's complement of the whole register, not just bit 0.
1216 */
1217 emit(XOR(result_dst, op[0], src_reg(1)));
1218 break;
1219 case ir_unop_neg:
1220 op[0].negate = !op[0].negate;
1221 emit(MOV(result_dst, op[0]));
1222 break;
1223 case ir_unop_abs:
1224 op[0].abs = true;
1225 op[0].negate = false;
1226 emit(MOV(result_dst, op[0]));
1227 break;
1228
1229 case ir_unop_sign:
1230 if (ir->type->is_float()) {
1231 /* AND(val, 0x80000000) gives the sign bit.
1232 *
1233 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1234 * zero.
1235 */
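 /* Worked example (bit patterns shown for illustration): for sign(-3.5),
  * -3.5 is 0xc0600000, the AND with 0x80000000 leaves 0x80000000, and the
  * predicated OR with 0x3f800000 (1.0f) produces 0xbf800000 = -1.0f. For an
  * input of exactly 0.0 the CMP leaves the flag unset for that channel, so
  * the OR is skipped and the result stays +/-0.0.
  */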
1236 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1237
1238 op[0].type = BRW_REGISTER_TYPE_UD;
1239 result_dst.type = BRW_REGISTER_TYPE_UD;
1240 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1241
1242 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244
1245 this->result.type = BRW_REGISTER_TYPE_F;
1246 } else {
1247 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1248 * -> non-negative val generates 0x00000000.
1249 * Predicated OR sets 1 if val is positive.
1250 */
1251 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1252
1253 emit(ASR(result_dst, op[0], src_reg(31)));
1254
1255 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1256 inst->predicate = BRW_PREDICATE_NORMAL;
1257 }
1258 break;
1259
1260 case ir_unop_rcp:
1261 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1262 break;
1263
1264 case ir_unop_exp2:
1265 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1266 break;
1267 case ir_unop_log2:
1268 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1269 break;
1270 case ir_unop_exp:
1271 case ir_unop_log:
1272 assert(!"not reached: should be handled by ir_explog_to_explog2");
1273 break;
1274 case ir_unop_sin:
1275 case ir_unop_sin_reduced:
1276 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1277 break;
1278 case ir_unop_cos:
1279 case ir_unop_cos_reduced:
1280 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1281 break;
1282
1283 case ir_unop_dFdx:
1284 case ir_unop_dFdy:
1285 assert(!"derivatives not valid in vertex shader");
1286 break;
1287
1288 case ir_unop_bitfield_reverse:
1289 emit(BFREV(result_dst, op[0]));
1290 break;
1291 case ir_unop_bit_count:
1292 emit(CBIT(result_dst, op[0]));
1293 break;
1294 case ir_unop_find_msb: {
1295 src_reg temp = src_reg(this, glsl_type::uint_type);
1296
1297 inst = emit(FBH(dst_reg(temp), op[0]));
1298 inst->dst.writemask = WRITEMASK_XYZW;
1299
1300 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1301 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1302 * subtract the result from 31 to convert the MSB count into an LSB count.
1303 */
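 /* For example, for an input of 0x00000010 FBH returns 27 (the leading-zero
  * count), and 31 - 27 = 4 is the findMSB() result. For an input of 0 FBH
  * returns 0xffffffff, the CMP below fails, and the predicated ADD is
  * skipped, so the -1 error value is passed through unchanged.
  */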
1304
1305 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1306 temp.swizzle = BRW_SWIZZLE_NOOP;
1307 emit(MOV(result_dst, temp));
1308
1309 src_reg src_tmp = src_reg(result_dst);
1310 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1311
1312 src_tmp.negate = true;
1313 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1314 inst->predicate = BRW_PREDICATE_NORMAL;
1315 break;
1316 }
1317 case ir_unop_find_lsb:
1318 emit(FBL(result_dst, op[0]));
1319 break;
1320
1321 case ir_unop_noise:
1322 assert(!"not reached: should be handled by lower_noise");
1323 break;
1324
1325 case ir_binop_add:
1326 emit(ADD(result_dst, op[0], op[1]));
1327 break;
1328 case ir_binop_sub:
1329 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1330 break;
1331
1332 case ir_binop_mul:
1333 if (brw->gen < 8 && ir->type->is_integer()) {
1334 /* For integer multiplication, the MUL uses the low 16 bits of one of
1335 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1336 * accumulates in the contribution of the upper 16 bits of that
1337 * operand. If we can determine that one of the args is in the low
1338 * 16 bits, though, we can just emit a single MUL.
1339 */
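 /* For example, a multiply by the constant 7 can be emitted as a single
  * "mul dst, a, 7" since 7 fits in 16 bits; otherwise the code below emits
  * the mul/mach/mov-from-accumulator sequence to obtain a correct 32-bit
  * product (mnemonics shown only to illustrate the emitted sequence).
  */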
1340 if (is_16bit_constant(ir->operands[0])) {
1341 if (brw->gen < 7)
1342 emit(MUL(result_dst, op[0], op[1]));
1343 else
1344 emit(MUL(result_dst, op[1], op[0]));
1345 } else if (is_16bit_constant(ir->operands[1])) {
1346 if (brw->gen < 7)
1347 emit(MUL(result_dst, op[1], op[0]));
1348 else
1349 emit(MUL(result_dst, op[0], op[1]));
1350 } else {
1351 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1352
1353 emit(MUL(acc, op[0], op[1]));
1354 emit(MACH(dst_null_d(), op[0], op[1]));
1355 emit(MOV(result_dst, src_reg(acc)));
1356 }
1357 } else {
1358 emit(MUL(result_dst, op[0], op[1]));
1359 }
1360 break;
1361 case ir_binop_imul_high: {
1362 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1363
1364 emit(MUL(acc, op[0], op[1]));
1365 emit(MACH(result_dst, op[0], op[1]));
1366 break;
1367 }
1368 case ir_binop_div:
1369 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1370 assert(ir->type->is_integer());
1371 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1372 break;
1373 case ir_binop_carry: {
1374 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1375
1376 emit(ADDC(dst_null_ud(), op[0], op[1]));
1377 emit(MOV(result_dst, src_reg(acc)));
1378 break;
1379 }
1380 case ir_binop_borrow: {
1381 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1382
1383 emit(SUBB(dst_null_ud(), op[0], op[1]));
1384 emit(MOV(result_dst, src_reg(acc)));
1385 break;
1386 }
1387 case ir_binop_mod:
1388 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1389 assert(ir->type->is_integer());
1390 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1391 break;
1392
1393 case ir_binop_less:
1394 case ir_binop_greater:
1395 case ir_binop_lequal:
1396 case ir_binop_gequal:
1397 case ir_binop_equal:
1398 case ir_binop_nequal: {
1399 emit(CMP(result_dst, op[0], op[1],
1400 brw_conditional_for_comparison(ir->operation)));
1401 emit(AND(result_dst, result_src, src_reg(0x1)));
1402 break;
1403 }
1404
1405 case ir_binop_all_equal:
1406 /* "==" operator producing a scalar boolean. */
1407 if (ir->operands[0]->type->is_vector() ||
1408 ir->operands[1]->type->is_vector()) {
1409 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1410 emit(MOV(result_dst, src_reg(0)));
1411 inst = emit(MOV(result_dst, src_reg(1)));
1412 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1413 } else {
1414 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1415 emit(AND(result_dst, result_src, src_reg(0x1)));
1416 }
1417 break;
1418 case ir_binop_any_nequal:
1419 /* "!=" operator producing a scalar boolean. */
1420 if (ir->operands[0]->type->is_vector() ||
1421 ir->operands[1]->type->is_vector()) {
1422 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1423
1424 emit(MOV(result_dst, src_reg(0)));
1425 inst = emit(MOV(result_dst, src_reg(1)));
1426 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1427 } else {
1428 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1429 emit(AND(result_dst, result_src, src_reg(0x1)));
1430 }
1431 break;
1432
1433 case ir_unop_any:
1434 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1435 emit(MOV(result_dst, src_reg(0)));
1436
1437 inst = emit(MOV(result_dst, src_reg(1)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1439 break;
1440
1441 case ir_binop_logic_xor:
1442 emit(XOR(result_dst, op[0], op[1]));
1443 break;
1444
1445 case ir_binop_logic_or:
1446 emit(OR(result_dst, op[0], op[1]));
1447 break;
1448
1449 case ir_binop_logic_and:
1450 emit(AND(result_dst, op[0], op[1]));
1451 break;
1452
1453 case ir_binop_dot:
1454 assert(ir->operands[0]->type->is_vector());
1455 assert(ir->operands[0]->type == ir->operands[1]->type);
1456 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1457 break;
1458
1459 case ir_unop_sqrt:
1460 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1461 break;
1462 case ir_unop_rsq:
1463 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1464 break;
1465
1466 case ir_unop_bitcast_i2f:
1467 case ir_unop_bitcast_u2f:
1468 this->result = op[0];
1469 this->result.type = BRW_REGISTER_TYPE_F;
1470 break;
1471
1472 case ir_unop_bitcast_f2i:
1473 this->result = op[0];
1474 this->result.type = BRW_REGISTER_TYPE_D;
1475 break;
1476
1477 case ir_unop_bitcast_f2u:
1478 this->result = op[0];
1479 this->result.type = BRW_REGISTER_TYPE_UD;
1480 break;
1481
1482 case ir_unop_i2f:
1483 case ir_unop_i2u:
1484 case ir_unop_u2i:
1485 case ir_unop_u2f:
1486 case ir_unop_b2f:
1487 case ir_unop_b2i:
1488 case ir_unop_f2i:
1489 case ir_unop_f2u:
1490 emit(MOV(result_dst, op[0]));
1491 break;
1492 case ir_unop_f2b:
1493 case ir_unop_i2b: {
1494 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 break;
1497 }
1498
1499 case ir_unop_trunc:
1500 emit(RNDZ(result_dst, op[0]));
1501 break;
1502 case ir_unop_ceil:
1503 op[0].negate = !op[0].negate;
1504 inst = emit(RNDD(result_dst, op[0]));
1505 this->result.negate = true;
1506 break;
1507 case ir_unop_floor:
1508 inst = emit(RNDD(result_dst, op[0]));
1509 break;
1510 case ir_unop_fract:
1511 inst = emit(FRC(result_dst, op[0]));
1512 break;
1513 case ir_unop_round_even:
1514 emit(RNDE(result_dst, op[0]));
1515 break;
1516
1517 case ir_binop_min:
1518 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1519 break;
1520 case ir_binop_max:
1521 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1522 break;
1523
1524 case ir_binop_pow:
1525 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1526 break;
1527
1528 case ir_unop_bit_not:
1529 inst = emit(NOT(result_dst, op[0]));
1530 break;
1531 case ir_binop_bit_and:
1532 inst = emit(AND(result_dst, op[0], op[1]));
1533 break;
1534 case ir_binop_bit_xor:
1535 inst = emit(XOR(result_dst, op[0], op[1]));
1536 break;
1537 case ir_binop_bit_or:
1538 inst = emit(OR(result_dst, op[0], op[1]));
1539 break;
1540
1541 case ir_binop_lshift:
1542 inst = emit(SHL(result_dst, op[0], op[1]));
1543 break;
1544
1545 case ir_binop_rshift:
1546 if (ir->type->base_type == GLSL_TYPE_INT)
1547 inst = emit(ASR(result_dst, op[0], op[1]));
1548 else
1549 inst = emit(SHR(result_dst, op[0], op[1]));
1550 break;
1551
1552 case ir_binop_bfm:
1553 emit(BFI1(result_dst, op[0], op[1]));
1554 break;
1555
1556 case ir_binop_ubo_load: {
1557 ir_constant *uniform_block = ir->operands[0]->as_constant();
1558 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1559 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1560 src_reg offset;
1561
1562 /* Now, load the vector from that offset. */
1563 assert(ir->type->is_vector() || ir->type->is_scalar());
1564
1565 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1566 packed_consts.type = result.type;
1567 src_reg surf_index =
1568 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1569 if (const_offset_ir) {
1570 if (brw->gen >= 8) {
1571 /* Store the offset in a GRF so we can send-from-GRF. */
1572 offset = src_reg(this, glsl_type::int_type);
1573 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1574 } else {
1575 /* Immediates are fine on older generations since they'll be moved
1576 * to a (potentially fake) MRF at the generator level.
1577 */
1578 offset = src_reg(const_offset / 16);
1579 }
1580 } else {
1581 offset = src_reg(this, glsl_type::uint_type);
1582 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1583 }
1584
1585 vec4_instruction *pull =
1586 emit(new(mem_ctx) vec4_instruction(this,
1587 VS_OPCODE_PULL_CONSTANT_LOAD,
1588 dst_reg(packed_consts),
1589 surf_index,
1590 offset));
1591 pull->base_mrf = 14;
1592 pull->mlen = 1;
1593
1594 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1595 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1596 const_offset % 16 / 4,
1597 const_offset % 16 / 4,
1598 const_offset % 16 / 4);
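 /* For illustration: a scalar at byte offset 20 within the block reads
  * vec4 slot 20/16 = 1, and 20%16/4 = 1 selects the Y component, so the
  * base swizzle is shifted to read .yyyy from the pulled vec4.
  */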
1599
1600 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1601 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1602 emit(CMP(result_dst, packed_consts, src_reg(0u),
1603 BRW_CONDITIONAL_NZ));
1604 emit(AND(result_dst, result, src_reg(0x1)));
1605 } else {
1606 emit(MOV(result_dst, packed_consts));
1607 }
1608 break;
1609 }
1610
1611 case ir_binop_vector_extract:
1612 assert(!"should have been lowered by vec_index_to_cond_assign");
1613 break;
1614
1615 case ir_triop_fma:
1616 op[0] = fix_3src_operand(op[0]);
1617 op[1] = fix_3src_operand(op[1]);
1618 op[2] = fix_3src_operand(op[2]);
1619 /* Note that the instruction's argument order is reversed from GLSL
1620 * and the IR.
1621 */
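 /* For example, fma(a, b, c) = a * b + c maps to MAD(dst, c, b, a), since
  * the hardware MAD computes src1 * src2 + src0.
  */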
1622 emit(MAD(result_dst, op[2], op[1], op[0]));
1623 break;
1624
1625 case ir_triop_lrp:
1626 op[0] = fix_3src_operand(op[0]);
1627 op[1] = fix_3src_operand(op[1]);
1628 op[2] = fix_3src_operand(op[2]);
1629 /* Note that the instruction's argument order is reversed from GLSL
1630 * and the IR.
1631 */
1632 emit(LRP(result_dst, op[2], op[1], op[0]));
1633 break;
1634
1635 case ir_triop_csel:
1636 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1637 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1638 inst->predicate = BRW_PREDICATE_NORMAL;
1639 break;
1640
1641 case ir_triop_bfi:
1642 op[0] = fix_3src_operand(op[0]);
1643 op[1] = fix_3src_operand(op[1]);
1644 op[2] = fix_3src_operand(op[2]);
1645 emit(BFI2(result_dst, op[0], op[1], op[2]));
1646 break;
1647
1648 case ir_triop_bitfield_extract:
1649 op[0] = fix_3src_operand(op[0]);
1650 op[1] = fix_3src_operand(op[1]);
1651 op[2] = fix_3src_operand(op[2]);
1652 /* Note that the instruction's argument order is reversed from GLSL
1653 * and the IR.
1654 */
1655 emit(BFE(result_dst, op[2], op[1], op[0]));
1656 break;
1657
1658 case ir_triop_vector_insert:
1659 assert(!"should have been lowered by lower_vector_insert");
1660 break;
1661
1662 case ir_quadop_bitfield_insert:
1663 assert(!"not reached: should be handled by "
1664 "bitfield_insert_to_bfm_bfi\n");
1665 break;
1666
1667 case ir_quadop_vector:
1668 assert(!"not reached: should be handled by lower_quadop_vector");
1669 break;
1670
1671 case ir_unop_pack_half_2x16:
1672 emit_pack_half_2x16(result_dst, op[0]);
1673 break;
1674 case ir_unop_unpack_half_2x16:
1675 emit_unpack_half_2x16(result_dst, op[0]);
1676 break;
1677 case ir_unop_pack_snorm_2x16:
1678 case ir_unop_pack_snorm_4x8:
1679 case ir_unop_pack_unorm_2x16:
1680 case ir_unop_pack_unorm_4x8:
1681 case ir_unop_unpack_snorm_2x16:
1682 case ir_unop_unpack_snorm_4x8:
1683 case ir_unop_unpack_unorm_2x16:
1684 case ir_unop_unpack_unorm_4x8:
1685 assert(!"not reached: should be handled by lower_packing_builtins");
1686 break;
1687 case ir_unop_unpack_half_2x16_split_x:
1688 case ir_unop_unpack_half_2x16_split_y:
1689 case ir_binop_pack_half_2x16_split:
1690 assert(!"not reached: should not occur in vertex shader");
1691 break;
1692 case ir_binop_ldexp:
1693 assert(!"not reached: should be handled by ldexp_to_arith()");
1694 break;
1695 }
1696 }
1697
1698
1699 void
1700 vec4_visitor::visit(ir_swizzle *ir)
1701 {
1702 src_reg src;
1703 int i = 0;
1704 int swizzle[4];
1705
1706 /* Note that this is only swizzles in expressions, not those on the left
1707 * hand side of an assignment, which do write masking. See ir_assignment
1708 * for that.
1709 */
1710
1711 ir->val->accept(this);
1712 src = this->result;
1713 assert(src.file != BAD_FILE);
1714
1715 for (i = 0; i < ir->type->vector_elements; i++) {
1716 switch (i) {
1717 case 0:
1718 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1719 break;
1720 case 1:
1721 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1722 break;
1723 case 2:
1724 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1725 break;
1726 case 3:
1727 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1728 break;
1729 }
1730 }
1731 for (; i < 4; i++) {
1732 /* Replicate the last channel out. */
1733 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1734 }
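 /* For example, if the source is a plain vec4 (swizzle xyzw) and the
  * expression is v.zy, the first loop produces {z, y} and the replication
  * above extends it to .zyyy.
  */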
1735
1736 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1737
1738 this->result = src;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_variable *ir)
1743 {
1744 const struct glsl_type *type = ir->type;
1745 dst_reg *reg = variable_storage(ir->var);
1746
1747 if (!reg) {
1748 fail("Failed to find variable storage for %s\n", ir->var->name);
1749 this->result = src_reg(brw_null_reg());
1750 return;
1751 }
1752
1753 this->result = src_reg(*reg);
1754
1755 /* System values get their swizzle from the dst_reg writemask */
1756 if (ir->var->data.mode == ir_var_system_value)
1757 return;
1758
1759 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1760 this->result.swizzle = swizzle_for_size(type->vector_elements);
1761 }
1762
1763
1764 int
1765 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1766 {
1767 /* Under normal circumstances array elements are stored consecutively, so
1768 * the stride is equal to the size of the array element.
1769 */
1770 return type_size(ir->type);
1771 }
1772
1773
1774 void
1775 vec4_visitor::visit(ir_dereference_array *ir)
1776 {
1777 ir_constant *constant_index;
1778 src_reg src;
1779 int array_stride = compute_array_stride(ir);
1780
1781 constant_index = ir->array_index->constant_expression_value();
1782
1783 ir->array->accept(this);
1784 src = this->result;
1785
1786 if (constant_index) {
1787 src.reg_offset += constant_index->value.i[0] * array_stride;
1788 } else {
1789 /* Variable index array dereference. It eats the "vec4" of the
1790 * base of the array and an index that offsets the Mesa register
1791 * index.
1792 */
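 /* For example, for an array of vec4s (stride 1) the index register is the
  * computed index itself, while for an array of mat4s (stride 4) a MUL by 4
  * is emitted first; either way the result ends up in src.reladdr below.
  */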
1793 ir->array_index->accept(this);
1794
1795 src_reg index_reg;
1796
1797 if (array_stride == 1) {
1798 index_reg = this->result;
1799 } else {
1800 index_reg = src_reg(this, glsl_type::int_type);
1801
1802 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1803 }
1804
1805 if (src.reladdr) {
1806 src_reg temp = src_reg(this, glsl_type::int_type);
1807
1808 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1809
1810 index_reg = temp;
1811 }
1812
1813 src.reladdr = ralloc(mem_ctx, src_reg);
1814 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1815 }
1816
1817 /* If the type is smaller than a vec4, replicate the last channel out. */
1818 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1819 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1820 else
1821 src.swizzle = BRW_SWIZZLE_NOOP;
1822 src.type = brw_type_for_base_type(ir->type);
1823
1824 this->result = src;
1825 }
1826
1827 void
1828 vec4_visitor::visit(ir_dereference_record *ir)
1829 {
1830 unsigned int i;
1831 const glsl_type *struct_type = ir->record->type;
1832 int offset = 0;
1833
1834 ir->record->accept(this);
1835
1836 for (i = 0; i < struct_type->length; i++) {
1837 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1838 break;
1839 offset += type_size(struct_type->fields.structure[i].type);
1840 }
1841
1842 /* If the type is smaller than a vec4, replicate the last channel out. */
1843 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1844 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1845 else
1846 this->result.swizzle = BRW_SWIZZLE_NOOP;
1847 this->result.type = brw_type_for_base_type(ir->type);
1848
1849 this->result.reg_offset += offset;
1850 }
1851
1852 /**
1853 * We want to be careful in assignment setup to hit the actual storage
1854 * instead of potentially using a temporary like we might with the
1855 * ir_dereference handler.
1856 */
1857 static dst_reg
1858 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1859 {
1860 /* The LHS must be a dereference. If the LHS is a variable indexed array
1861 * access of a vector, it must be separated into a series of conditional moves
1862 * before reaching this point (see ir_vec_index_to_cond_assign).
1863 */
1864 assert(ir->as_dereference());
1865 ir_dereference_array *deref_array = ir->as_dereference_array();
1866 if (deref_array) {
1867 assert(!deref_array->array->type->is_vector());
1868 }
1869
1870 /* Use the rvalue deref handler for the most part. We'll ignore
1871 * swizzles in it and write swizzles using writemask, though.
1872 */
1873 ir->accept(v);
1874 return dst_reg(v->result);
1875 }
1876
1877 void
1878 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1879 const struct glsl_type *type, uint32_t predicate)
1880 {
1881 if (type->base_type == GLSL_TYPE_STRUCT) {
1882 for (unsigned int i = 0; i < type->length; i++) {
1883 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1884 }
1885 return;
1886 }
1887
1888 if (type->is_array()) {
1889 for (unsigned int i = 0; i < type->length; i++) {
1890 emit_block_move(dst, src, type->fields.array, predicate);
1891 }
1892 return;
1893 }
1894
1895 if (type->is_matrix()) {
1896 const struct glsl_type *vec_type;
1897
1898 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1899 type->vector_elements, 1);
1900
1901 for (int i = 0; i < type->matrix_columns; i++) {
1902 emit_block_move(dst, src, vec_type, predicate);
1903 }
1904 return;
1905 }
1906
1907 assert(type->is_scalar() || type->is_vector());
1908
1909 dst->type = brw_type_for_base_type(type);
1910 src->type = dst->type;
1911
1912 dst->writemask = (1 << type->vector_elements) - 1;
1913
1914 src->swizzle = swizzle_for_size(type->vector_elements);
1915
1916 vec4_instruction *inst = emit(MOV(*dst, *src));
1917 inst->predicate = predicate;
1918
1919 dst->reg_offset++;
1920 src->reg_offset++;
1921 }
1922
1923
1924 /* If the RHS processing resulted in an instruction generating a
1925 * temporary value, and it would be easy to rewrite the instruction to
1926 * generate its result right into the LHS instead, do so. This ends
1927 * up reliably removing instructions where it can be tricky to do so
1928 * later without real UD chain information.
1929 */
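/* For example, for "x = a + b;" the RHS visit emits "add tmp, a, b" into a
 * temporary; when the checks below pass, that ADD's destination is simply
 * rewritten to x's register and no extra MOV is emitted.
 */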
1930 bool
1931 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1932 dst_reg dst,
1933 src_reg src,
1934 vec4_instruction *pre_rhs_inst,
1935 vec4_instruction *last_rhs_inst)
1936 {
1937 /* This could be supported, but it would take more smarts. */
1938 if (ir->condition)
1939 return false;
1940
1941 if (pre_rhs_inst == last_rhs_inst)
1942 return false; /* No instructions generated to work with. */
1943
1944 /* Make sure the last instruction generated our source reg. */
1945 if (src.file != GRF ||
1946 src.file != last_rhs_inst->dst.file ||
1947 src.reg != last_rhs_inst->dst.reg ||
1948 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1949 src.reladdr ||
1950 src.abs ||
1951 src.negate ||
1952 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1953 return false;
1954
1955    /* Check that the last instruction fully initialized the channels
1956 * we want to use, in the order we want to use them. We could
1957 * potentially reswizzle the operands of many instructions so that
1958 * we could handle out of order channels, but don't yet.
1959 */
1960
1961 for (unsigned i = 0; i < 4; i++) {
1962 if (dst.writemask & (1 << i)) {
1963 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1964 return false;
1965
1966 if (BRW_GET_SWZ(src.swizzle, i) != i)
1967 return false;
1968 }
1969 }
1970
1971 /* Success! Rewrite the instruction. */
1972 last_rhs_inst->dst.file = dst.file;
1973 last_rhs_inst->dst.reg = dst.reg;
1974 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1975 last_rhs_inst->dst.reladdr = dst.reladdr;
1976 last_rhs_inst->dst.writemask &= dst.writemask;
1977
1978 return true;
1979 }
1980
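/**
 * Handle an assignment.  Aggregate (struct/array/matrix) LHS types are
 * copied with emit_block_move(); scalar/vector assignments compact the RHS
 * swizzle into the enabled writemask channels and, when possible, fold the
 * RHS-producing instruction's destination straight into the LHS via
 * try_rewrite_rhs_to_dst() instead of emitting an extra MOV.
 */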
1981 void
1982 vec4_visitor::visit(ir_assignment *ir)
1983 {
1984 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1985 uint32_t predicate = BRW_PREDICATE_NONE;
1986
1987 if (!ir->lhs->type->is_scalar() &&
1988 !ir->lhs->type->is_vector()) {
1989 ir->rhs->accept(this);
1990 src_reg src = this->result;
1991
1992 if (ir->condition) {
1993 emit_bool_to_cond_code(ir->condition, &predicate);
1994 }
1995
1996 /* emit_block_move doesn't account for swizzles in the source register.
1997 * This should be ok, since the source register is a structure or an
1998 * array, and those can't be swizzled. But double-check to be sure.
1999 */
2000 assert(src.swizzle ==
2001 (ir->rhs->type->is_matrix()
2002 ? swizzle_for_size(ir->rhs->type->vector_elements)
2003 : BRW_SWIZZLE_NOOP));
2004
2005 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2006 return;
2007 }
2008
2009 /* Now we're down to just a scalar/vector with writemasks. */
2010 int i;
2011
2012 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2013 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2014
2015 ir->rhs->accept(this);
2016
2017 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2018
2019 src_reg src = this->result;
2020
2021 int swizzles[4];
2022 int first_enabled_chan = 0;
2023 int src_chan = 0;
2024
2025 assert(ir->lhs->type->is_vector() ||
2026 ir->lhs->type->is_scalar());
2027 dst.writemask = ir->write_mask;
2028
2029 for (int i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2032 break;
2033 }
2034 }
2035
2036 /* Swizzle a small RHS vector into the channels being written.
2037 *
2038     * GLSL IR treats write_mask as dictating how many channels are
2039 * present on the RHS while in our instructions we need to make
2040 * those channels appear in the slots of the vec4 they're written to.
2041 */
2042 for (int i = 0; i < 4; i++) {
2043 if (dst.writemask & (1 << i))
2044 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2045 else
2046 swizzles[i] = first_enabled_chan;
2047 }
2048 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2049 swizzles[2], swizzles[3]);
2050
2051 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2052 return;
2053 }
2054
2055 if (ir->condition) {
2056 emit_bool_to_cond_code(ir->condition, &predicate);
2057 }
2058
2059 for (i = 0; i < type_size(ir->lhs->type); i++) {
2060 vec4_instruction *inst = emit(MOV(dst, src));
2061 inst->predicate = predicate;
2062
2063 dst.reg_offset++;
2064 src.reg_offset++;
2065 }
2066 }
2067
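/**
 * Emit MOVs of immediate values for the given constant, recursing through
 * aggregate types.  For vectors, channels that hold the same value are
 * coalesced into a single writemasked MOV, and dst->reg_offset is advanced
 * past each vec4 written.
 */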
2068 void
2069 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2070 {
2071 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2072 foreach_list(node, &ir->components) {
2073 ir_constant *field_value = (ir_constant *)node;
2074
2075 emit_constant_values(dst, field_value);
2076 }
2077 return;
2078 }
2079
2080 if (ir->type->is_array()) {
2081 for (unsigned int i = 0; i < ir->type->length; i++) {
2082 emit_constant_values(dst, ir->array_elements[i]);
2083 }
2084 return;
2085 }
2086
2087 if (ir->type->is_matrix()) {
2088 for (int i = 0; i < ir->type->matrix_columns; i++) {
2089 float *vec = &ir->value.f[i * ir->type->vector_elements];
2090
2091 for (int j = 0; j < ir->type->vector_elements; j++) {
2092 dst->writemask = 1 << j;
2093 dst->type = BRW_REGISTER_TYPE_F;
2094
2095 emit(MOV(*dst, src_reg(vec[j])));
2096 }
2097 dst->reg_offset++;
2098 }
2099 return;
2100 }
2101
2102 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2103
2104 for (int i = 0; i < ir->type->vector_elements; i++) {
2105 if (!(remaining_writemask & (1 << i)))
2106 continue;
2107
2108 dst->writemask = 1 << i;
2109 dst->type = brw_type_for_base_type(ir->type);
2110
2111 /* Find other components that match the one we're about to
2112 * write. Emits fewer instructions for things like vec4(0.5,
2113 * 1.5, 1.5, 1.5).
2114 */
2115 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2116 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2117 if (ir->value.b[i] == ir->value.b[j])
2118 dst->writemask |= (1 << j);
2119 } else {
2120 /* u, i, and f storage all line up, so no need for a
2121 * switch case for comparing each type.
2122 */
2123 if (ir->value.u[i] == ir->value.u[j])
2124 dst->writemask |= (1 << j);
2125 }
2126 }
2127
2128 switch (ir->type->base_type) {
2129 case GLSL_TYPE_FLOAT:
2130 emit(MOV(*dst, src_reg(ir->value.f[i])));
2131 break;
2132 case GLSL_TYPE_INT:
2133 emit(MOV(*dst, src_reg(ir->value.i[i])));
2134 break;
2135 case GLSL_TYPE_UINT:
2136 emit(MOV(*dst, src_reg(ir->value.u[i])));
2137 break;
2138 case GLSL_TYPE_BOOL:
2139 emit(MOV(*dst, src_reg(ir->value.b[i])));
2140 break;
2141 default:
2142 assert(!"Non-float/uint/int/bool constant");
2143 break;
2144 }
2145
2146 remaining_writemask &= ~dst->writemask;
2147 }
2148 dst->reg_offset++;
2149 }
2150
2151 void
2152 vec4_visitor::visit(ir_constant *ir)
2153 {
2154 dst_reg dst = dst_reg(this, ir->type);
2155 this->result = src_reg(dst);
2156
2157 emit_constant_values(&dst, ir);
2158 }
2159
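/**
 * Lower an atomic counter intrinsic call: compute the ABO surface index and
 * the byte offset of the counter within it, then emit either an untyped
 * surface read or the matching untyped atomic (INC/PREDEC) into the call's
 * return value.
 */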
2160 void
2161 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2162 {
2163 ir_dereference *deref = static_cast<ir_dereference *>(
2164 ir->actual_parameters.get_head());
2165 ir_variable *location = deref->variable_referenced();
2166 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2167 location->data.atomic.buffer_index);
2168
2169 /* Calculate the surface offset */
2170 src_reg offset(this, glsl_type::uint_type);
2171 ir_dereference_array *deref_array = deref->as_dereference_array();
2172 if (deref_array) {
2173 deref_array->array_index->accept(this);
2174
2175 src_reg tmp(this, glsl_type::uint_type);
2176 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2177 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2178 } else {
2179 offset = location->data.atomic.offset;
2180 }
2181
2182 /* Emit the appropriate machine instruction */
2183 const char *callee = ir->callee->function_name();
2184 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2185
2186 if (!strcmp("__intrinsic_atomic_read", callee)) {
2187 emit_untyped_surface_read(surf_index, dst, offset);
2188
2189 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2190 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2191 src_reg(), src_reg());
2192
2193 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2194 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2195 src_reg(), src_reg());
2196 }
2197 }
2198
2199 void
2200 vec4_visitor::visit(ir_call *ir)
2201 {
2202 const char *callee = ir->callee->function_name();
2203
2204 if (!strcmp("__intrinsic_atomic_read", callee) ||
2205 !strcmp("__intrinsic_atomic_increment", callee) ||
2206 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2207 visit_atomic_counter_intrinsic(ir);
2208 } else {
2209 assert(!"Unsupported intrinsic.");
2210 }
2211 }
2212
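/**
 * Emit a SHADER_OPCODE_TXF_MCS message to fetch the MCS (multisample
 * control surface) data for the given coordinate, returning a uvec4
 * temporary that the following compressed-multisample texel fetch uses.
 */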
2213 src_reg
2214 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2215 {
2216 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2217 inst->base_mrf = 2;
2218 inst->mlen = 1;
2219 inst->sampler = sampler;
2220 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2221 inst->dst.writemask = WRITEMASK_XYZW;
2222
2223    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2224 int param_base = inst->base_mrf;
2225 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2226 int zero_mask = 0xf & ~coord_mask;
2227
2228 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2229 coordinate));
2230
2231 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2232 src_reg(0)));
2233
2234 emit(inst);
2235 return src_reg(inst->dst);
2236 }
2237
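/**
 * Translate an ir_texture into the corresponding sampler message: evaluate
 * all operands up front (they may need SEND messages of their own), pick
 * the opcode for the texturing op, then pack the coordinate, LOD or
 * gradients, shadow comparitor and offsets into the message MRFs.
 */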
2238 void
2239 vec4_visitor::visit(ir_texture *ir)
2240 {
2241 int sampler =
2242 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2243
2244 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2245 * emitting anything other than setting up the constant result.
2246 */
2247 if (ir->op == ir_tg4) {
2248 ir_constant *chan = ir->lod_info.component->as_constant();
2249 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2250 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2251 dst_reg result(this, ir->type);
2252 this->result = src_reg(result);
2253 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2254 return;
2255 }
2256 }
2257
2258 /* Should be lowered by do_lower_texture_projection */
2259 assert(!ir->projector);
2260
2261 /* Should be lowered */
2262 assert(!ir->offset || !ir->offset->type->is_array());
2263
2264 /* Generate code to compute all the subexpression trees. This has to be
2265 * done before loading any values into MRFs for the sampler message since
2266 * generating these values may involve SEND messages that need the MRFs.
2267 */
2268 src_reg coordinate;
2269 if (ir->coordinate) {
2270 ir->coordinate->accept(this);
2271 coordinate = this->result;
2272 }
2273
2274 src_reg shadow_comparitor;
2275 if (ir->shadow_comparitor) {
2276 ir->shadow_comparitor->accept(this);
2277 shadow_comparitor = this->result;
2278 }
2279
2280 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2281 src_reg offset_value;
2282 if (has_nonconstant_offset) {
2283 ir->offset->accept(this);
2284 offset_value = src_reg(this->result);
2285 }
2286
2287 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2288 src_reg lod, dPdx, dPdy, sample_index, mcs;
2289 switch (ir->op) {
2290 case ir_tex:
2291 lod = src_reg(0.0f);
2292 lod_type = glsl_type::float_type;
2293 break;
2294 case ir_txf:
2295 case ir_txl:
2296 case ir_txs:
2297 ir->lod_info.lod->accept(this);
2298 lod = this->result;
2299 lod_type = ir->lod_info.lod->type;
2300 break;
2301 case ir_query_levels:
2302 lod = src_reg(0);
2303 lod_type = glsl_type::int_type;
2304 break;
2305 case ir_txf_ms:
2306 ir->lod_info.sample_index->accept(this);
2307 sample_index = this->result;
2308 sample_index_type = ir->lod_info.sample_index->type;
2309
2310 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2311 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2312 else
2313 mcs = src_reg(0u);
2314 break;
2315 case ir_txd:
2316 ir->lod_info.grad.dPdx->accept(this);
2317 dPdx = this->result;
2318
2319 ir->lod_info.grad.dPdy->accept(this);
2320 dPdy = this->result;
2321
2322 lod_type = ir->lod_info.grad.dPdx->type;
2323 break;
2324 case ir_txb:
2325 case ir_lod:
2326 case ir_tg4:
2327 break;
2328 }
2329
2330 vec4_instruction *inst = NULL;
2331 switch (ir->op) {
2332 case ir_tex:
2333 case ir_txl:
2334 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2335 break;
2336 case ir_txd:
2337 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2338 break;
2339 case ir_txf:
2340 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2341 break;
2342 case ir_txf_ms:
2343 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2344 break;
2345 case ir_txs:
2346 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2347 break;
2348 case ir_tg4:
2349 if (has_nonconstant_offset)
2350 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2351 else
2352 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2353 break;
2354 case ir_query_levels:
2355 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2356 break;
2357 case ir_txb:
2358 assert(!"TXB is not valid for vertex shaders.");
2359 break;
2360 case ir_lod:
2361 assert(!"LOD is not valid for vertex shaders.");
2362 break;
2363 default:
2364 assert(!"Unrecognized tex op");
2365 }
2366
2367 if (ir->offset != NULL && ir->op != ir_txf)
2368 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2369
2370 /* Stuff the channel select bits in the top of the texture offset */
2371 if (ir->op == ir_tg4)
2372 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2373
2374 /* The message header is necessary for:
2375 * - Gen4 (always)
2376 * - Texel offsets
2377 * - Gather channel selection
2378 * - Sampler indices too large to fit in a 4-bit value.
2379 */
2380 inst->header_present =
2381 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2382 sampler >= 16;
2383 inst->base_mrf = 2;
2384 inst->mlen = inst->header_present + 1; /* always at least one */
2385 inst->sampler = sampler;
2386 inst->dst = dst_reg(this, ir->type);
2387 inst->dst.writemask = WRITEMASK_XYZW;
2388 inst->shadow_compare = ir->shadow_comparitor != NULL;
2389
2390 /* MRF for the first parameter */
2391 int param_base = inst->base_mrf + inst->header_present;
2392
2393 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2394 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2395 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2396 } else {
2397 /* Load the coordinate */
2398 /* FINISHME: gl_clamp_mask and saturate */
2399 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2400 int zero_mask = 0xf & ~coord_mask;
2401
2402 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2403 coordinate));
2404
2405 if (zero_mask != 0) {
2406 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2407 src_reg(0)));
2408 }
2409 /* Load the shadow comparitor */
2410 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2411 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2412 WRITEMASK_X),
2413 shadow_comparitor));
2414 inst->mlen++;
2415 }
2416
2417 /* Load the LOD info */
2418 if (ir->op == ir_tex || ir->op == ir_txl) {
2419 int mrf, writemask;
2420 if (brw->gen >= 5) {
2421 mrf = param_base + 1;
2422 if (ir->shadow_comparitor) {
2423 writemask = WRITEMASK_Y;
2424 /* mlen already incremented */
2425 } else {
2426 writemask = WRITEMASK_X;
2427 inst->mlen++;
2428 }
2429 } else /* brw->gen == 4 */ {
2430 mrf = param_base;
2431 writemask = WRITEMASK_W;
2432 }
2433 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2434 } else if (ir->op == ir_txf) {
2435 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2436 } else if (ir->op == ir_txf_ms) {
2437 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2438 sample_index));
2439 if (brw->gen >= 7)
2440 /* MCS data is in the first channel of `mcs`, but we need to get it into
2441 * the .y channel of the second vec4 of params, so replicate .x across
2442 * the whole vec4 and then mask off everything except .y
2443 */
2444 mcs.swizzle = BRW_SWIZZLE_XXXX;
2445 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2446 mcs));
2447 inst->mlen++;
2448 } else if (ir->op == ir_txd) {
2449 const glsl_type *type = lod_type;
2450
2451 if (brw->gen >= 5) {
2452 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2453 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2454 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2455 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2456 inst->mlen++;
2457
2458 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2459 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2460 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2461 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2462 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2463 inst->mlen++;
2464
2465 if (ir->shadow_comparitor) {
2466 emit(MOV(dst_reg(MRF, param_base + 2,
2467 ir->shadow_comparitor->type, WRITEMASK_Z),
2468 shadow_comparitor));
2469 }
2470 }
2471 } else /* brw->gen == 4 */ {
2472 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2473 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2474 inst->mlen += 2;
2475 }
2476 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2477 if (ir->shadow_comparitor) {
2478 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2479 shadow_comparitor));
2480 }
2481
2482 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2483 offset_value));
2484 inst->mlen++;
2485 }
2486 }
2487
2488 emit(inst);
2489
2490 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2491 * spec requires layers.
2492 */
2493 if (ir->op == ir_txs) {
2494 glsl_type const *type = ir->sampler->type;
2495 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2496 type->sampler_array) {
2497 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2498 with_writemask(inst->dst, WRITEMASK_Z),
2499 src_reg(inst->dst), src_reg(6));
2500 }
2501 }
2502
2503 if (brw->gen == 6 && ir->op == ir_tg4) {
2504 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2505 }
2506
2507 swizzle_result(ir, src_reg(inst->dst), sampler);
2508 }
2509
2510 /**
2511 * Apply workarounds for Gen6 gather with UINT/SINT
2512 */
2513 void
2514 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2515 {
2516 if (!wa)
2517 return;
2518
2519 int width = (wa & WA_8BIT) ? 8 : 16;
2520 dst_reg dst_f = dst;
2521 dst_f.type = BRW_REGISTER_TYPE_F;
2522
2523 /* Convert from UNORM to UINT */
2524 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2525 emit(MOV(dst, src_reg(dst_f)));
2526
2527 if (wa & WA_SIGN) {
2528 /* Reinterpret the UINT value as a signed INT value by
2529 * shifting the sign bit into place, then shifting back
2530 * preserving sign.
2531 */
2532 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2533 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2534 }
2535 }
2536
2537 /**
2538 * Set up the gather channel based on the swizzle, for gather4.
2539 */
2540 uint32_t
2541 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2542 {
2543 ir_constant *chan = ir->lod_info.component->as_constant();
2544 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2545 switch (swiz) {
2546 case SWIZZLE_X: return 0;
2547 case SWIZZLE_Y:
2548 /* gather4 sampler is broken for green channel on RG32F --
2549 * we must ask for blue instead.
2550 */
2551 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2552 return 2;
2553 return 1;
2554 case SWIZZLE_Z: return 2;
2555 case SWIZZLE_W: return 3;
2556 default:
2557 assert(!"Not reached"); /* zero, one swizzles handled already */
2558 return 0;
2559 }
2560 }
2561
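/**
 * Apply the texture swizzle from the program key to the raw sampler result,
 * writing the swizzled value into this->result.  SWIZZLE_ZERO/SWIZZLE_ONE
 * components become immediate MOVs of 0.0f/1.0f; txs, tg4 and float-typed
 * results skip swizzling entirely.
 */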
2562 void
2563 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2564 {
2565 int s = key->tex.swizzles[sampler];
2566
2567 this->result = src_reg(this, ir->type);
2568 dst_reg swizzled_result(this->result);
2569
2570 if (ir->op == ir_query_levels) {
2571 /* # levels is in .w */
2572 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2573 emit(MOV(swizzled_result, orig_val));
2574 return;
2575 }
2576
2577 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2578 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2579 emit(MOV(swizzled_result, orig_val));
2580 return;
2581 }
2582
2583
2584 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2585 int swizzle[4] = {0};
2586
2587 for (int i = 0; i < 4; i++) {
2588 switch (GET_SWZ(s, i)) {
2589 case SWIZZLE_ZERO:
2590 zero_mask |= (1 << i);
2591 break;
2592 case SWIZZLE_ONE:
2593 one_mask |= (1 << i);
2594 break;
2595 default:
2596 copy_mask |= (1 << i);
2597 swizzle[i] = GET_SWZ(s, i);
2598 break;
2599 }
2600 }
2601
2602 if (copy_mask) {
2603 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2604 swizzled_result.writemask = copy_mask;
2605 emit(MOV(swizzled_result, orig_val));
2606 }
2607
2608 if (zero_mask) {
2609 swizzled_result.writemask = zero_mask;
2610 emit(MOV(swizzled_result, src_reg(0.0f)));
2611 }
2612
2613 if (one_mask) {
2614 swizzled_result.writemask = one_mask;
2615 emit(MOV(swizzled_result, src_reg(1.0f)));
2616 }
2617 }
2618
2619 void
2620 vec4_visitor::visit(ir_return *ir)
2621 {
2622 assert(!"not reached");
2623 }
2624
2625 void
2626 vec4_visitor::visit(ir_discard *ir)
2627 {
2628 assert(!"not reached");
2629 }
2630
2631 void
2632 vec4_visitor::visit(ir_if *ir)
2633 {
2634 /* Don't point the annotation at the if statement, because then it plus
2635 * the then and else blocks get printed.
2636 */
2637 this->base_ir = ir->condition;
2638
2639 if (brw->gen == 6) {
2640 emit_if_gen6(ir);
2641 } else {
2642 uint32_t predicate;
2643 emit_bool_to_cond_code(ir->condition, &predicate);
2644 emit(IF(predicate));
2645 }
2646
2647 visit_instructions(&ir->then_instructions);
2648
2649 if (!ir->else_instructions.is_empty()) {
2650 this->base_ir = ir->condition;
2651 emit(BRW_OPCODE_ELSE);
2652
2653 visit_instructions(&ir->else_instructions);
2654 }
2655
2656 this->base_ir = ir->condition;
2657 emit(BRW_OPCODE_ENDIF);
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_emit_vertex *)
2662 {
2663 assert(!"not reached");
2664 }
2665
2666 void
2667 vec4_visitor::visit(ir_end_primitive *)
2668 {
2669 assert(!"not reached");
2670 }
2671
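/**
 * Build the MRF payload for an untyped atomic (the offset in .x of the
 * first register, then any operands) and emit the
 * SHADER_OPCODE_UNTYPED_ATOMIC message against the given surface.
 */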
2672 void
2673 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2674 dst_reg dst, src_reg offset,
2675 src_reg src0, src_reg src1)
2676 {
2677 unsigned mlen = 0;
2678
2679 /* Set the atomic operation offset. */
2680 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2681 mlen++;
2682
2683 /* Set the atomic operation arguments. */
2684 if (src0.file != BAD_FILE) {
2685 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2686 mlen++;
2687 }
2688
2689 if (src1.file != BAD_FILE) {
2690 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2691 mlen++;
2692 }
2693
2694 /* Emit the instruction. Note that this maps to the normal SIMD8
2695 * untyped atomic message on Ivy Bridge, but that's OK because
2696 * unused channels will be masked out.
2697 */
2698 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2699 src_reg(atomic_op), src_reg(surf_index));
2700 inst->base_mrf = 0;
2701 inst->mlen = mlen;
2702 }
2703
2704 void
2705 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2706 src_reg offset)
2707 {
2708 /* Set the surface read offset. */
2709 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2710
2711 /* Emit the instruction. Note that this maps to the normal SIMD8
2712 * untyped surface read message, but that's OK because unused
2713 * channels will be masked out.
2714 */
2715 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2716 dst, src_reg(surf_index));
2717 inst->base_mrf = 0;
2718 inst->mlen = 1;
2719 }
2720
2721 void
2722 vec4_visitor::emit_ndc_computation()
2723 {
2724 /* Get the position */
2725 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2726
2727 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2728 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2729 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2730
2731 current_annotation = "NDC";
2732 dst_reg ndc_w = ndc;
2733 ndc_w.writemask = WRITEMASK_W;
2734 src_reg pos_w = pos;
2735 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2736 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2737
2738 dst_reg ndc_xyz = ndc;
2739 ndc_xyz.writemask = WRITEMASK_XYZ;
2740
2741 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2742 }
2743
2744 void
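/**
 * Populate the PSIZ slot of the VUE header.  On gen4/5 this packs the point
 * size together with the user clip and negative-rhw workaround flag bits;
 * on gen6+ it writes point size (.w), layer (.y) and viewport index (.z)
 * directly into the header register.
 */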
2745 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2746 {
2747 if (brw->gen < 6 &&
2748 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2749 key->userclip_active || brw->has_negative_rhw_bug)) {
2750 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2751 dst_reg header1_w = header1;
2752 header1_w.writemask = WRITEMASK_W;
2753
2754 emit(MOV(header1, 0u));
2755
2756 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2757 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2758
2759 current_annotation = "Point size";
2760 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2761 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2762 }
2763
2764 if (key->userclip_active) {
2765 current_annotation = "Clipping flags";
2766 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2767 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2768
2769 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2770 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2771 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2772
2773 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2774 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2775 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2776 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2777 }
2778
2779 /* i965 clipping workaround:
2780 * 1) Test for -ve rhw
2781 * 2) If set,
2782 * set ndc = (0,0,0,0)
2783 * set ucp[6] = 1
2784 *
2785 * Later, clipping will detect ucp[6] and ensure the primitive is
2786 * clipped against all fixed planes.
2787 */
2788 if (brw->has_negative_rhw_bug) {
2789 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2790 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2791 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2792 vec4_instruction *inst;
2793 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2794 inst->predicate = BRW_PREDICATE_NORMAL;
2795 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2796 inst->predicate = BRW_PREDICATE_NORMAL;
2797 }
2798
2799 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2800 } else if (brw->gen < 6) {
2801 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2802 } else {
2803 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2804 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2805 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2806 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2807 }
2808 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2809 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2810 src_reg(output_reg[VARYING_SLOT_LAYER])));
2811 }
2812 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2813 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2814 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2815 }
2816 }
2817 }
2818
2819 void
2820 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2821 {
2822 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2823 *
2824 * "If a linked set of shaders forming the vertex stage contains no
2825 * static write to gl_ClipVertex or gl_ClipDistance, but the
2826 * application has requested clipping against user clip planes through
2827 * the API, then the coordinate written to gl_Position is used for
2828 * comparison against the user clip planes."
2829 *
2830 * This function is only called if the shader didn't write to
2831 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2832 * if the user wrote to it; otherwise we use gl_Position.
2833 */
2834 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2835 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2836 clip_vertex = VARYING_SLOT_POS;
2837 }
2838
2839 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2840 ++i) {
2841 reg.writemask = 1 << i;
2842 emit(DP4(reg,
2843 src_reg(output_reg[clip_vertex]),
2844 src_reg(this->userplane[i + offset])));
2845 }
2846 }
2847
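/**
 * Copy a generic varying from its output register into the URB message
 * register, saturating the fixed-function color varyings when the key
 * requests vertex color clamping.
 */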
2848 void
2849 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2850 {
2851 assert (varying < VARYING_SLOT_MAX);
2852 reg.type = output_reg[varying].type;
2853 current_annotation = output_reg_annotation[varying];
2854 /* Copy the register, saturating if necessary */
2855 vec4_instruction *inst = emit(MOV(reg,
2856 src_reg(output_reg[varying])));
2857 if ((varying == VARYING_SLOT_COL0 ||
2858 varying == VARYING_SLOT_COL1 ||
2859 varying == VARYING_SLOT_BFC0 ||
2860 varying == VARYING_SLOT_BFC1) &&
2861 key->clamp_vertex_color) {
2862 inst->saturate = true;
2863 }
2864 }
2865
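/**
 * Emit the MOV(s) that populate one VUE slot in message register @mrf,
 * handling the specially formatted slots (PSIZ header, NDC, position, edge
 * flag, padding) and falling back to a generic varying copy otherwise.
 */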
2866 void
2867 vec4_visitor::emit_urb_slot(int mrf, int varying)
2868 {
2869 struct brw_reg hw_reg = brw_message_reg(mrf);
2870 dst_reg reg = dst_reg(MRF, mrf);
2871 reg.type = BRW_REGISTER_TYPE_F;
2872
2873 switch (varying) {
2874 case VARYING_SLOT_PSIZ:
2875 /* PSIZ is always in slot 0, and is coupled with other flags. */
2876 current_annotation = "indices, point width, clip flags";
2877 emit_psiz_and_flags(hw_reg);
2878 break;
2879 case BRW_VARYING_SLOT_NDC:
2880 current_annotation = "NDC";
2881 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2882 break;
2883 case VARYING_SLOT_POS:
2884 current_annotation = "gl_Position";
2885 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2886 break;
2887 case VARYING_SLOT_EDGE:
2888 /* This is present when doing unfilled polygons. We're supposed to copy
2889 * the edge flag from the user-provided vertex array
2890       * (glEdgeFlagPointer); otherwise we copy from the current value
2891 * of that attribute (starts as 1.0f). This is then used in clipping to
2892 * determine which edges should be drawn as wireframe.
2893 */
2894 current_annotation = "edge flag";
2895 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2896 glsl_type::float_type, WRITEMASK_XYZW))));
2897 break;
2898 case BRW_VARYING_SLOT_PAD:
2899 /* No need to write to this slot */
2900 break;
2901 default:
2902 emit_generic_urb_slot(reg, varying);
2903 break;
2904 }
2905 }
2906
2907 static int
2908 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2909 {
2910 if (brw->gen >= 6) {
2911 /* URB data written (does not include the message header reg) must
2912 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2913 * section 5.4.3.2.2: URB_INTERLEAVED.
2914 *
2915 * URB entries are allocated on a multiple of 1024 bits, so an
2916 * extra 128 bits written here to make the end align to 256 is
2917 * no problem.
2918 */
2919 if ((mlen % 2) != 1)
2920 mlen++;
2921 }
2922
2923 return mlen;
2924 }
2925
2926
2927 /**
2928 * Generates the VUE payload plus the necessary URB write instructions to
2929 * output it.
2930 *
2931 * The VUE layout is documented in Volume 2a.
2932 */
2933 void
2934 vec4_visitor::emit_vertex()
2935 {
2936 /* MRF 0 is reserved for the debugger, so start with message header
2937 * in MRF 1.
2938 */
2939 int base_mrf = 1;
2940 int mrf = base_mrf;
2941 /* In the process of generating our URB write message contents, we
2942 * may need to unspill a register or load from an array. Those
2943 * reads would use MRFs 14-15.
2944 */
2945 int max_usable_mrf = 13;
2946
2947 /* The following assertion verifies that max_usable_mrf causes an
2948 * even-numbered amount of URB write data, which will meet gen6's
2949 * requirements for length alignment.
2950 */
2951 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2952
2953 /* First mrf is the g0-based message header containing URB handles and
2954 * such.
2955 */
2956 emit_urb_write_header(mrf++);
2957
2958 if (brw->gen < 6) {
2959 emit_ndc_computation();
2960 }
2961
2962 /* Lower legacy ff and ClipVertex clipping to clip distances */
2963 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2964 current_annotation = "user clip distances";
2965
2966 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2967 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2968
2969 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2970 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2971 }
2972
2973 /* We may need to split this up into several URB writes, so do them in a
2974 * loop.
2975 */
2976 int slot = 0;
2977 bool complete = false;
2978 do {
2979 /* URB offset is in URB row increments, and each of our MRFs is half of
2980 * one of those, since we're doing interleaved writes.
2981 */
2982 int offset = slot / 2;
2983
2984 mrf = base_mrf + 1;
2985 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2986 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2987
2988 /* If this was max_usable_mrf, we can't fit anything more into this
2989 * URB WRITE.
2990 */
2991 if (mrf > max_usable_mrf) {
2992 slot++;
2993 break;
2994 }
2995 }
2996
2997 complete = slot >= prog_data->vue_map.num_slots;
2998 current_annotation = "URB write";
2999 vec4_instruction *inst = emit_urb_write_opcode(complete);
3000 inst->base_mrf = base_mrf;
3001 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3002 inst->offset += offset;
3003 } while(!complete);
3004 }
3005
3006
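/**
 * Compute the message header offset for a scratch access at @reg_offset,
 * scaled by 2 for the interleaved vec4 layout (and to bytes on gen4/5).
 * With a relative address the scaling is done in a temporary emitted
 * before @inst.
 */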
3007 src_reg
3008 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3009 src_reg *reladdr, int reg_offset)
3010 {
3011 /* Because we store the values to scratch interleaved like our
3012 * vertex data, we need to scale the vec4 index by 2.
3013 */
3014 int message_header_scale = 2;
3015
3016 /* Pre-gen6, the message header uses byte offsets instead of vec4
3017 * (16-byte) offset units.
3018 */
3019 if (brw->gen < 6)
3020 message_header_scale *= 16;
3021
3022 if (reladdr) {
3023 src_reg index = src_reg(this, glsl_type::int_type);
3024
3025 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3026 emit_before(inst, MUL(dst_reg(index),
3027 index, src_reg(message_header_scale)));
3028
3029 return index;
3030 } else {
3031 return src_reg(reg_offset * message_header_scale);
3032 }
3033 }
3034
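/**
 * Compute the offset source for a pull constant load.  Relative addressing
 * gets an ADD (plus a byte scale on gen4/5) emitted before @inst; gen8+
 * puts even constant offsets in a GRF so the message can send from GRF.
 */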
3035 src_reg
3036 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3037 src_reg *reladdr, int reg_offset)
3038 {
3039 if (reladdr) {
3040 src_reg index = src_reg(this, glsl_type::int_type);
3041
3042 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3043
3044 /* Pre-gen6, the message header uses byte offsets instead of vec4
3045 * (16-byte) offset units.
3046 */
3047 if (brw->gen < 6) {
3048 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3049 }
3050
3051 return index;
3052 } else if (brw->gen >= 8) {
3053 /* Store the offset in a GRF so we can send-from-GRF. */
3054 src_reg offset = src_reg(this, glsl_type::int_type);
3055 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3056 return offset;
3057 } else {
3058 int message_header_scale = brw->gen < 6 ? 16 : 1;
3059 return src_reg(reg_offset * message_header_scale);
3060 }
3061 }
3062
3063 /**
3064 * Emits an instruction before @inst to load the value named by @orig_src
3065 * from scratch space at @base_offset to @temp.
3066 *
3067 * @base_offset is measured in 32-byte units (the size of a register).
3068 */
3069 void
3070 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3071 dst_reg temp, src_reg orig_src,
3072 int base_offset)
3073 {
3074 int reg_offset = base_offset + orig_src.reg_offset;
3075 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3076
3077 emit_before(inst, SCRATCH_READ(temp, index));
3078 }
3079
3080 /**
3081 * Emits an instruction after @inst to store the value to be written
3082 * to @orig_dst to scratch space at @base_offset, from @temp.
3083 *
3084 * @base_offset is measured in 32-byte units (the size of a register).
3085 */
3086 void
3087 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3088 {
3089 int reg_offset = base_offset + inst->dst.reg_offset;
3090 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3091
3092 /* Create a temporary register to store *inst's result in.
3093 *
3094 * We have to be careful in MOVing from our temporary result register in
3095 * the scratch write. If we swizzle from channels of the temporary that
3096 * weren't initialized, it will confuse live interval analysis, which will
3097 * make spilling fail to make progress.
3098 */
3099 src_reg temp = src_reg(this, glsl_type::vec4_type);
3100 temp.type = inst->dst.type;
3101 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3102 int swizzles[4];
3103 for (int i = 0; i < 4; i++)
3104 if (inst->dst.writemask & (1 << i))
3105 swizzles[i] = i;
3106 else
3107 swizzles[i] = first_writemask_chan;
3108 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3109 swizzles[2], swizzles[3]);
3110
3111 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3112 inst->dst.writemask));
3113 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3114 write->predicate = inst->predicate;
3115 write->ir = inst->ir;
3116 write->annotation = inst->annotation;
3117 inst->insert_after(write);
3118
3119 inst->dst.file = temp.file;
3120 inst->dst.reg = temp.reg;
3121 inst->dst.reg_offset = temp.reg_offset;
3122 inst->dst.reladdr = NULL;
3123 }
3124
3125 /**
3126 * We can't generally support array access in GRF space, because a
3127 * single instruction's destination can only span 2 contiguous
3128 * registers. So, we send all GRF arrays that get variable index
3129 * access to scratch space.
3130 */
3131 void
3132 vec4_visitor::move_grf_array_access_to_scratch()
3133 {
3134 int scratch_loc[this->virtual_grf_count];
3135
3136 for (int i = 0; i < this->virtual_grf_count; i++) {
3137 scratch_loc[i] = -1;
3138 }
3139
3140 /* First, calculate the set of virtual GRFs that need to be punted
3141 * to scratch due to having any array access on them, and where in
3142 * scratch.
3143 */
3144 foreach_list(node, &this->instructions) {
3145 vec4_instruction *inst = (vec4_instruction *)node;
3146
3147 if (inst->dst.file == GRF && inst->dst.reladdr &&
3148 scratch_loc[inst->dst.reg] == -1) {
3149 scratch_loc[inst->dst.reg] = c->last_scratch;
3150 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3151 }
3152
3153 for (int i = 0 ; i < 3; i++) {
3154 src_reg *src = &inst->src[i];
3155
3156 if (src->file == GRF && src->reladdr &&
3157 scratch_loc[src->reg] == -1) {
3158 scratch_loc[src->reg] = c->last_scratch;
3159 c->last_scratch += this->virtual_grf_sizes[src->reg];
3160 }
3161 }
3162 }
3163
3164 /* Now, for anything that will be accessed through scratch, rewrite
3165 * it to load/store. Note that this is a _safe list walk, because
3166 * we may generate a new scratch_write instruction after the one
3167 * we're processing.
3168 */
3169 foreach_list_safe(node, &this->instructions) {
3170 vec4_instruction *inst = (vec4_instruction *)node;
3171
3172       /* Set up the annotation tracking for newly generated instructions. */
3173 base_ir = inst->ir;
3174 current_annotation = inst->annotation;
3175
3176 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3177 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3178 }
3179
3180 for (int i = 0 ; i < 3; i++) {
3181 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3182 continue;
3183
3184 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3185
3186 emit_scratch_read(inst, temp, inst->src[i],
3187 scratch_loc[inst->src[i].reg]);
3188
3189 inst->src[i].file = temp.file;
3190 inst->src[i].reg = temp.reg;
3191 inst->src[i].reg_offset = temp.reg_offset;
3192 inst->src[i].reladdr = NULL;
3193 }
3194 }
3195 }
3196
3197 /**
3198 * Emits an instruction before @inst to load the value named by @orig_src
3199 * from the pull constant buffer (surface) at @base_offset to @temp.
3200 */
3201 void
3202 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3203 dst_reg temp, src_reg orig_src,
3204 int base_offset)
3205 {
3206 int reg_offset = base_offset + orig_src.reg_offset;
3207 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3208 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3209 vec4_instruction *load;
3210
3211 if (brw->gen >= 7) {
3212 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3213 grf_offset.type = offset.type;
3214 emit_before(inst, MOV(grf_offset, offset));
3215
3216 load = new(mem_ctx) vec4_instruction(this,
3217 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3218 temp, index, src_reg(grf_offset));
3219 } else {
3220 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3221 temp, index, offset);
3222 load->base_mrf = 14;
3223 load->mlen = 1;
3224 }
3225 emit_before(inst, load);
3226 }
3227
3228 /**
3229 * Implements array access of uniforms by inserting a
3230 * PULL_CONSTANT_LOAD instruction.
3231 *
3232 * Unlike temporary GRF array access (where we don't support it due to
3233 * the difficulty of doing relative addressing on instruction
3234 * destinations), we could potentially do array access of uniforms
3235 * that were loaded in GRF space as push constants. In real-world
3236 * usage we've seen, though, the arrays being used are always larger
3237 * than we could load as push constants, so just always move all
3238 * uniform array access out to a pull constant buffer.
3239 */
3240 void
3241 vec4_visitor::move_uniform_array_access_to_pull_constants()
3242 {
3243 int pull_constant_loc[this->uniforms];
3244
3245 for (int i = 0; i < this->uniforms; i++) {
3246 pull_constant_loc[i] = -1;
3247 }
3248
3249 /* Walk through and find array access of uniforms. Put a copy of that
3250 * uniform in the pull constant buffer.
3251 *
3252 * Note that we don't move constant-indexed accesses to arrays. No
3253 * testing has been done of the performance impact of this choice.
3254 */
3255 foreach_list_safe(node, &this->instructions) {
3256 vec4_instruction *inst = (vec4_instruction *)node;
3257
3258 for (int i = 0 ; i < 3; i++) {
3259 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3260 continue;
3261
3262 int uniform = inst->src[i].reg;
3263
3264 /* If this array isn't already present in the pull constant buffer,
3265 * add it.
3266 */
3267 if (pull_constant_loc[uniform] == -1) {
3268 const float **values = &stage_prog_data->param[uniform * 4];
3269
3270 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3271
3272 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3273 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3274 = values[j];
3275 }
3276 }
3277
3278          /* Set up the annotation tracking for newly generated instructions. */
3279 base_ir = inst->ir;
3280 current_annotation = inst->annotation;
3281
3282 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3283
3284 emit_pull_constant_load(inst, temp, inst->src[i],
3285 pull_constant_loc[uniform]);
3286
3287 inst->src[i].file = temp.file;
3288 inst->src[i].reg = temp.reg;
3289 inst->src[i].reg_offset = temp.reg_offset;
3290 inst->src[i].reladdr = NULL;
3291 }
3292 }
3293
3294 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3295 * no need to track them as larger-than-vec4 objects. This will be
3296 * relied on in cutting out unused uniform vectors from push
3297 * constants.
3298 */
3299 split_uniform_registers();
3300 }
3301
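/**
 * Resolve the negate modifier on a UD-typed source by applying it in an
 * explicit MOV into a uvec4 temporary, so later code never sees a negated
 * UD source directly.
 */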
3302 void
3303 vec4_visitor::resolve_ud_negate(src_reg *reg)
3304 {
3305 if (reg->type != BRW_REGISTER_TYPE_UD ||
3306 !reg->negate)
3307 return;
3308
3309 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3310 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3311 *reg = temp;
3312 }
3313
3314 vec4_visitor::vec4_visitor(struct brw_context *brw,
3315 struct brw_vec4_compile *c,
3316 struct gl_program *prog,
3317 const struct brw_vec4_prog_key *key,
3318 struct brw_vec4_prog_data *prog_data,
3319 struct gl_shader_program *shader_prog,
3320 struct brw_shader *shader,
3321 void *mem_ctx,
3322 bool debug_flag,
3323 bool no_spills,
3324 shader_time_shader_type st_base,
3325 shader_time_shader_type st_written,
3326 shader_time_shader_type st_reset)
3327 : sanity_param_count(0),
3328 fail_msg(NULL),
3329 first_non_payload_grf(0),
3330 need_all_constants_in_pull_buffer(false),
3331 debug_flag(debug_flag),
3332 no_spills(no_spills),
3333 st_base(st_base),
3334 st_written(st_written),
3335 st_reset(st_reset)
3336 {
3337 this->brw = brw;
3338 this->ctx = &brw->ctx;
3339 this->shader_prog = shader_prog;
3340 this->shader = shader;
3341
3342 this->mem_ctx = mem_ctx;
3343 this->failed = false;
3344
3345 this->base_ir = NULL;
3346 this->current_annotation = NULL;
3347 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3348
3349 this->c = c;
3350 this->prog = prog;
3351 this->key = key;
3352 this->prog_data = prog_data;
3353 this->stage_prog_data = &prog_data->base;
3354
3355 this->variable_ht = hash_table_ctor(0,
3356 hash_table_pointer_hash,
3357 hash_table_pointer_compare);
3358
3359 this->virtual_grf_start = NULL;
3360 this->virtual_grf_end = NULL;
3361 this->virtual_grf_sizes = NULL;
3362 this->virtual_grf_count = 0;
3363 this->virtual_grf_reg_map = NULL;
3364 this->virtual_grf_reg_count = 0;
3365 this->virtual_grf_array_size = 0;
3366 this->live_intervals_valid = false;
3367
3368 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3369
3370 this->uniforms = 0;
3371 }
3372
3373 vec4_visitor::~vec4_visitor()
3374 {
3375 hash_table_dtor(this->variable_ht);
3376 }
3377
3378
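/**
 * Record a compile failure: only the first failure is kept, the formatted
 * message is stashed in fail_msg for the caller to report, and it is
 * printed immediately when the debug flag is set.
 */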
3379 void
3380 vec4_visitor::fail(const char *format, ...)
3381 {
3382 va_list va;
3383 char *msg;
3384
3385 if (failed)
3386 return;
3387
3388 failed = true;
3389
3390 va_start(va, format);
3391 msg = ralloc_vasprintf(mem_ctx, format, va);
3392 va_end(va);
3393 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3394
3395 this->fail_msg = msg;
3396
3397 if (debug_flag) {
3398 fprintf(stderr, "%s", msg);
3399 }
3400 }
3401
3402 } /* namespace brw */