src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  34                                    const src_reg &src0, const src_reg &src1,
  35                                    const src_reg &src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->saturate = false;
  43    this->force_writemask_all = false;
  44    this->no_dd_clear = false;
  45    this->no_dd_check = false;
  46    this->writes_accumulator = false;
  47    this->conditional_mod = BRW_CONDITIONAL_NONE;
  48    this->predicate = BRW_PREDICATE_NONE;
  49    this->predicate_inverse = false;
  50    this->target = 0;
  51    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  52    this->shadow_compare = false;
  53    this->ir = NULL;
  54    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  55    this->header_size = 0;
  56    this->flag_subreg = 0;
  57    this->mlen = 0;
  58    this->base_mrf = 0;
  59    this->offset = 0;
  60    this->annotation = NULL;
  61 }
  62
  63 vec4_instruction *
  64 vec4_visitor::emit(vec4_instruction *inst)
  65 {
  66    inst->ir = this->base_ir;
  67    inst->annotation = this->current_annotation;
  68
  69    this->instructions.push_tail(inst);
  70
  71    return inst;
  72 }
  73
  74 vec4_instruction *
  75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  76                           vec4_instruction *new_inst)
  77 {
  78    new_inst->ir = inst->ir;
  79    new_inst->annotation = inst->annotation;
  80
  81    inst->insert_before(block, new_inst);
  82
  83    return inst;
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  88                    const src_reg &src1, const src_reg &src2)
  89 {
  90    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  91 }
  92
  93
  94 vec4_instruction *
  95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  96                    const src_reg &src1)
  97 {
  98    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  99 }
 100
 101 vec4_instruction *
 102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 103 {
 104    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 105 }
 106
 107 vec4_instruction *
 108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 109 {
 110    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 111 }
 112
 113 vec4_instruction *
 114 vec4_visitor::emit(enum opcode opcode)
 115 {
 116    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 117 }
 118
 119 #define ALU1(op)                                                        \
 120    vec4_instruction *                                                   \
 121    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 122    {                                                                    \
 123       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 124    }
 125
 126 #define ALU2(op)                                                        \
 127    vec4_instruction *                                                   \
 128    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 129                     const src_reg &src1)                                \
 130    {                                                                    \
 131       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 132                                            src0, src1);                 \
 133    }
 134
 135 #define ALU2_ACC(op)                                                    \
 136    vec4_instruction *                                                   \
 137    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 138                     const src_reg &src1)                                \
 139    {                                                                    \
 140       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 141                        BRW_OPCODE_##op, dst, src0, src1);               \
 142       inst->writes_accumulator = true;                                  \
 143       return inst;                                                      \
 144    }
 145
 146 #define ALU3(op)                                                        \
 147    vec4_instruction *                                                   \
 148    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 149                     const src_reg &src1, const src_reg &src2)           \
 150    {                                                                    \
 151       assert(devinfo->gen >= 6);                                                \
 152       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 153                                            src0, src1, src2);           \
 154    }
 155
 156 ALU1(NOT)
 157 ALU1(MOV)
 158 ALU1(FRC)
 159 ALU1(RNDD)
 160 ALU1(RNDE)
 161 ALU1(RNDZ)
 162 ALU1(F32TO16)
 163 ALU1(F16TO32)
 164 ALU2(ADD)
 165 ALU2(MUL)
 166 ALU2_ACC(MACH)
 167 ALU2(AND)
 168 ALU2(OR)
 169 ALU2(XOR)
 170 ALU2(DP3)
 171 ALU2(DP4)
 172 ALU2(DPH)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2_ACC(ADDC)
 186 ALU2_ACC(SUBB)
 187 ALU2(MAC)
 188
 189 /** Gen4 predicated IF. */
 190 vec4_instruction *
 191 vec4_visitor::IF(enum brw_predicate predicate)
 192 {
 193    vec4_instruction *inst;
 194
 195    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 196    inst->predicate = predicate;
 197
 198    return inst;
 199 }
 200
 201 /** Gen6 IF with embedded comparison. */
 202 vec4_instruction *
 203 vec4_visitor::IF(src_reg src0, src_reg src1,
 204                  enum brw_conditional_mod condition)
 205 {
 206    assert(devinfo->gen == 6);
 207
 208    vec4_instruction *inst;
 209
 210    resolve_ud_negate(&src0);
 211    resolve_ud_negate(&src1);
 212
 213    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 214                                         src0, src1);
 215    inst->conditional_mod = condition;
 216
 217    return inst;
 218 }
 219
 220 /**
 221  * CMP: Sets the low bit of the destination channels with the result
 222  * of the comparison, while the upper bits are undefined, and updates
 223  * the flag register with the packed 16 bits of the result.
 224  */
 225 vec4_instruction *
 226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 227                   enum brw_conditional_mod condition)
 228 {
 229    vec4_instruction *inst;
 230
 231    /* Take the instruction:
 232     *
 233     * CMP null<d> src0<f> src1<f>
 234     *
 235     * Original gen4 does type conversion to the destination type before
 236     * comparison, producing garbage results for floating point comparisons.
 237     *
 238     * The destination type doesn't matter on newer generations, so we set the
 239     * type to match src0 so we can compact the instruction.
 240     */
 241    dst.type = src0.type;
 242    if (dst.file == HW_REG)
 243       dst.fixed_hw_reg.type = dst.type;
 244
 245    resolve_ud_negate(&src0);
 246    resolve_ud_negate(&src1);
 247
 248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 249    inst->conditional_mod = condition;
 250
 251    return inst;
 252 }
 253
 254 vec4_instruction *
 255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 256 {
 257    vec4_instruction *inst;
 258
 259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 260                                         dst, index);
 261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 262    inst->mlen = 2;
 263
 264    return inst;
 265 }
 266
 267 vec4_instruction *
 268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 269                             const src_reg &index)
 270 {
 271    vec4_instruction *inst;
 272
 273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 274                                         dst, src, index);
 275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 276    inst->mlen = 3;
 277
 278    return inst;
 279 }
 280
 281 void
 282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 283 {
 284    static enum opcode dot_opcodes[] = {
 285       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 286    };
 287
 288    emit(dot_opcodes[elements - 2], dst, src0, src1);
 289 }
 290
 291 src_reg
 292 vec4_visitor::fix_3src_operand(const src_reg &src)
 293 {
 294    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 295     * able to use vertical stride of zero to replicate the vec4 uniform, like
 296     *
 297     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 298     *
 299     * But you can't, since vertical stride is always four in three-source
 300     * instructions. Instead, insert a MOV instruction to do the replication so
 301     * that the three-source instruction can consume it.
 302     */
 303
 304    /* The MOV is only needed if the source is a uniform or immediate. */
 305    if (src.file != UNIFORM && src.file != IMM)
 306       return src;
 307
 308    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 309       return src;
 310
 311    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 312    expanded.type = src.type;
 313    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 314    return src_reg(expanded);
 315 }
 316
 317 src_reg
 318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 319 {
 320    if (!src.abs && !src.negate)
 321       return src;
 322
 323    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 324    resolved.type = src.type;
 325    emit(MOV(resolved, src));
 326
 327    return src_reg(resolved);
 328 }
 329
 330 src_reg
 331 vec4_visitor::fix_math_operand(const src_reg &src)
 332 {
 333    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 334       return src;
 335
 336    /* The gen6 math instruction ignores the source modifiers --
 337     * swizzle, abs, negate, and at least some parts of the register
 338     * region description.
 339     *
 340     * Rather than trying to enumerate all these cases, *always* expand the
 341     * operand to a temp GRF for gen6.
 342     *
 343     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 344     * can't use.
 345     */
 346
 347    if (devinfo->gen == 7 && src.file != IMM)
 348       return src;
 349
 350    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 351    expanded.type = src.type;
 352    emit(MOV(expanded, src));
 353    return src_reg(expanded);
 354 }
 355
 356 vec4_instruction *
 357 vec4_visitor::emit_math(enum opcode opcode,
 358                         const dst_reg &dst,
 359                         const src_reg &src0, const src_reg &src1)
 360 {
 361    vec4_instruction *math =
 362       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 363
 364    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 365       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 366       math->dst = dst_reg(this, glsl_type::vec4_type);
 367       math->dst.type = dst.type;
 368       math = emit(MOV(dst, src_reg(math->dst)));
 369    } else if (devinfo->gen < 6) {
 370       math->base_mrf = 1;
 371       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 372    }
 373
 374    return math;
 375 }
 376
 377 void
 378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 379 {
 380    if (devinfo->gen < 7) {
 381       unreachable("ir_unop_pack_half_2x16 should be lowered");
 382    }
 383
 384    assert(dst.type == BRW_REGISTER_TYPE_UD);
 385    assert(src0.type == BRW_REGISTER_TYPE_F);
 386
 387    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 388     *
 389     *   Because this instruction does not have a 16-bit floating-point type,
 390     *   the destination data type must be Word (W).
 391     *
 392     *   The destination must be DWord-aligned and specify a horizontal stride
 393     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 394     *   each destination channel and the upper word is not modified.
 395     *
 396     * The above restriction implies that the f32to16 instruction must use
 397     * align1 mode, because only in align1 mode is it possible to specify
 398     * horizontal stride.  We choose here to defy the hardware docs and emit
 399     * align16 instructions.
 400     *
 401     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 402     * instructions. I was partially successful in that the code passed all
 403     * tests.  However, the code was dubiously correct and fragile, and the
 404     * tests were not harsh enough to probe that frailty. Not trusting the
 405     * code, I chose instead to remain in align16 mode in defiance of the hw
 406     * docs).
 407     *
 408     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 409     * simulator, emitting a f32to16 in align16 mode with UD as destination
 410     * data type is safe. The behavior differs from that specified in the PRM
 411     * in that the upper word of each destination channel is cleared to 0.
 412     */
 413
 414    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 415    src_reg tmp_src(tmp_dst);
 416
 417 #if 0
 418    /* Verify the undocumented behavior on which the following instructions
 419     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 420     * then the result of the bit-or instruction below will be incorrect.
 421     *
 422     * You should inspect the disasm output in order to verify that the MOV is
 423     * not optimized away.
 424     */
 425    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 426 #endif
 427
 428    /* Give tmp the form below, where "." means untouched.
 429     *
 430     *     w z          y          x w z          y          x
 431     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 432     *
 433     * That the upper word of each write-channel be 0 is required for the
 434     * following bit-shift and bit-or instructions to work. Note that this
 435     * relies on the undocumented hardware behavior mentioned above.
 436     */
 437    tmp_dst.writemask = WRITEMASK_XY;
 438    emit(F32TO16(tmp_dst, src0));
 439
 440    /* Give the write-channels of dst the form:
 441     *   0xhhhh0000
 442     */
 443    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 444    emit(SHL(dst, tmp_src, src_reg(16u)));
 445
 446    /* Finally, give the write-channels of dst the form of packHalf2x16's
 447     * output:
 448     *   0xhhhhllll
 449     */
 450    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 451    emit(OR(dst, src_reg(dst), tmp_src));
 452 }
 453
 454 void
 455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 456 {
 457    if (devinfo->gen < 7) {
 458       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 459    }
 460
 461    assert(dst.type == BRW_REGISTER_TYPE_F);
 462    assert(src0.type == BRW_REGISTER_TYPE_UD);
 463
 464    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 465     *
 466     *   Because this instruction does not have a 16-bit floating-point type,
 467     *   the source data type must be Word (W). The destination type must be
 468     *   F (Float).
 469     *
 470     * To use W as the source data type, we must adjust horizontal strides,
 471     * which is only possible in align1 mode. All my [chadv] attempts at
 472     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 473     * Piglit tests, so I gave up.
 474     *
 475     * I've verified that, on gen7 hardware and the simulator, it is safe to
 476     * emit f16to32 in align16 mode with UD as source data type.
 477     */
 478
 479    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 480    src_reg tmp_src(tmp_dst);
 481
 482    tmp_dst.writemask = WRITEMASK_X;
 483    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 484
 485    tmp_dst.writemask = WRITEMASK_Y;
 486    emit(SHR(tmp_dst, src0, src_reg(16u)));
 487
 488    dst.writemask = WRITEMASK_XY;
 489    emit(F16TO32(dst, tmp_src));
 490 }
 491
 492 void
 493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 494 {
 495    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 496     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 497     * is not suitable to generate the shift values, but we can use the packed
 498     * vector float and a type-converting MOV.
 499     */
 500    dst_reg shift(this, glsl_type::uvec4_type);
 501    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 502
 503    dst_reg shifted(this, glsl_type::uvec4_type);
 504    src0.swizzle = BRW_SWIZZLE_XXXX;
 505    emit(SHR(shifted, src0, src_reg(shift)));
 506
 507    shifted.type = BRW_REGISTER_TYPE_UB;
 508    dst_reg f(this, glsl_type::vec4_type);
 509    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 510
 511    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 512 }
 513
 514 void
 515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 516 {
 517    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 518     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 519     * is not suitable to generate the shift values, but we can use the packed
 520     * vector float and a type-converting MOV.
 521     */
 522    dst_reg shift(this, glsl_type::uvec4_type);
 523    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 524
 525    dst_reg shifted(this, glsl_type::uvec4_type);
 526    src0.swizzle = BRW_SWIZZLE_XXXX;
 527    emit(SHR(shifted, src0, src_reg(shift)));
 528
 529    shifted.type = BRW_REGISTER_TYPE_B;
 530    dst_reg f(this, glsl_type::vec4_type);
 531    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 532
 533    dst_reg scaled(this, glsl_type::vec4_type);
 534    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 535
 536    dst_reg max(this, glsl_type::vec4_type);
 537    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 538    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 539 }
 540
 541 void
 542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 543 {
 544    dst_reg saturated(this, glsl_type::vec4_type);
 545    vec4_instruction *inst = emit(MOV(saturated, src0));
 546    inst->saturate = true;
 547
 548    dst_reg scaled(this, glsl_type::vec4_type);
 549    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 550
 551    dst_reg rounded(this, glsl_type::vec4_type);
 552    emit(RNDE(rounded, src_reg(scaled)));
 553
 554    dst_reg u(this, glsl_type::uvec4_type);
 555    emit(MOV(u, src_reg(rounded)));
 556
 557    src_reg bytes(u);
 558    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 559 }
 560
 561 void
 562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 563 {
 564    dst_reg max(this, glsl_type::vec4_type);
 565    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 566
 567    dst_reg min(this, glsl_type::vec4_type);
 568    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 569
 570    dst_reg scaled(this, glsl_type::vec4_type);
 571    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 572
 573    dst_reg rounded(this, glsl_type::vec4_type);
 574    emit(RNDE(rounded, src_reg(scaled)));
 575
 576    dst_reg i(this, glsl_type::ivec4_type);
 577    emit(MOV(i, src_reg(rounded)));
 578
 579    src_reg bytes(i);
 580    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 581 }
 582
 583 /**
 584  * Returns the minimum number of vec4 elements needed to pack a type.
 585  *
 586  * For simple types, it will return 1 (a single vec4); for matrices, the
 587  * number of columns; for array and struct, the sum of the vec4_size of
 588  * each of its elements; and for sampler and atomic, zero.
 589  *
 590  * This method is useful to calculate how much register space is needed to
 591  * store a particular type.
 592  */
 593 extern "C" int
 594 type_size_vec4(const struct glsl_type *type)
 595 {
 596    unsigned int i;
 597    int size;
 598
 599    switch (type->base_type) {
 600    case GLSL_TYPE_UINT:
 601    case GLSL_TYPE_INT:
 602    case GLSL_TYPE_FLOAT:
 603    case GLSL_TYPE_BOOL:
 604       if (type->is_matrix()) {
 605          return type->matrix_columns;
 606       } else {
 607          /* Regardless of size of vector, it gets a vec4. This is bad
 608           * packing for things like floats, but otherwise arrays become a
 609           * mess.  Hopefully a later pass over the code can pack scalars
 610           * down if appropriate.
 611           */
 612          return 1;
 613       }
 614    case GLSL_TYPE_ARRAY:
 615       assert(type->length > 0);
 616       return type_size_vec4(type->fields.array) * type->length;
 617    case GLSL_TYPE_STRUCT:
 618       size = 0;
 619       for (i = 0; i < type->length; i++) {
 620          size += type_size_vec4(type->fields.structure[i].type);
 621       }
 622       return size;
 623    case GLSL_TYPE_SUBROUTINE:
 624       return 1;
 625
 626    case GLSL_TYPE_SAMPLER:
 627       /* Samplers take up no register space, since they're baked in at
 628        * link time.
 629        */
 630       return 0;
 631    case GLSL_TYPE_ATOMIC_UINT:
 632       return 0;
 633    case GLSL_TYPE_IMAGE:
 634       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 635    case GLSL_TYPE_VOID:
 636    case GLSL_TYPE_DOUBLE:
 637    case GLSL_TYPE_ERROR:
 638    case GLSL_TYPE_INTERFACE:
 639       unreachable("not reached");
 640    }
 641
 642    return 0;
 643 }
 644
 645 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 646 {
 647    init();
 648
 649    this->file = GRF;
 650    this->reg = v->alloc.allocate(type_size_vec4(type));
 651
 652    if (type->is_array() || type->is_record()) {
 653       this->swizzle = BRW_SWIZZLE_NOOP;
 654    } else {
 655       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 656    }
 657
 658    this->type = brw_type_for_base_type(type);
 659 }
 660
 661 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 662 {
 663    assert(size > 0);
 664
 665    init();
 666
 667    this->file = GRF;
 668    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 669
 670    this->swizzle = BRW_SWIZZLE_NOOP;
 671
 672    this->type = brw_type_for_base_type(type);
 673 }
 674
 675 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 676 {
 677    init();
 678
 679    this->file = GRF;
 680    this->reg = v->alloc.allocate(type_size_vec4(type));
 681
 682    if (type->is_array() || type->is_record()) {
 683       this->writemask = WRITEMASK_XYZW;
 684    } else {
 685       this->writemask = (1 << type->vector_elements) - 1;
 686    }
 687
 688    this->type = brw_type_for_base_type(type);
 689 }
 690
 691 void
 692 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
 693                                        const gl_constant_value *values,
 694                                        unsigned n)
 695 {
 696    static const gl_constant_value zero = { 0 };
 697
 698    assert(param_offset % 4 == 0);
 699
 700    for (unsigned i = 0; i < n; ++i)
 701       stage_prog_data->param[param_offset + i] = &values[i];
 702
 703    for (unsigned i = n; i < 4; ++i)
 704       stage_prog_data->param[param_offset + i] = &zero;
 705 }
 706
 707 vec4_instruction *
 708 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 709                           src_reg src0, src_reg src1)
 710 {
 711    vec4_instruction *inst;
 712
 713    if (devinfo->gen >= 6) {
 714       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 715       inst->conditional_mod = conditionalmod;
 716    } else {
 717       emit(CMP(dst, src0, src1, conditionalmod));
 718
 719       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 720       inst->predicate = BRW_PREDICATE_NORMAL;
 721    }
 722
 723    return inst;
 724 }
 725
 726 vec4_instruction *
 727 vec4_visitor::emit_lrp(const dst_reg &dst,
 728                        const src_reg &x, const src_reg &y, const src_reg &a)
 729 {
 730    if (devinfo->gen >= 6) {
 731       /* Note that the instruction's argument order is reversed from GLSL
 732        * and the IR.
 733        */
 734      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 735                      fix_3src_operand(x)));
 736    } else {
 737       /* Earlier generations don't support three source operations, so we
 738        * need to emit x*(1-a) + y*a.
 739        */
 740       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 741       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 742       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 743       y_times_a.writemask           = dst.writemask;
 744       one_minus_a.writemask         = dst.writemask;
 745       x_times_one_minus_a.writemask = dst.writemask;
 746
 747       emit(MUL(y_times_a, y, a));
 748       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
 749       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 750       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 751    }
 752 }
 753
 754 /**
 755  * Emits the instructions needed to perform a pull constant load. before_block
 756  * and before_inst can be NULL in which case the instruction will be appended
 757  * to the end of the instruction list.
 758  */
 759 void
 760 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 761                                           src_reg surf_index,
 762                                           src_reg offset_reg,
 763                                           bblock_t *before_block,
 764                                           vec4_instruction *before_inst)
 765 {
 766    assert((before_inst == NULL && before_block == NULL) ||
 767           (before_inst && before_block));
 768
 769    vec4_instruction *pull;
 770
 771    if (devinfo->gen >= 9) {
 772       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 773       src_reg header(this, glsl_type::uvec4_type, 2);
 774
 775       pull = new(mem_ctx)
 776          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 777                           dst_reg(header));
 778
 779       if (before_inst)
 780          emit_before(before_block, before_inst, pull);
 781       else
 782          emit(pull);
 783
 784       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 785                                  offset_reg.type);
 786       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 787
 788       if (before_inst)
 789          emit_before(before_block, before_inst, pull);
 790       else
 791          emit(pull);
 792
 793       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 794                                            dst,
 795                                            surf_index,
 796                                            header);
 797       pull->mlen = 2;
 798       pull->header_size = 1;
 799    } else if (devinfo->gen >= 7) {
 800       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 801
 802       grf_offset.type = offset_reg.type;
 803
 804       pull = MOV(grf_offset, offset_reg);
 805
 806       if (before_inst)
 807          emit_before(before_block, before_inst, pull);
 808       else
 809          emit(pull);
 810
 811       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 812                                            dst,
 813                                            surf_index,
 814                                            src_reg(grf_offset));
 815       pull->mlen = 1;
 816    } else {
 817       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 818                                            dst,
 819                                            surf_index,
 820                                            offset_reg);
 821       pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 822       pull->mlen = 1;
 823    }
 824
 825    if (before_inst)
 826       emit_before(before_block, before_inst, pull);
 827    else
 828       emit(pull);
 829 }
 830
 831 src_reg
 832 vec4_visitor::emit_uniformize(const src_reg &src)
 833 {
 834    const src_reg chan_index(this, glsl_type::uint_type);
 835    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 836                               src.type);
 837
 838    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 839       ->force_writemask_all = true;
 840    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 841       ->force_writemask_all = true;
 842
 843    return src_reg(dst);
 844 }
 845
 846 src_reg
 847 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 848                              src_reg coordinate, src_reg sampler)
 849 {
 850    vec4_instruction *inst =
 851       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 852                                     dst_reg(this, glsl_type::uvec4_type));
 853    inst->base_mrf = 2;
 854    inst->src[1] = sampler;
 855
 856    int param_base;
 857
 858    if (devinfo->gen >= 9) {
 859       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 860       vec4_instruction *header_inst = new(mem_ctx)
 861          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 862                           dst_reg(MRF, inst->base_mrf));
 863
 864       emit(header_inst);
 865
 866       inst->mlen = 2;
 867       inst->header_size = 1;
 868       param_base = inst->base_mrf + 1;
 869    } else {
 870       inst->mlen = 1;
 871       param_base = inst->base_mrf;
 872    }
 873
 874    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 875    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 876    int zero_mask = 0xf & ~coord_mask;
 877
 878    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 879             coordinate));
 880
 881    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 882             src_reg(0)));
 883
 884    emit(inst);
 885    return src_reg(inst->dst);
 886 }
 887
 888 bool
 889 vec4_visitor::is_high_sampler(src_reg sampler)
 890 {
 891    if (devinfo->gen < 8 && !devinfo->is_haswell)
 892       return false;
 893
 894    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 895 }
 896
 897 void
 898 vec4_visitor::emit_texture(ir_texture_opcode op,
 899                            dst_reg dest,
 900                            const glsl_type *dest_type,
 901                            src_reg coordinate,
 902                            int coord_components,
 903                            src_reg shadow_comparitor,
 904                            src_reg lod, src_reg lod2,
 905                            src_reg sample_index,
 906                            uint32_t constant_offset,
 907                            src_reg offset_value,
 908                            src_reg mcs,
 909                            bool is_cube_array,
 910                            uint32_t sampler,
 911                            src_reg sampler_reg)
 912 {
 913    enum opcode opcode;
 914    switch (op) {
 915    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 916    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 917    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 918    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 919    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
 920    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 921    case ir_tg4: opcode = offset_value.file != BAD_FILE
 922                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 923    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 924    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 925    case ir_txb:
 926       unreachable("TXB is not valid for vertex shaders.");
 927    case ir_lod:
 928       unreachable("LOD is not valid for vertex shaders.");
 929    default:
 930       unreachable("Unrecognized tex op");
 931    }
 932
 933    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
 934       opcode, dst_reg(this, dest_type));
 935
 936    inst->offset = constant_offset;
 937
 938    /* The message header is necessary for:
 939     * - Gen4 (always)
 940     * - Gen9+ for selecting SIMD4x2
 941     * - Texel offsets
 942     * - Gather channel selection
 943     * - Sampler indices too large to fit in a 4-bit value.
 944     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 945     */
 946    inst->header_size =
 947       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 948        inst->offset != 0 || op == ir_tg4 ||
 949        op == ir_texture_samples ||
 950        is_high_sampler(sampler_reg)) ? 1 : 0;
 951    inst->base_mrf = 2;
 952    inst->mlen = inst->header_size;
 953    inst->dst.writemask = WRITEMASK_XYZW;
 954    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 955
 956    inst->src[1] = sampler_reg;
 957
 958    /* MRF for the first parameter */
 959    int param_base = inst->base_mrf + inst->header_size;
 960
 961    if (op == ir_txs || op == ir_query_levels) {
 962       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 963       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 964       inst->mlen++;
 965    } else if (op == ir_texture_samples) {
 966       inst->dst.writemask = WRITEMASK_X;
 967    } else {
 968       /* Load the coordinate */
 969       /* FINISHME: gl_clamp_mask and saturate */
 970       int coord_mask = (1 << coord_components) - 1;
 971       int zero_mask = 0xf & ~coord_mask;
 972
 973       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 974                coordinate));
 975       inst->mlen++;
 976
 977       if (zero_mask != 0) {
 978          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
 979                   src_reg(0)));
 980       }
 981       /* Load the shadow comparitor */
 982       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
 983          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
 984                           WRITEMASK_X),
 985                   shadow_comparitor));
 986          inst->mlen++;
 987       }
 988
 989       /* Load the LOD info */
 990       if (op == ir_tex || op == ir_txl) {
 991          int mrf, writemask;
 992          if (devinfo->gen >= 5) {
 993             mrf = param_base + 1;
 994             if (shadow_comparitor.file != BAD_FILE) {
 995                writemask = WRITEMASK_Y;
 996                /* mlen already incremented */
 997             } else {
 998                writemask = WRITEMASK_X;
 999                inst->mlen++;
1000             }
1001          } else /* devinfo->gen == 4 */ {
1002             mrf = param_base;
1003             writemask = WRITEMASK_W;
1004          }
1005          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1006       } else if (op == ir_txf) {
1007          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1008       } else if (op == ir_txf_ms) {
1009          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1010                   sample_index));
1011          if (devinfo->gen >= 7) {
1012             /* MCS data is in the first channel of `mcs`, but we need to get it into
1013              * the .y channel of the second vec4 of params, so replicate .x across
1014              * the whole vec4 and then mask off everything except .y
1015              */
1016             mcs.swizzle = BRW_SWIZZLE_XXXX;
1017             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1018                      mcs));
1019          }
1020          inst->mlen++;
1021       } else if (op == ir_txd) {
1022          const brw_reg_type type = lod.type;
1023
1024          if (devinfo->gen >= 5) {
1025             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1026             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1027             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1028             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1029             inst->mlen++;
1030
1031             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1032                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1033                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1034                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1035                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1036                inst->mlen++;
1037
1038                if (shadow_comparitor.file != BAD_FILE) {
1039                   emit(MOV(dst_reg(MRF, param_base + 2,
1040                                    shadow_comparitor.type, WRITEMASK_Z),
1041                            shadow_comparitor));
1042                }
1043             }
1044          } else /* devinfo->gen == 4 */ {
1045             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1046             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1047             inst->mlen += 2;
1048          }
1049       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1050          if (shadow_comparitor.file != BAD_FILE) {
1051             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1052                      shadow_comparitor));
1053          }
1054
1055          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1056                   offset_value));
1057          inst->mlen++;
1058       }
1059    }
1060
1061    emit(inst);
1062
1063    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1064     * spec requires layers.
1065     */
1066    if (op == ir_txs && is_cube_array) {
1067       emit_math(SHADER_OPCODE_INT_QUOTIENT,
1068                 writemask(inst->dst, WRITEMASK_Z),
1069                 src_reg(inst->dst), src_reg(6));
1070    }
1071
1072    if (devinfo->gen == 6 && op == ir_tg4) {
1073       emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1074    }
1075
1076    swizzle_result(op, dest,
1077                   src_reg(inst->dst), sampler, dest_type);
1078 }
1079
1080 /**
1081  * Apply workarounds for Gen6 gather with UINT/SINT
1082  */
1083 void
1084 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1085 {
1086    if (!wa)
1087       return;
1088
1089    int width = (wa & WA_8BIT) ? 8 : 16;
1090    dst_reg dst_f = dst;
1091    dst_f.type = BRW_REGISTER_TYPE_F;
1092
1093    /* Convert from UNORM to UINT */
1094    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1095    emit(MOV(dst, src_reg(dst_f)));
1096
1097    if (wa & WA_SIGN) {
1098       /* Reinterpret the UINT value as a signed INT value by
1099        * shifting the sign bit into place, then shifting back
1100        * preserving sign.
1101        */
1102       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1103       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1104    }
1105 }
1106
1107 /**
1108  * Set up the gather channel based on the swizzle, for gather4.
1109  */
1110 uint32_t
1111 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1112 {
1113    int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1114    switch (swiz) {
1115       case SWIZZLE_X: return 0;
1116       case SWIZZLE_Y:
1117          /* gather4 sampler is broken for green channel on RG32F --
1118           * we must ask for blue instead.
1119           */
1120          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1121             return 2;
1122          return 1;
1123       case SWIZZLE_Z: return 2;
1124       case SWIZZLE_W: return 3;
1125       default:
1126          unreachable("Not reached"); /* zero, one swizzles handled already */
1127    }
1128 }
1129
1130 void
1131 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1132                              src_reg orig_val, uint32_t sampler,
1133                              const glsl_type *dest_type)
1134 {
1135    int s = key_tex->swizzles[sampler];
1136
1137    dst_reg swizzled_result = dest;
1138
1139    if (op == ir_query_levels) {
1140       /* # levels is in .w */
1141       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1142       emit(MOV(swizzled_result, orig_val));
1143       return;
1144    }
1145
1146    if (op == ir_txs || dest_type == glsl_type::float_type
1147                         || s == SWIZZLE_NOOP || op == ir_tg4) {
1148       emit(MOV(swizzled_result, orig_val));
1149       return;
1150    }
1151
1152
1153    int zero_mask = 0, one_mask = 0, copy_mask = 0;
1154    int swizzle[4] = {0};
1155
1156    for (int i = 0; i < 4; i++) {
1157       switch (GET_SWZ(s, i)) {
1158       case SWIZZLE_ZERO:
1159          zero_mask |= (1 << i);
1160          break;
1161       case SWIZZLE_ONE:
1162          one_mask |= (1 << i);
1163          break;
1164       default:
1165          copy_mask |= (1 << i);
1166          swizzle[i] = GET_SWZ(s, i);
1167          break;
1168       }
1169    }
1170
1171    if (copy_mask) {
1172       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1173       swizzled_result.writemask = copy_mask;
1174       emit(MOV(swizzled_result, orig_val));
1175    }
1176
1177    if (zero_mask) {
1178       swizzled_result.writemask = zero_mask;
1179       emit(MOV(swizzled_result, src_reg(0.0f)));
1180    }
1181
1182    if (one_mask) {
1183       swizzled_result.writemask = one_mask;
1184       emit(MOV(swizzled_result, src_reg(1.0f)));
1185    }
1186 }
1187
1188 void
1189 vec4_visitor::gs_emit_vertex(int stream_id)
1190 {
1191    unreachable("not reached");
1192 }
1193
1194 void
1195 vec4_visitor::gs_end_primitive()
1196 {
1197    unreachable("not reached");
1198 }
1199
1200 void
1201 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1202                                   dst_reg dst, src_reg offset,
1203                                   src_reg src0, src_reg src1)
1204 {
1205    unsigned mlen = 0;
1206
1207    /* Set the atomic operation offset. */
1208    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1209    mlen++;
1210
1211    /* Set the atomic operation arguments. */
1212    if (src0.file != BAD_FILE) {
1213       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1214       mlen++;
1215    }
1216
1217    if (src1.file != BAD_FILE) {
1218       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1219       mlen++;
1220    }
1221
1222    /* Emit the instruction.  Note that this maps to the normal SIMD8
1223     * untyped atomic message on Ivy Bridge, but that's OK because
1224     * unused channels will be masked out.
1225     */
1226    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1227                                  brw_message_reg(0),
1228                                  src_reg(surf_index), src_reg(atomic_op));
1229    inst->mlen = mlen;
1230 }
1231
1232 void
1233 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1234                                         src_reg offset)
1235 {
1236    /* Set the surface read offset. */
1237    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1238
1239    /* Emit the instruction.  Note that this maps to the normal SIMD8
1240     * untyped surface read message, but that's OK because unused
1241     * channels will be masked out.
1242     */
1243    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1244                                  brw_message_reg(0),
1245                                  src_reg(surf_index), src_reg(1));
1246    inst->mlen = 1;
1247 }
1248
1249 void
1250 vec4_visitor::emit_ndc_computation()
1251 {
1252    /* Get the position */
1253    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1254
1255    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1256    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1257    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1258
1259    current_annotation = "NDC";
1260    dst_reg ndc_w = ndc;
1261    ndc_w.writemask = WRITEMASK_W;
1262    src_reg pos_w = pos;
1263    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1264    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1265
1266    dst_reg ndc_xyz = ndc;
1267    ndc_xyz.writemask = WRITEMASK_XYZ;
1268
1269    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1270 }
1271
1272 void
1273 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1274 {
1275    if (devinfo->gen < 6 &&
1276        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1277         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1278         devinfo->has_negative_rhw_bug)) {
1279       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1280       dst_reg header1_w = header1;
1281       header1_w.writemask = WRITEMASK_W;
1282
1283       emit(MOV(header1, 0u));
1284
1285       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1286          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1287
1288          current_annotation = "Point size";
1289          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1290          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1291       }
1292
1293       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1294          current_annotation = "Clipping flags";
1295          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1296          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1297
1298          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1299          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1300          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1301
1302          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1303          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1304          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1305          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1306       }
1307
1308       /* i965 clipping workaround:
1309        * 1) Test for -ve rhw
1310        * 2) If set,
1311        *      set ndc = (0,0,0,0)
1312        *      set ucp[6] = 1
1313        *
1314        * Later, clipping will detect ucp[6] and ensure the primitive is
1315        * clipped against all fixed planes.
1316        */
1317       if (devinfo->has_negative_rhw_bug) {
1318          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1319          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1320          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1321          vec4_instruction *inst;
1322          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1323          inst->predicate = BRW_PREDICATE_NORMAL;
1324          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1325          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1326          inst->predicate = BRW_PREDICATE_NORMAL;
1327       }
1328
1329       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1330    } else if (devinfo->gen < 6) {
1331       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1332    } else {
1333       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1334       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1335          dst_reg reg_w = reg;
1336          reg_w.writemask = WRITEMASK_W;
1337          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1338          reg_as_src.type = reg_w.type;
1339          reg_as_src.swizzle = brw_swizzle_for_size(1);
1340          emit(MOV(reg_w, reg_as_src));
1341       }
1342       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1343          dst_reg reg_y = reg;
1344          reg_y.writemask = WRITEMASK_Y;
1345          reg_y.type = BRW_REGISTER_TYPE_D;
1346          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1347          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1348       }
1349       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1350          dst_reg reg_z = reg;
1351          reg_z.writemask = WRITEMASK_Z;
1352          reg_z.type = BRW_REGISTER_TYPE_D;
1353          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1354          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1355       }
1356    }
1357 }
1358
1359 vec4_instruction *
1360 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1361 {
1362    assert(varying < VARYING_SLOT_MAX);
1363    assert(output_reg[varying].type == reg.type);
1364    current_annotation = output_reg_annotation[varying];
1365    /* Copy the register, saturating if necessary */
1366    return emit(MOV(reg, src_reg(output_reg[varying])));
1367 }
1368
1369 void
1370 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1371 {
1372    reg.type = BRW_REGISTER_TYPE_F;
1373    output_reg[varying].type = reg.type;
1374
1375    switch (varying) {
1376    case VARYING_SLOT_PSIZ:
1377    {
1378       /* PSIZ is always in slot 0, and is coupled with other flags. */
1379       current_annotation = "indices, point width, clip flags";
1380       emit_psiz_and_flags(reg);
1381       break;
1382    }
1383    case BRW_VARYING_SLOT_NDC:
1384       current_annotation = "NDC";
1385       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1386       break;
1387    case VARYING_SLOT_POS:
1388       current_annotation = "gl_Position";
1389       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1390       break;
1391    case VARYING_SLOT_EDGE:
1392       /* This is present when doing unfilled polygons.  We're supposed to copy
1393        * the edge flag from the user-provided vertex array
1394        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1395        * of that attribute (starts as 1.0f).  This is then used in clipping to
1396        * determine which edges should be drawn as wireframe.
1397        */
1398       current_annotation = "edge flag";
1399       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1400                                     glsl_type::float_type, WRITEMASK_XYZW))));
1401       break;
1402    case BRW_VARYING_SLOT_PAD:
1403       /* No need to write to this slot */
1404       break;
1405    default:
1406       emit_generic_urb_slot(reg, varying);
1407       break;
1408    }
1409 }
1410
1411 static int
1412 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1413 {
1414    if (devinfo->gen >= 6) {
1415       /* URB data written (does not include the message header reg) must
1416        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1417        * section 5.4.3.2.2: URB_INTERLEAVED.
1418        *
1419        * URB entries are allocated on a multiple of 1024 bits, so an
1420        * extra 128 bits written here to make the end align to 256 is
1421        * no problem.
1422        */
1423       if ((mlen % 2) != 1)
1424          mlen++;
1425    }
1426
1427    return mlen;
1428 }
1429
1430
1431 /**
1432  * Generates the VUE payload plus the necessary URB write instructions to
1433  * output it.
1434  *
1435  * The VUE layout is documented in Volume 2a.
1436  */
1437 void
1438 vec4_visitor::emit_vertex()
1439 {
1440    /* MRF 0 is reserved for the debugger, so start with message header
1441     * in MRF 1.
1442     */
1443    int base_mrf = 1;
1444    int mrf = base_mrf;
1445    /* In the process of generating our URB write message contents, we
1446     * may need to unspill a register or load from an array.  Those
1447     * reads would use MRFs 14-15.
1448     */
1449    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1450
1451    /* The following assertion verifies that max_usable_mrf causes an
1452     * even-numbered amount of URB write data, which will meet gen6's
1453     * requirements for length alignment.
1454     */
1455    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1456
1457    /* First mrf is the g0-based message header containing URB handles and
1458     * such.
1459     */
1460    emit_urb_write_header(mrf++);
1461
1462    if (devinfo->gen < 6) {
1463       emit_ndc_computation();
1464    }
1465
1466    /* We may need to split this up into several URB writes, so do them in a
1467     * loop.
1468     */
1469    int slot = 0;
1470    bool complete = false;
1471    do {
1472       /* URB offset is in URB row increments, and each of our MRFs is half of
1473        * one of those, since we're doing interleaved writes.
1474        */
1475       int offset = slot / 2;
1476
1477       mrf = base_mrf + 1;
1478       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1479          emit_urb_slot(dst_reg(MRF, mrf++),
1480                        prog_data->vue_map.slot_to_varying[slot]);
1481
1482          /* If this was max_usable_mrf, we can't fit anything more into this
1483           * URB WRITE. Same thing if we reached the maximum length available.
1484           */
1485          if (mrf > max_usable_mrf ||
1486              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1487             slot++;
1488             break;
1489          }
1490       }
1491
1492       complete = slot >= prog_data->vue_map.num_slots;
1493       current_annotation = "URB write";
1494       vec4_instruction *inst = emit_urb_write_opcode(complete);
1495       inst->base_mrf = base_mrf;
1496       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1497       inst->offset += offset;
1498    } while(!complete);
1499 }
1500
1501
1502 src_reg
1503 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1504                                  src_reg *reladdr, int reg_offset)
1505 {
1506    /* Because we store the values to scratch interleaved like our
1507     * vertex data, we need to scale the vec4 index by 2.
1508     */
1509    int message_header_scale = 2;
1510
1511    /* Pre-gen6, the message header uses byte offsets instead of vec4
1512     * (16-byte) offset units.
1513     */
1514    if (devinfo->gen < 6)
1515       message_header_scale *= 16;
1516
1517    if (reladdr) {
1518       src_reg index = src_reg(this, glsl_type::int_type);
1519
1520       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1521                                    src_reg(reg_offset)));
1522       emit_before(block, inst, MUL(dst_reg(index), index,
1523                                    src_reg(message_header_scale)));
1524
1525       return index;
1526    } else {
1527       return src_reg(reg_offset * message_header_scale);
1528    }
1529 }
1530
1531 src_reg
1532 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1533                                        src_reg *reladdr, int reg_offset)
1534 {
1535    if (reladdr) {
1536       src_reg index = src_reg(this, glsl_type::int_type);
1537
1538       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1539                                    src_reg(reg_offset)));
1540
1541       /* Pre-gen6, the message header uses byte offsets instead of vec4
1542        * (16-byte) offset units.
1543        */
1544       if (devinfo->gen < 6) {
1545          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1546       }
1547
1548       return index;
1549    } else if (devinfo->gen >= 8) {
1550       /* Store the offset in a GRF so we can send-from-GRF. */
1551       src_reg offset = src_reg(this, glsl_type::int_type);
1552       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1553       return offset;
1554    } else {
1555       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1556       return src_reg(reg_offset * message_header_scale);
1557    }
1558 }
1559
1560 /**
1561  * Emits an instruction before @inst to load the value named by @orig_src
1562  * from scratch space at @base_offset to @temp.
1563  *
1564  * @base_offset is measured in 32-byte units (the size of a register).
1565  */
1566 void
1567 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1568                                 dst_reg temp, src_reg orig_src,
1569                                 int base_offset)
1570 {
1571    int reg_offset = base_offset + orig_src.reg_offset;
1572    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1573                                       reg_offset);
1574
1575    emit_before(block, inst, SCRATCH_READ(temp, index));
1576 }
1577
1578 /**
1579  * Emits an instruction after @inst to store the value to be written
1580  * to @orig_dst to scratch space at @base_offset, from @temp.
1581  *
1582  * @base_offset is measured in 32-byte units (the size of a register).
1583  */
1584 void
1585 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1586                                  int base_offset)
1587 {
1588    int reg_offset = base_offset + inst->dst.reg_offset;
1589    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1590                                       reg_offset);
1591
1592    /* Create a temporary register to store *inst's result in.
1593     *
1594     * We have to be careful in MOVing from our temporary result register in
1595     * the scratch write.  If we swizzle from channels of the temporary that
1596     * weren't initialized, it will confuse live interval analysis, which will
1597     * make spilling fail to make progress.
1598     */
1599    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1600                                        inst->dst.type),
1601                                 brw_swizzle_for_mask(inst->dst.writemask));
1602    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1603                                        inst->dst.writemask));
1604    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1605    if (inst->opcode != BRW_OPCODE_SEL)
1606       write->predicate = inst->predicate;
1607    write->ir = inst->ir;
1608    write->annotation = inst->annotation;
1609    inst->insert_after(block, write);
1610
1611    inst->dst.file = temp.file;
1612    inst->dst.reg = temp.reg;
1613    inst->dst.reg_offset = temp.reg_offset;
1614    inst->dst.reladdr = NULL;
1615 }
1616
1617 /**
1618  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1619  * adds the scratch read(s) before \p inst. The function also checks for
1620  * recursive reladdr scratch accesses, issuing the corresponding scratch
1621  * loads and rewriting reladdr references accordingly.
1622  *
1623  * \return \p src if it did not require a scratch load, otherwise, the
1624  * register holding the result of the scratch load that the caller should
1625  * use to rewrite src.
1626  */
1627 src_reg
1628 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1629                                    vec4_instruction *inst, src_reg src)
1630 {
1631    /* Resolve recursive reladdr scratch access by calling ourselves
1632     * with src.reladdr
1633     */
1634    if (src.reladdr)
1635       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1636                                           *src.reladdr);
1637
1638    /* Now handle scratch access on src */
1639    if (src.file == GRF && scratch_loc[src.reg] != -1) {
1640       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1641       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1642       src.reg = temp.reg;
1643       src.reg_offset = temp.reg_offset;
1644       src.reladdr = NULL;
1645    }
1646
1647    return src;
1648 }
1649
1650 /**
1651  * We can't generally support array access in GRF space, because a
1652  * single instruction's destination can only span 2 contiguous
1653  * registers.  So, we send all GRF arrays that get variable index
1654  * access to scratch space.
1655  */
1656 void
1657 vec4_visitor::move_grf_array_access_to_scratch()
1658 {
1659    int scratch_loc[this->alloc.count];
1660    memset(scratch_loc, -1, sizeof(scratch_loc));
1661
1662    /* First, calculate the set of virtual GRFs that need to be punted
1663     * to scratch due to having any array access on them, and where in
1664     * scratch.
1665     */
1666    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1667       if (inst->dst.file == GRF && inst->dst.reladdr) {
1668          if (scratch_loc[inst->dst.reg] == -1) {
1669             scratch_loc[inst->dst.reg] = last_scratch;
1670             last_scratch += this->alloc.sizes[inst->dst.reg];
1671          }
1672
1673          for (src_reg *iter = inst->dst.reladdr;
1674               iter->reladdr;
1675               iter = iter->reladdr) {
1676             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1677                scratch_loc[iter->reg] = last_scratch;
1678                last_scratch += this->alloc.sizes[iter->reg];
1679             }
1680          }
1681       }
1682
1683       for (int i = 0 ; i < 3; i++) {
1684          for (src_reg *iter = &inst->src[i];
1685               iter->reladdr;
1686               iter = iter->reladdr) {
1687             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1688                scratch_loc[iter->reg] = last_scratch;
1689                last_scratch += this->alloc.sizes[iter->reg];
1690             }
1691          }
1692       }
1693    }
1694
1695    /* Now, for anything that will be accessed through scratch, rewrite
1696     * it to load/store.  Note that this is a _safe list walk, because
1697     * we may generate a new scratch_write instruction after the one
1698     * we're processing.
1699     */
1700    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1701       /* Set up the annotation tracking for new generated instructions. */
1702       base_ir = inst->ir;
1703       current_annotation = inst->annotation;
1704
1705       /* First handle scratch access on the dst. Notice we have to handle
1706        * the case where the dst's reladdr also points to scratch space.
1707        */
1708       if (inst->dst.reladdr)
1709          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1710                                                    *inst->dst.reladdr);
1711
1712       /* Now that we have handled any (possibly recursive) reladdr scratch
1713        * accesses for dst we can safely do the scratch write for dst itself
1714        */
1715       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1716          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1717
1718       /* Now handle scratch access on any src. In this case, since inst->src[i]
1719        * already is a src_reg, we can just call emit_resolve_reladdr with
1720        * inst->src[i] and it will take care of handling scratch loads for
1721        * both src and src.reladdr (recursively).
1722        */
1723       for (int i = 0 ; i < 3; i++) {
1724          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1725                                              inst->src[i]);
1726       }
1727    }
1728 }
1729
1730 /**
1731  * Emits an instruction before @inst to load the value named by @orig_src
1732  * from the pull constant buffer (surface) at @base_offset to @temp.
1733  */
1734 void
1735 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1736                                       dst_reg temp, src_reg orig_src,
1737                                       int base_offset)
1738 {
1739    int reg_offset = base_offset + orig_src.reg_offset;
1740    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1741    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1742                                              reg_offset);
1743
1744    emit_pull_constant_load_reg(temp,
1745                                index,
1746                                offset,
1747                                block, inst);
1748 }
1749
1750 /**
1751  * Implements array access of uniforms by inserting a
1752  * PULL_CONSTANT_LOAD instruction.
1753  *
1754  * Unlike temporary GRF array access (where we don't support it due to
1755  * the difficulty of doing relative addressing on instruction
1756  * destinations), we could potentially do array access of uniforms
1757  * that were loaded in GRF space as push constants.  In real-world
1758  * usage we've seen, though, the arrays being used are always larger
1759  * than we could load as push constants, so just always move all
1760  * uniform array access out to a pull constant buffer.
1761  */
1762 void
1763 vec4_visitor::move_uniform_array_access_to_pull_constants()
1764 {
1765    int pull_constant_loc[this->uniforms];
1766    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1767    bool nested_reladdr;
1768
1769    /* Walk through and find array access of uniforms.  Put a copy of that
1770     * uniform in the pull constant buffer.
1771     *
1772     * Note that we don't move constant-indexed accesses to arrays.  No
1773     * testing has been done of the performance impact of this choice.
1774     */
1775    do {
1776       nested_reladdr = false;
1777
1778       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1779          for (int i = 0 ; i < 3; i++) {
1780             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1781                continue;
1782
1783             int uniform = inst->src[i].reg;
1784
1785             if (inst->src[i].reladdr->reladdr)
1786                nested_reladdr = true;  /* will need another pass */
1787
1788             /* If this array isn't already present in the pull constant buffer,
1789              * add it.
1790              */
1791             if (pull_constant_loc[uniform] == -1) {
1792                const gl_constant_value **values =
1793                   &stage_prog_data->param[uniform * 4];
1794
1795                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1796
1797                assert(uniform < uniform_array_size);
1798                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1799                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1800                      = values[j];
1801                }
1802             }
1803
1804             /* Set up the annotation tracking for new generated instructions. */
1805             base_ir = inst->ir;
1806             current_annotation = inst->annotation;
1807
1808             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1809
1810             emit_pull_constant_load(block, inst, temp, inst->src[i],
1811                                     pull_constant_loc[uniform]);
1812
1813             inst->src[i].file = temp.file;
1814             inst->src[i].reg = temp.reg;
1815             inst->src[i].reg_offset = temp.reg_offset;
1816             inst->src[i].reladdr = NULL;
1817          }
1818       }
1819    } while (nested_reladdr);
1820
1821    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1822     * no need to track them as larger-than-vec4 objects.  This will be
1823     * relied on in cutting out unused uniform vectors from push
1824     * constants.
1825     */
1826    split_uniform_registers();
1827 }
1828
1829 void
1830 vec4_visitor::resolve_ud_negate(src_reg *reg)
1831 {
1832    if (reg->type != BRW_REGISTER_TYPE_UD ||
1833        !reg->negate)
1834       return;
1835
1836    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1837    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1838    *reg = temp;
1839 }
1840
1841 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1842                            void *log_data,
1843                            struct gl_program *prog,
1844                            const struct brw_sampler_prog_key_data *key_tex,
1845                            struct brw_vue_prog_data *prog_data,
1846                            struct gl_shader_program *shader_prog,
1847                            gl_shader_stage stage,
1848                            void *mem_ctx,
1849                            bool no_spills,
1850                            int shader_time_index)
1851    : backend_shader(compiler, log_data, mem_ctx,
1852                     shader_prog, prog, &prog_data->base, stage),
1853      key_tex(key_tex),
1854      prog_data(prog_data),
1855      sanity_param_count(0),
1856      fail_msg(NULL),
1857      first_non_payload_grf(0),
1858      need_all_constants_in_pull_buffer(false),
1859      no_spills(no_spills),
1860      shader_time_index(shader_time_index),
1861      last_scratch(0)
1862 {
1863    this->failed = false;
1864
1865    this->base_ir = NULL;
1866    this->current_annotation = NULL;
1867    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1868
1869    this->virtual_grf_start = NULL;
1870    this->virtual_grf_end = NULL;
1871    this->live_intervals = NULL;
1872
1873    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1874
1875    this->uniforms = 0;
1876
1877    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1878     * at least one. See setup_uniforms() in brw_vec4.cpp.
1879     */
1880    this->uniform_array_size = 1;
1881    if (prog_data) {
1882       this->uniform_array_size =
1883          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1884    }
1885
1886    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1887 }
1888
1889 vec4_visitor::~vec4_visitor()
1890 {
1891 }
1892
1893
1894 void
1895 vec4_visitor::fail(const char *format, ...)
1896 {
1897    va_list va;
1898    char *msg;
1899
1900    if (failed)
1901       return;
1902
1903    failed = true;
1904
1905    va_start(va, format);
1906    msg = ralloc_vasprintf(mem_ctx, format, va);
1907    va_end(va);
1908    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1909
1910    this->fail_msg = msg;
1911
1912    if (debug_enabled) {
1913       fprintf(stderr, "%s",  msg);
1914    }
1915 }
1916
1917 } /* namespace brw */