src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 src_reg
280 vec4_visitor::fix_3src_operand(const src_reg &src)
281 {
282 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
283  * able to use a vertical stride of zero to replicate the vec4 uniform, like
284 *
285 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
286 *
287 * But you can't, since vertical stride is always four in three-source
288 * instructions. Instead, insert a MOV instruction to do the replication so
289 * that the three-source instruction can consume it.
290 */
291
292 /* The MOV is only needed if the source is a uniform or immediate. */
293 if (src.file != UNIFORM && src.file != IMM)
294 return src;
295
296 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
297 return src;
298
299 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
300 expanded.type = src.type;
301 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
302 return src_reg(expanded);
303 }
304
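/**
 * If \p src carries abs or negate source modifiers, copy it through a MOV
 * into a temporary so the caller gets back an equivalent register with no
 * modifiers attached; otherwise return \p src unchanged.
 */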
305 src_reg
306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
307 {
308 if (!src.abs && !src.negate)
309 return src;
310
311 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
312 resolved.type = src.type;
313 emit(MOV(resolved, src));
314
315 return src_reg(resolved);
316 }
317
318 src_reg
319 vec4_visitor::fix_math_operand(const src_reg &src)
320 {
321 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
322 return src;
323
324 /* The gen6 math instruction ignores the source modifiers --
325 * swizzle, abs, negate, and at least some parts of the register
326 * region description.
327 *
328 * Rather than trying to enumerate all these cases, *always* expand the
329 * operand to a temp GRF for gen6.
330 *
331 * For gen7, keep the operand as-is, except if immediate, which gen7 still
332 * can't use.
333 */
334
335 if (devinfo->gen == 7 && src.file != IMM)
336 return src;
337
338 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
339 expanded.type = src.type;
340 emit(MOV(expanded, src));
341 return src_reg(expanded);
342 }
343
344 vec4_instruction *
345 vec4_visitor::emit_math(enum opcode opcode,
346 const dst_reg &dst,
347 const src_reg &src0, const src_reg &src1)
348 {
349 vec4_instruction *math =
350 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
351
352 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
353 /* MATH on Gen6 must be align1, so we can't do writemasks. */
354 math->dst = dst_reg(this, glsl_type::vec4_type);
355 math->dst.type = dst.type;
356 math = emit(MOV(dst, src_reg(math->dst)));
357 } else if (devinfo->gen < 6) {
358 math->base_mrf = 1;
359 math->mlen = src1.file == BAD_FILE ? 1 : 2;
360 }
361
362 return math;
363 }
364
365 void
366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
367 {
368 if (devinfo->gen < 7) {
369 unreachable("ir_unop_pack_half_2x16 should be lowered");
370 }
371
372 assert(dst.type == BRW_REGISTER_TYPE_UD);
373 assert(src0.type == BRW_REGISTER_TYPE_F);
374
375 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
376 *
377 * Because this instruction does not have a 16-bit floating-point type,
378 * the destination data type must be Word (W).
379 *
380 * The destination must be DWord-aligned and specify a horizontal stride
381 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
382 * each destination channel and the upper word is not modified.
383 *
384 * The above restriction implies that the f32to16 instruction must use
385 * align1 mode, because only in align1 mode is it possible to specify
386 * horizontal stride. We choose here to defy the hardware docs and emit
387 * align16 instructions.
388 *
389 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
390 * instructions. I was partially successful in that the code passed all
391 * tests. However, the code was dubiously correct and fragile, and the
392 * tests were not harsh enough to probe that frailty. Not trusting the
393 * code, I chose instead to remain in align16 mode in defiance of the hw
394 * docs).
395 *
396 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
397 * simulator, emitting a f32to16 in align16 mode with UD as destination
398 * data type is safe. The behavior differs from that specified in the PRM
399 * in that the upper word of each destination channel is cleared to 0.
400 */
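   /* As a concrete example of the packing this implements:
    * packHalf2x16(vec2(1.0, 0.5)) yields 0x38003C00, since half(1.0) =
    * 0x3C00 lands in the low word and half(0.5) = 0x3800 in the high word
    * after the SHL and OR below.
    */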
401
402 dst_reg tmp_dst(this, glsl_type::uvec2_type);
403 src_reg tmp_src(tmp_dst);
404
405 #if 0
406 /* Verify the undocumented behavior on which the following instructions
407 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
408 * then the result of the bit-or instruction below will be incorrect.
409 *
410 * You should inspect the disasm output in order to verify that the MOV is
411 * not optimized away.
412 */
413 emit(MOV(tmp_dst, src_reg(0x12345678u)));
414 #endif
415
416 /* Give tmp the form below, where "." means untouched.
417 *
418     *     w z          y          x w z          y          x
419     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
420 *
421 * That the upper word of each write-channel be 0 is required for the
422 * following bit-shift and bit-or instructions to work. Note that this
423 * relies on the undocumented hardware behavior mentioned above.
424 */
425 tmp_dst.writemask = WRITEMASK_XY;
426 emit(F32TO16(tmp_dst, src0));
427
428 /* Give the write-channels of dst the form:
429 * 0xhhhh0000
430 */
431 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
432 emit(SHL(dst, tmp_src, src_reg(16u)));
433
434 /* Finally, give the write-channels of dst the form of packHalf2x16's
435 * output:
436 * 0xhhhhllll
437 */
438 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
439 emit(OR(dst, src_reg(dst), tmp_src));
440 }
441
442 void
443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
444 {
445 if (devinfo->gen < 7) {
446 unreachable("ir_unop_unpack_half_2x16 should be lowered");
447 }
448
449 assert(dst.type == BRW_REGISTER_TYPE_F);
450 assert(src0.type == BRW_REGISTER_TYPE_UD);
451
452 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
453 *
454 * Because this instruction does not have a 16-bit floating-point type,
455 * the source data type must be Word (W). The destination type must be
456 * F (Float).
457 *
458 * To use W as the source data type, we must adjust horizontal strides,
459 * which is only possible in align1 mode. All my [chadv] attempts at
460 * emitting align1 instructions for unpackHalf2x16 failed to pass the
461 * Piglit tests, so I gave up.
462 *
463 * I've verified that, on gen7 hardware and the simulator, it is safe to
464 * emit f16to32 in align16 mode with UD as source data type.
465 */
466
467 dst_reg tmp_dst(this, glsl_type::uvec2_type);
468 src_reg tmp_src(tmp_dst);
469
470 tmp_dst.writemask = WRITEMASK_X;
471 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
472
473 tmp_dst.writemask = WRITEMASK_Y;
474 emit(SHR(tmp_dst, src0, src_reg(16u)));
475
476 dst.writemask = WRITEMASK_XY;
477 emit(F16TO32(dst, tmp_src));
478 }
479
480 void
481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
482 {
483 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
484 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
485 * is not suitable to generate the shift values, but we can use the packed
486 * vector float and a type-converting MOV.
487 */
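   /* The vector-float immediate bytes 0x00, 0x60, 0x70 and 0x78 encode the
    * floats 0.0, 8.0, 16.0 and 24.0, so after the type-converting MOV the
    * shift register holds the per-channel shift counts <0, 8, 16, 24>.
    */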
488 dst_reg shift(this, glsl_type::uvec4_type);
489 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
490
491 dst_reg shifted(this, glsl_type::uvec4_type);
492 src0.swizzle = BRW_SWIZZLE_XXXX;
493 emit(SHR(shifted, src0, src_reg(shift)));
494
495 shifted.type = BRW_REGISTER_TYPE_UB;
496 dst_reg f(this, glsl_type::vec4_type);
497 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
498
499 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
500 }
501
502 void
503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
504 {
505 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
506 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
507 * is not suitable to generate the shift values, but we can use the packed
508 * vector float and a type-converting MOV.
509 */
510 dst_reg shift(this, glsl_type::uvec4_type);
511 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
512
513 dst_reg shifted(this, glsl_type::uvec4_type);
514 src0.swizzle = BRW_SWIZZLE_XXXX;
515 emit(SHR(shifted, src0, src_reg(shift)));
516
517 shifted.type = BRW_REGISTER_TYPE_B;
518 dst_reg f(this, glsl_type::vec4_type);
519 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
520
521 dst_reg scaled(this, glsl_type::vec4_type);
522 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
523
524 dst_reg max(this, glsl_type::vec4_type);
525 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
526 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
527 }
528
529 void
530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
531 {
532 dst_reg saturated(this, glsl_type::vec4_type);
533 vec4_instruction *inst = emit(MOV(saturated, src0));
534 inst->saturate = true;
535
536 dst_reg scaled(this, glsl_type::vec4_type);
537 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
538
539 dst_reg rounded(this, glsl_type::vec4_type);
540 emit(RNDE(rounded, src_reg(scaled)));
541
542 dst_reg u(this, glsl_type::uvec4_type);
543 emit(MOV(u, src_reg(rounded)));
544
545 src_reg bytes(u);
546 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
547 }
548
549 void
550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
551 {
552 dst_reg max(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
554
555 dst_reg min(this, glsl_type::vec4_type);
556 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
557
558 dst_reg scaled(this, glsl_type::vec4_type);
559 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
560
561 dst_reg rounded(this, glsl_type::vec4_type);
562 emit(RNDE(rounded, src_reg(scaled)));
563
564 dst_reg i(this, glsl_type::ivec4_type);
565 emit(MOV(i, src_reg(rounded)));
566
567 src_reg bytes(i);
568 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
569 }
570
571 /**
572 * Returns the minimum number of vec4 elements needed to pack a type.
573 *
574 * For simple types, it will return 1 (a single vec4); for matrices, the
575 * number of columns; for array and struct, the sum of the vec4_size of
576 * each of its elements; and for sampler and atomic, zero.
577 *
578 * This method is useful to calculate how much register space is needed to
579 * store a particular type.
580 */
581 extern "C" int
582 type_size_vec4(const struct glsl_type *type)
583 {
584 unsigned int i;
585 int size;
586
587 switch (type->base_type) {
588 case GLSL_TYPE_UINT:
589 case GLSL_TYPE_INT:
590 case GLSL_TYPE_FLOAT:
591 case GLSL_TYPE_BOOL:
592 if (type->is_matrix()) {
593 return type->matrix_columns;
594 } else {
595         /* Regardless of the size of the vector, it gets a vec4. This is bad
596 * packing for things like floats, but otherwise arrays become a
597 * mess. Hopefully a later pass over the code can pack scalars
598 * down if appropriate.
599 */
600 return 1;
601 }
602 case GLSL_TYPE_ARRAY:
603 assert(type->length > 0);
604 return type_size_vec4(type->fields.array) * type->length;
605 case GLSL_TYPE_STRUCT:
606 size = 0;
607 for (i = 0; i < type->length; i++) {
608 size += type_size_vec4(type->fields.structure[i].type);
609 }
610 return size;
611 case GLSL_TYPE_SUBROUTINE:
612 return 1;
613
614 case GLSL_TYPE_SAMPLER:
615 /* Samplers take up no register space, since they're baked in at
616 * link time.
617 */
618 return 0;
619 case GLSL_TYPE_ATOMIC_UINT:
620 return 0;
621 case GLSL_TYPE_IMAGE:
622 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623 case GLSL_TYPE_VOID:
624 case GLSL_TYPE_DOUBLE:
625 case GLSL_TYPE_ERROR:
626 case GLSL_TYPE_INTERFACE:
627 case GLSL_TYPE_FUNCTION:
628 unreachable("not reached");
629 }
630
631 return 0;
632 }
633
634 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
635 {
636 init();
637
638 this->file = GRF;
639 this->reg = v->alloc.allocate(type_size_vec4(type));
640
641 if (type->is_array() || type->is_record()) {
642 this->swizzle = BRW_SWIZZLE_NOOP;
643 } else {
644 this->swizzle = brw_swizzle_for_size(type->vector_elements);
645 }
646
647 this->type = brw_type_for_base_type(type);
648 }
649
650 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
651 {
652 assert(size > 0);
653
654 init();
655
656 this->file = GRF;
657 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
658
659 this->swizzle = BRW_SWIZZLE_NOOP;
660
661 this->type = brw_type_for_base_type(type);
662 }
663
664 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
665 {
666 init();
667
668 this->file = GRF;
669 this->reg = v->alloc.allocate(type_size_vec4(type));
670
671 if (type->is_array() || type->is_record()) {
672 this->writemask = WRITEMASK_XYZW;
673 } else {
674 this->writemask = (1 << type->vector_elements) - 1;
675 }
676
677 this->type = brw_type_for_base_type(type);
678 }
679
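/**
 * Emit a MIN or MAX of \p src0 and \p src1: the conditional mod selects
 * which (callers pass BRW_CONDITIONAL_GE for a maximum and BRW_CONDITIONAL_L
 * for a minimum). Gen6+ folds the comparison into a conditional SEL; older
 * hardware needs an explicit CMP followed by a predicated SEL.
 */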
680 vec4_instruction *
681 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
682 src_reg src0, src_reg src1)
683 {
684 vec4_instruction *inst;
685
686 if (devinfo->gen >= 6) {
687 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
688 inst->conditional_mod = conditionalmod;
689 } else {
690 emit(CMP(dst, src0, src1, conditionalmod));
691
692 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
693 inst->predicate = BRW_PREDICATE_NORMAL;
694 }
695
696 return inst;
697 }
698
699 vec4_instruction *
700 vec4_visitor::emit_lrp(const dst_reg &dst,
701 const src_reg &x, const src_reg &y, const src_reg &a)
702 {
703 if (devinfo->gen >= 6) {
704 /* Note that the instruction's argument order is reversed from GLSL
705 * and the IR.
706 */
707 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
708 fix_3src_operand(x)));
709 } else {
710 /* Earlier generations don't support three source operations, so we
711 * need to emit x*(1-a) + y*a.
712 */
713 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
714 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
715 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
716 y_times_a.writemask = dst.writemask;
717 one_minus_a.writemask = dst.writemask;
718 x_times_one_minus_a.writemask = dst.writemask;
719
720 emit(MUL(y_times_a, y, a));
721 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
722 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
723 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
724 }
725 }
726
727 /**
728 * Emits the instructions needed to perform a pull constant load. before_block
729  * and before_inst can be NULL, in which case the instruction will be appended
730 * to the end of the instruction list.
731 */
732 void
733 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
734 src_reg surf_index,
735 src_reg offset_reg,
736 bblock_t *before_block,
737 vec4_instruction *before_inst)
738 {
739 assert((before_inst == NULL && before_block == NULL) ||
740 (before_inst && before_block));
741
742 vec4_instruction *pull;
743
744 if (devinfo->gen >= 9) {
745 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
746 src_reg header(this, glsl_type::uvec4_type, 2);
747
748 pull = new(mem_ctx)
749 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
750 dst_reg(header));
751
752 if (before_inst)
753 emit_before(before_block, before_inst, pull);
754 else
755 emit(pull);
756
757 dst_reg index_reg = retype(offset(dst_reg(header), 1),
758 offset_reg.type);
759 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
760
761 if (before_inst)
762 emit_before(before_block, before_inst, pull);
763 else
764 emit(pull);
765
766 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
767 dst,
768 surf_index,
769 header);
770 pull->mlen = 2;
771 pull->header_size = 1;
772 } else if (devinfo->gen >= 7) {
773 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
774
775 grf_offset.type = offset_reg.type;
776
777 pull = MOV(grf_offset, offset_reg);
778
779 if (before_inst)
780 emit_before(before_block, before_inst, pull);
781 else
782 emit(pull);
783
784 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
785 dst,
786 surf_index,
787 src_reg(grf_offset));
788 pull->mlen = 1;
789 } else {
790 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
791 dst,
792 surf_index,
793 offset_reg);
794 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
795 pull->mlen = 1;
796 }
797
798 if (before_inst)
799 emit_before(before_block, before_inst, pull);
800 else
801 emit(pull);
802 }
803
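/**
 * Return a register holding the value of \p src taken from its first live
 * channel and broadcast to all channels, i.e. a copy of \p src that is
 * uniform across the SIMD4x2 group.
 */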
804 src_reg
805 vec4_visitor::emit_uniformize(const src_reg &src)
806 {
807 const src_reg chan_index(this, glsl_type::uint_type);
808 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
809 src.type);
810
811 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
812 ->force_writemask_all = true;
813 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
814 ->force_writemask_all = true;
815
816 return src_reg(dst);
817 }
818
819 src_reg
820 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
821 src_reg coordinate, src_reg sampler)
822 {
823 vec4_instruction *inst =
824 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
825 dst_reg(this, glsl_type::uvec4_type));
826 inst->base_mrf = 2;
827 inst->src[1] = sampler;
828
829 int param_base;
830
831 if (devinfo->gen >= 9) {
832 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
833 vec4_instruction *header_inst = new(mem_ctx)
834 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
835 dst_reg(MRF, inst->base_mrf));
836
837 emit(header_inst);
838
839 inst->mlen = 2;
840 inst->header_size = 1;
841 param_base = inst->base_mrf + 1;
842 } else {
843 inst->mlen = 1;
844 param_base = inst->base_mrf;
845 }
846
847 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
848 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
849 int zero_mask = 0xf & ~coord_mask;
850
851 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
852 coordinate));
853
854 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
855 src_reg(0)));
856
857 emit(inst);
858 return src_reg(inst->dst);
859 }
860
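/**
 * A sampler is "high" if its index can't be placed directly in the 4-bit
 * sampler field of the message descriptor, either because it isn't known at
 * compile time or because it is 16 or larger; emit_texture() sets up a
 * message header for that case.
 */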
861 bool
862 vec4_visitor::is_high_sampler(src_reg sampler)
863 {
864 if (devinfo->gen < 8 && !devinfo->is_haswell)
865 return false;
866
867 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
868 }
869
870 void
871 vec4_visitor::emit_texture(ir_texture_opcode op,
872 dst_reg dest,
873 const glsl_type *dest_type,
874 src_reg coordinate,
875 int coord_components,
876 src_reg shadow_comparitor,
877 src_reg lod, src_reg lod2,
878 src_reg sample_index,
879 uint32_t constant_offset,
880 src_reg offset_value,
881 src_reg mcs,
882 bool is_cube_array,
883 uint32_t sampler,
884 src_reg sampler_reg)
885 {
886 enum opcode opcode;
887 switch (op) {
888 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
889 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
890 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
891 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
892 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
893 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
894 case ir_tg4: opcode = offset_value.file != BAD_FILE
895 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
896 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
897 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
898 case ir_txb:
899 unreachable("TXB is not valid for vertex shaders.");
900 case ir_lod:
901 unreachable("LOD is not valid for vertex shaders.");
902 default:
903 unreachable("Unrecognized tex op");
904 }
905
906 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
907 opcode, dst_reg(this, dest_type));
908
909 inst->offset = constant_offset;
910
911 /* The message header is necessary for:
912 * - Gen4 (always)
913 * - Gen9+ for selecting SIMD4x2
914 * - Texel offsets
915 * - Gather channel selection
916 * - Sampler indices too large to fit in a 4-bit value.
917 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
918 */
919 inst->header_size =
920 (devinfo->gen < 5 || devinfo->gen >= 9 ||
921 inst->offset != 0 || op == ir_tg4 ||
922 op == ir_texture_samples ||
923 is_high_sampler(sampler_reg)) ? 1 : 0;
924 inst->base_mrf = 2;
925 inst->mlen = inst->header_size;
926 inst->dst.writemask = WRITEMASK_XYZW;
927 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
928
929 inst->src[1] = sampler_reg;
930
931 /* MRF for the first parameter */
932 int param_base = inst->base_mrf + inst->header_size;
933
934 if (op == ir_txs || op == ir_query_levels) {
935 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
936 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
937 inst->mlen++;
938 } else if (op == ir_texture_samples) {
939 inst->dst.writemask = WRITEMASK_X;
940 } else {
941 /* Load the coordinate */
942 /* FINISHME: gl_clamp_mask and saturate */
943 int coord_mask = (1 << coord_components) - 1;
944 int zero_mask = 0xf & ~coord_mask;
945
946 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
947 coordinate));
948 inst->mlen++;
949
950 if (zero_mask != 0) {
951 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
952 src_reg(0)));
953 }
954 /* Load the shadow comparitor */
955 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
956 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
957 WRITEMASK_X),
958 shadow_comparitor));
959 inst->mlen++;
960 }
961
962 /* Load the LOD info */
963 if (op == ir_tex || op == ir_txl) {
964 int mrf, writemask;
965 if (devinfo->gen >= 5) {
966 mrf = param_base + 1;
967 if (shadow_comparitor.file != BAD_FILE) {
968 writemask = WRITEMASK_Y;
969 /* mlen already incremented */
970 } else {
971 writemask = WRITEMASK_X;
972 inst->mlen++;
973 }
974 } else /* devinfo->gen == 4 */ {
975 mrf = param_base;
976 writemask = WRITEMASK_W;
977 }
978 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
979 } else if (op == ir_txf) {
980 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
981 } else if (op == ir_txf_ms) {
982 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
983 sample_index));
984 if (devinfo->gen >= 7) {
985 /* MCS data is in the first channel of `mcs`, but we need to get it into
986 * the .y channel of the second vec4 of params, so replicate .x across
987 * the whole vec4 and then mask off everything except .y
988 */
989 mcs.swizzle = BRW_SWIZZLE_XXXX;
990 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
991 mcs));
992 }
993 inst->mlen++;
994 } else if (op == ir_txd) {
995 const brw_reg_type type = lod.type;
996
997 if (devinfo->gen >= 5) {
998 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
999 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1000 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1001 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1002 inst->mlen++;
1003
1004 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1005 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1006 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1007 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1008 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1009 inst->mlen++;
1010
1011 if (shadow_comparitor.file != BAD_FILE) {
1012 emit(MOV(dst_reg(MRF, param_base + 2,
1013 shadow_comparitor.type, WRITEMASK_Z),
1014 shadow_comparitor));
1015 }
1016 }
1017 } else /* devinfo->gen == 4 */ {
1018 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1019 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1020 inst->mlen += 2;
1021 }
1022 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1023 if (shadow_comparitor.file != BAD_FILE) {
1024 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1025 shadow_comparitor));
1026 }
1027
1028 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1029 offset_value));
1030 inst->mlen++;
1031 }
1032 }
1033
1034 emit(inst);
1035
1036 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1037 * spec requires layers.
1038 */
1039 if (op == ir_txs && is_cube_array) {
1040 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1041 writemask(inst->dst, WRITEMASK_Z),
1042 src_reg(inst->dst), src_reg(6));
1043 }
1044
1045 if (devinfo->gen == 6 && op == ir_tg4) {
1046 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1047 }
1048
1049 swizzle_result(op, dest,
1050 src_reg(inst->dst), sampler, dest_type);
1051 }
1052
1053 /**
1054 * Apply workarounds for Gen6 gather with UINT/SINT
1055 */
1056 void
1057 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1058 {
1059 if (!wa)
1060 return;
1061
1062 int width = (wa & WA_8BIT) ? 8 : 16;
1063 dst_reg dst_f = dst;
1064 dst_f.type = BRW_REGISTER_TYPE_F;
1065
1066 /* Convert from UNORM to UINT */
1067 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1068 emit(MOV(dst, src_reg(dst_f)));
1069
1070 if (wa & WA_SIGN) {
1071 /* Reinterpret the UINT value as a signed INT value by
1072 * shifting the sign bit into place, then shifting back
1073 * preserving sign.
1074 */
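      /* For example, with an 8-bit format a gathered value of 0xff becomes
       * 0xff000000 after the SHL and sign-extends back to 0xffffffff (-1)
       * after the ASR.
       */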
1075 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1076 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1077 }
1078 }
1079
1080 /**
1081 * Set up the gather channel based on the swizzle, for gather4.
1082 */
1083 uint32_t
1084 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1085 {
1086 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1087 switch (swiz) {
1088 case SWIZZLE_X: return 0;
1089 case SWIZZLE_Y:
1090 /* gather4 sampler is broken for green channel on RG32F --
1091 * we must ask for blue instead.
1092 */
1093 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1094 return 2;
1095 return 1;
1096 case SWIZZLE_Z: return 2;
1097 case SWIZZLE_W: return 3;
1098 default:
1099 unreachable("Not reached"); /* zero, one swizzles handled already */
1100 }
1101 }
1102
1103 void
1104 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1105 src_reg orig_val, uint32_t sampler,
1106 const glsl_type *dest_type)
1107 {
1108 int s = key_tex->swizzles[sampler];
1109
1110 dst_reg swizzled_result = dest;
1111
1112 if (op == ir_query_levels) {
1113 /* # levels is in .w */
1114 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1115 emit(MOV(swizzled_result, orig_val));
1116 return;
1117 }
1118
1119 if (op == ir_txs || dest_type == glsl_type::float_type
1120 || s == SWIZZLE_NOOP || op == ir_tg4) {
1121 emit(MOV(swizzled_result, orig_val));
1122 return;
1123 }
1124
1125
1126 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1127 int swizzle[4] = {0};
1128
1129 for (int i = 0; i < 4; i++) {
1130 switch (GET_SWZ(s, i)) {
1131 case SWIZZLE_ZERO:
1132 zero_mask |= (1 << i);
1133 break;
1134 case SWIZZLE_ONE:
1135 one_mask |= (1 << i);
1136 break;
1137 default:
1138 copy_mask |= (1 << i);
1139 swizzle[i] = GET_SWZ(s, i);
1140 break;
1141 }
1142 }
1143
1144 if (copy_mask) {
1145 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1146 swizzled_result.writemask = copy_mask;
1147 emit(MOV(swizzled_result, orig_val));
1148 }
1149
1150 if (zero_mask) {
1151 swizzled_result.writemask = zero_mask;
1152 emit(MOV(swizzled_result, src_reg(0.0f)));
1153 }
1154
1155 if (one_mask) {
1156 swizzled_result.writemask = one_mask;
1157 emit(MOV(swizzled_result, src_reg(1.0f)));
1158 }
1159 }
1160
1161 void
1162 vec4_visitor::gs_emit_vertex(int stream_id)
1163 {
1164 unreachable("not reached");
1165 }
1166
1167 void
1168 vec4_visitor::gs_end_primitive()
1169 {
1170 unreachable("not reached");
1171 }
1172
1173 void
1174 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1175 dst_reg dst, src_reg offset,
1176 src_reg src0, src_reg src1)
1177 {
1178 unsigned mlen = 0;
1179
1180 /* Set the atomic operation offset. */
1181 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1182 mlen++;
1183
1184 /* Set the atomic operation arguments. */
1185 if (src0.file != BAD_FILE) {
1186 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1187 mlen++;
1188 }
1189
1190 if (src1.file != BAD_FILE) {
1191 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1192 mlen++;
1193 }
1194
1195 /* Emit the instruction. Note that this maps to the normal SIMD8
1196 * untyped atomic message on Ivy Bridge, but that's OK because
1197 * unused channels will be masked out.
1198 */
1199 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1200 brw_message_reg(0),
1201 src_reg(surf_index), src_reg(atomic_op));
1202 inst->mlen = mlen;
1203 }
1204
1205 void
1206 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1207 src_reg offset)
1208 {
1209 /* Set the surface read offset. */
1210 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1211
1212 /* Emit the instruction. Note that this maps to the normal SIMD8
1213 * untyped surface read message, but that's OK because unused
1214 * channels will be masked out.
1215 */
1216 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1217 brw_message_reg(0),
1218 src_reg(surf_index), src_reg(1));
1219 inst->mlen = 1;
1220 }
1221
1222 void
1223 vec4_visitor::emit_ndc_computation()
1224 {
1225 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1226 return;
1227
1228 /* Get the position */
1229 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1230
1231 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1232 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1233 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1234
1235 current_annotation = "NDC";
1236 dst_reg ndc_w = ndc;
1237 ndc_w.writemask = WRITEMASK_W;
1238 src_reg pos_w = pos;
1239 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1240 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1241
1242 dst_reg ndc_xyz = ndc;
1243 ndc_xyz.writemask = WRITEMASK_XYZ;
1244
1245 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1246 }
1247
1248 void
1249 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1250 {
1251 if (devinfo->gen < 6 &&
1252 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1253 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1254 devinfo->has_negative_rhw_bug)) {
1255 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1256 dst_reg header1_w = header1;
1257 header1_w.writemask = WRITEMASK_W;
1258
1259 emit(MOV(header1, 0u));
1260
1261 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1262 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1263
1264 current_annotation = "Point size";
1265 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1266 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1267 }
1268
1269 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1270 current_annotation = "Clipping flags";
1271 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1272 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1273
1274 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1275 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1276 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1277
1278 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1279 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1280 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1281 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1282 }
1283
1284 /* i965 clipping workaround:
1285 * 1) Test for -ve rhw
1286 * 2) If set,
1287 * set ndc = (0,0,0,0)
1288 * set ucp[6] = 1
1289 *
1290 * Later, clipping will detect ucp[6] and ensure the primitive is
1291 * clipped against all fixed planes.
1292 */
1293 if (devinfo->has_negative_rhw_bug &&
1294 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1295 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1296 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1297 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1298 vec4_instruction *inst;
1299 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1300 inst->predicate = BRW_PREDICATE_NORMAL;
1301 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1302 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1303 inst->predicate = BRW_PREDICATE_NORMAL;
1304 }
1305
1306 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1307 } else if (devinfo->gen < 6) {
1308 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1309 } else {
1310 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1311 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1312 dst_reg reg_w = reg;
1313 reg_w.writemask = WRITEMASK_W;
1314 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1315 reg_as_src.type = reg_w.type;
1316 reg_as_src.swizzle = brw_swizzle_for_size(1);
1317 emit(MOV(reg_w, reg_as_src));
1318 }
1319 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1320 dst_reg reg_y = reg;
1321 reg_y.writemask = WRITEMASK_Y;
1322 reg_y.type = BRW_REGISTER_TYPE_D;
1323 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1324 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1325 }
1326 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1327 dst_reg reg_z = reg;
1328 reg_z.writemask = WRITEMASK_Z;
1329 reg_z.type = BRW_REGISTER_TYPE_D;
1330 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1331 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1332 }
1333 }
1334 }
1335
1336 vec4_instruction *
1337 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1338 {
1339 assert(varying < VARYING_SLOT_MAX);
1340 assert(output_reg[varying].type == reg.type);
1341 current_annotation = output_reg_annotation[varying];
1342 if (output_reg[varying].file != BAD_FILE)
1343 return emit(MOV(reg, src_reg(output_reg[varying])));
1344 else
1345 return NULL;
1346 }
1347
1348 void
1349 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1350 {
1351 reg.type = BRW_REGISTER_TYPE_F;
1352 output_reg[varying].type = reg.type;
1353
1354 switch (varying) {
1355 case VARYING_SLOT_PSIZ:
1356 {
1357 /* PSIZ is always in slot 0, and is coupled with other flags. */
1358 current_annotation = "indices, point width, clip flags";
1359 emit_psiz_and_flags(reg);
1360 break;
1361 }
1362 case BRW_VARYING_SLOT_NDC:
1363 current_annotation = "NDC";
1364 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1365 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1366 break;
1367 case VARYING_SLOT_POS:
1368 current_annotation = "gl_Position";
1369 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1370 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1371 break;
1372 case VARYING_SLOT_EDGE:
1373 /* This is present when doing unfilled polygons. We're supposed to copy
1374 * the edge flag from the user-provided vertex array
1375 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1376 * of that attribute (starts as 1.0f). This is then used in clipping to
1377 * determine which edges should be drawn as wireframe.
1378 */
1379 current_annotation = "edge flag";
1380 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1381 glsl_type::float_type, WRITEMASK_XYZW))));
1382 break;
1383 case BRW_VARYING_SLOT_PAD:
1384 /* No need to write to this slot */
1385 break;
1386 default:
1387 emit_generic_urb_slot(reg, varying);
1388 break;
1389 }
1390 }
1391
1392 static int
1393 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1394 {
1395 if (devinfo->gen >= 6) {
1396 /* URB data written (does not include the message header reg) must
1397 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1398 * section 5.4.3.2.2: URB_INTERLEAVED.
1399 *
1400 * URB entries are allocated on a multiple of 1024 bits, so an
1401 * extra 128 bits written here to make the end align to 256 is
1402 * no problem.
1403 */
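      /* mlen includes the message header register, so the URB payload is
       * mlen - 1; rounding mlen up to the next odd value keeps that payload
       * an even number of registers.
       */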
1404 if ((mlen % 2) != 1)
1405 mlen++;
1406 }
1407
1408 return mlen;
1409 }
1410
1411
1412 /**
1413 * Generates the VUE payload plus the necessary URB write instructions to
1414 * output it.
1415 *
1416 * The VUE layout is documented in Volume 2a.
1417 */
1418 void
1419 vec4_visitor::emit_vertex()
1420 {
1421 /* MRF 0 is reserved for the debugger, so start with message header
1422 * in MRF 1.
1423 */
1424 int base_mrf = 1;
1425 int mrf = base_mrf;
1426 /* In the process of generating our URB write message contents, we
1427 * may need to unspill a register or load from an array. Those
1428 * reads would use MRFs 14-15.
1429 */
1430 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1431
1432 /* The following assertion verifies that max_usable_mrf causes an
1433 * even-numbered amount of URB write data, which will meet gen6's
1434 * requirements for length alignment.
1435 */
1436 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1437
1438 /* First mrf is the g0-based message header containing URB handles and
1439 * such.
1440 */
1441 emit_urb_write_header(mrf++);
1442
1443 if (devinfo->gen < 6) {
1444 emit_ndc_computation();
1445 }
1446
1447 /* We may need to split this up into several URB writes, so do them in a
1448 * loop.
1449 */
1450 int slot = 0;
1451 bool complete = false;
1452 do {
1453 /* URB offset is in URB row increments, and each of our MRFs is half of
1454 * one of those, since we're doing interleaved writes.
1455 */
1456 int offset = slot / 2;
1457
1458 mrf = base_mrf + 1;
1459 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1460 emit_urb_slot(dst_reg(MRF, mrf++),
1461 prog_data->vue_map.slot_to_varying[slot]);
1462
1463 /* If this was max_usable_mrf, we can't fit anything more into this
1464 * URB WRITE. Same thing if we reached the maximum length available.
1465 */
1466 if (mrf > max_usable_mrf ||
1467 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1468 slot++;
1469 break;
1470 }
1471 }
1472
1473 complete = slot >= prog_data->vue_map.num_slots;
1474 current_annotation = "URB write";
1475 vec4_instruction *inst = emit_urb_write_opcode(complete);
1476 inst->base_mrf = base_mrf;
1477 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1478 inst->offset += offset;
1479 } while(!complete);
1480 }
1481
1482
1483 src_reg
1484 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1485 src_reg *reladdr, int reg_offset)
1486 {
1487 /* Because we store the values to scratch interleaved like our
1488 * vertex data, we need to scale the vec4 index by 2.
1489 */
1490 int message_header_scale = 2;
1491
1492 /* Pre-gen6, the message header uses byte offsets instead of vec4
1493 * (16-byte) offset units.
1494 */
1495 if (devinfo->gen < 6)
1496 message_header_scale *= 16;
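   /* The net effect is that consecutive logical vec4 registers end up two
    * scratch slots apart: a vec4-unit offset of 2 on gen6+, or a byte
    * offset of 2 * 16 = 32 on older parts.
    */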
1497
1498 if (reladdr) {
1499 src_reg index = src_reg(this, glsl_type::int_type);
1500
1501 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1502 src_reg(reg_offset)));
1503 emit_before(block, inst, MUL(dst_reg(index), index,
1504 src_reg(message_header_scale)));
1505
1506 return index;
1507 } else {
1508 return src_reg(reg_offset * message_header_scale);
1509 }
1510 }
1511
1512 src_reg
1513 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1514 src_reg *reladdr, int reg_offset)
1515 {
1516 if (reladdr) {
1517 src_reg index = src_reg(this, glsl_type::int_type);
1518
1519 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1520 src_reg(reg_offset)));
1521
1522 /* Pre-gen6, the message header uses byte offsets instead of vec4
1523 * (16-byte) offset units.
1524 */
1525 if (devinfo->gen < 6) {
1526 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1527 }
1528
1529 return index;
1530 } else if (devinfo->gen >= 8) {
1531 /* Store the offset in a GRF so we can send-from-GRF. */
1532 src_reg offset = src_reg(this, glsl_type::int_type);
1533 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1534 return offset;
1535 } else {
1536 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1537 return src_reg(reg_offset * message_header_scale);
1538 }
1539 }
1540
1541 /**
1542 * Emits an instruction before @inst to load the value named by @orig_src
1543 * from scratch space at @base_offset to @temp.
1544 *
1545 * @base_offset is measured in 32-byte units (the size of a register).
1546 */
1547 void
1548 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1549 dst_reg temp, src_reg orig_src,
1550 int base_offset)
1551 {
1552 int reg_offset = base_offset + orig_src.reg_offset;
1553 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1554 reg_offset);
1555
1556 emit_before(block, inst, SCRATCH_READ(temp, index));
1557 }
1558
1559 /**
1560 * Emits an instruction after @inst to store the value to be written
1561 * to @orig_dst to scratch space at @base_offset, from @temp.
1562 *
1563 * @base_offset is measured in 32-byte units (the size of a register).
1564 */
1565 void
1566 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1567 int base_offset)
1568 {
1569 int reg_offset = base_offset + inst->dst.reg_offset;
1570 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1571 reg_offset);
1572
1573 /* Create a temporary register to store *inst's result in.
1574 *
1575 * We have to be careful in MOVing from our temporary result register in
1576 * the scratch write. If we swizzle from channels of the temporary that
1577 * weren't initialized, it will confuse live interval analysis, which will
1578 * make spilling fail to make progress.
1579 */
1580 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1581 inst->dst.type),
1582 brw_swizzle_for_mask(inst->dst.writemask));
1583 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1584 inst->dst.writemask));
1585 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1586 if (inst->opcode != BRW_OPCODE_SEL)
1587 write->predicate = inst->predicate;
1588 write->ir = inst->ir;
1589 write->annotation = inst->annotation;
1590 inst->insert_after(block, write);
1591
1592 inst->dst.file = temp.file;
1593 inst->dst.reg = temp.reg;
1594 inst->dst.reg_offset = temp.reg_offset;
1595 inst->dst.reladdr = NULL;
1596 }
1597
1598 /**
1599 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1600 * adds the scratch read(s) before \p inst. The function also checks for
1601 * recursive reladdr scratch accesses, issuing the corresponding scratch
1602 * loads and rewriting reladdr references accordingly.
1603 *
1604 * \return \p src if it did not require a scratch load, otherwise, the
1605 * register holding the result of the scratch load that the caller should
1606 * use to rewrite src.
1607 */
1608 src_reg
1609 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1610 vec4_instruction *inst, src_reg src)
1611 {
1612 /* Resolve recursive reladdr scratch access by calling ourselves
1613 * with src.reladdr
1614 */
1615 if (src.reladdr)
1616 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1617 *src.reladdr);
1618
1619 /* Now handle scratch access on src */
1620 if (src.file == GRF && scratch_loc[src.reg] != -1) {
1621 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1622 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1623 src.reg = temp.reg;
1624 src.reg_offset = temp.reg_offset;
1625 src.reladdr = NULL;
1626 }
1627
1628 return src;
1629 }
1630
1631 /**
1632 * We can't generally support array access in GRF space, because a
1633 * single instruction's destination can only span 2 contiguous
1634 * registers. So, we send all GRF arrays that get variable index
1635 * access to scratch space.
1636 */
1637 void
1638 vec4_visitor::move_grf_array_access_to_scratch()
1639 {
1640 int scratch_loc[this->alloc.count];
1641 memset(scratch_loc, -1, sizeof(scratch_loc));
1642
1643 /* First, calculate the set of virtual GRFs that need to be punted
1644 * to scratch due to having any array access on them, and where in
1645 * scratch.
1646 */
1647 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1648 if (inst->dst.file == GRF && inst->dst.reladdr) {
1649 if (scratch_loc[inst->dst.reg] == -1) {
1650 scratch_loc[inst->dst.reg] = last_scratch;
1651 last_scratch += this->alloc.sizes[inst->dst.reg];
1652 }
1653
1654 for (src_reg *iter = inst->dst.reladdr;
1655 iter->reladdr;
1656 iter = iter->reladdr) {
1657 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1658 scratch_loc[iter->reg] = last_scratch;
1659 last_scratch += this->alloc.sizes[iter->reg];
1660 }
1661 }
1662 }
1663
1664 for (int i = 0 ; i < 3; i++) {
1665 for (src_reg *iter = &inst->src[i];
1666 iter->reladdr;
1667 iter = iter->reladdr) {
1668 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1669 scratch_loc[iter->reg] = last_scratch;
1670 last_scratch += this->alloc.sizes[iter->reg];
1671 }
1672 }
1673 }
1674 }
1675
1676 /* Now, for anything that will be accessed through scratch, rewrite
1677 * it to load/store. Note that this is a _safe list walk, because
1678 * we may generate a new scratch_write instruction after the one
1679 * we're processing.
1680 */
1681 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1682 /* Set up the annotation tracking for new generated instructions. */
1683 base_ir = inst->ir;
1684 current_annotation = inst->annotation;
1685
1686 /* First handle scratch access on the dst. Notice we have to handle
1687 * the case where the dst's reladdr also points to scratch space.
1688 */
1689 if (inst->dst.reladdr)
1690 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1691 *inst->dst.reladdr);
1692
1693 /* Now that we have handled any (possibly recursive) reladdr scratch
1694 * accesses for dst we can safely do the scratch write for dst itself
1695 */
1696 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1697 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1698
1699 /* Now handle scratch access on any src. In this case, since inst->src[i]
1700 * already is a src_reg, we can just call emit_resolve_reladdr with
1701 * inst->src[i] and it will take care of handling scratch loads for
1702 * both src and src.reladdr (recursively).
1703 */
1704 for (int i = 0 ; i < 3; i++) {
1705 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1706 inst->src[i]);
1707 }
1708 }
1709 }
1710
1711 /**
1712 * Emits an instruction before @inst to load the value named by @orig_src
1713 * from the pull constant buffer (surface) at @base_offset to @temp.
1714 */
1715 void
1716 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1717 dst_reg temp, src_reg orig_src,
1718 int base_offset)
1719 {
1720 int reg_offset = base_offset + orig_src.reg_offset;
1721 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1722 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1723 reg_offset);
1724
1725 emit_pull_constant_load_reg(temp,
1726 index,
1727 offset,
1728 block, inst);
1729 }
1730
1731 /**
1732 * Implements array access of uniforms by inserting a
1733 * PULL_CONSTANT_LOAD instruction.
1734 *
1735 * Unlike temporary GRF array access (where we don't support it due to
1736 * the difficulty of doing relative addressing on instruction
1737 * destinations), we could potentially do array access of uniforms
1738 * that were loaded in GRF space as push constants. In real-world
1739 * usage we've seen, though, the arrays being used are always larger
1740 * than we could load as push constants, so just always move all
1741 * uniform array access out to a pull constant buffer.
1742 */
1743 void
1744 vec4_visitor::move_uniform_array_access_to_pull_constants()
1745 {
1746 int pull_constant_loc[this->uniforms];
1747 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1748 bool nested_reladdr;
1749
1750 /* Walk through and find array access of uniforms. Put a copy of that
1751 * uniform in the pull constant buffer.
1752 *
1753 * Note that we don't move constant-indexed accesses to arrays. No
1754 * testing has been done of the performance impact of this choice.
1755 */
1756 do {
1757 nested_reladdr = false;
1758
1759 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1760 for (int i = 0 ; i < 3; i++) {
1761 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1762 continue;
1763
1764 int uniform = inst->src[i].reg;
1765
1766 if (inst->src[i].reladdr->reladdr)
1767 nested_reladdr = true; /* will need another pass */
1768
1769 /* If this array isn't already present in the pull constant buffer,
1770 * add it.
1771 */
1772 if (pull_constant_loc[uniform] == -1) {
1773 const gl_constant_value **values =
1774 &stage_prog_data->param[uniform * 4];
1775
1776 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1777
1778 assert(uniform < uniform_array_size);
1779 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1780 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1781 = values[j];
1782 }
1783 }
1784
1785 /* Set up the annotation tracking for new generated instructions. */
1786 base_ir = inst->ir;
1787 current_annotation = inst->annotation;
1788
1789 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1790
1791 emit_pull_constant_load(block, inst, temp, inst->src[i],
1792 pull_constant_loc[uniform]);
1793
1794 inst->src[i].file = temp.file;
1795 inst->src[i].reg = temp.reg;
1796 inst->src[i].reg_offset = temp.reg_offset;
1797 inst->src[i].reladdr = NULL;
1798 }
1799 }
1800 } while (nested_reladdr);
1801
1802 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1803 * no need to track them as larger-than-vec4 objects. This will be
1804 * relied on in cutting out unused uniform vectors from push
1805 * constants.
1806 */
1807 split_uniform_registers();
1808 }
1809
1810 void
1811 vec4_visitor::resolve_ud_negate(src_reg *reg)
1812 {
1813 if (reg->type != BRW_REGISTER_TYPE_UD ||
1814 !reg->negate)
1815 return;
1816
1817 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1818 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1819 *reg = temp;
1820 }
1821
1822 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1823 void *log_data,
1824 const struct brw_sampler_prog_key_data *key_tex,
1825 struct brw_vue_prog_data *prog_data,
1826 const nir_shader *shader,
1827 void *mem_ctx,
1828 bool no_spills,
1829 int shader_time_index)
1830 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1831 key_tex(key_tex),
1832 prog_data(prog_data),
1833 fail_msg(NULL),
1834 first_non_payload_grf(0),
1835 need_all_constants_in_pull_buffer(false),
1836 no_spills(no_spills),
1837 shader_time_index(shader_time_index),
1838 last_scratch(0)
1839 {
1840 this->failed = false;
1841
1842 this->base_ir = NULL;
1843 this->current_annotation = NULL;
1844 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1845
1846 this->virtual_grf_start = NULL;
1847 this->virtual_grf_end = NULL;
1848 this->live_intervals = NULL;
1849
1850 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1851
1852 this->uniforms = 0;
1853
1854 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1855 * at least one. See setup_uniforms() in brw_vec4.cpp.
1856 */
1857 this->uniform_array_size = 1;
1858 if (prog_data) {
1859 this->uniform_array_size =
1860 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1861 }
1862
1863 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1864 }
1865
1866 vec4_visitor::~vec4_visitor()
1867 {
1868 }
1869
1870
1871 void
1872 vec4_visitor::fail(const char *format, ...)
1873 {
1874 va_list va;
1875 char *msg;
1876
1877 if (failed)
1878 return;
1879
1880 failed = true;
1881
1882 va_start(va, format);
1883 msg = ralloc_vasprintf(mem_ctx, format, va);
1884 va_end(va);
1885 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1886
1887 this->fail_msg = msg;
1888
1889 if (debug_enabled) {
1890 fprintf(stderr, "%s", msg);
1891 }
1892 }
1893
1894 } /* namespace brw */