src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 src_reg
 280 vec4_visitor::fix_3src_operand(const src_reg &src)
 281 {
 282    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 283     * able to use vertical stride of zero to replicate the vec4 uniform, like
 284     *
 285     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 286     *
 287     * But you can't, since vertical stride is always four in three-source
 288     * instructions. Instead, insert a MOV instruction to do the replication so
 289     * that the three-source instruction can consume it.
 290     */
 291
 292    /* The MOV is only needed if the source is a uniform or immediate. */
 293    if (src.file != UNIFORM && src.file != IMM)
 294       return src;
 295
 296    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 297       return src;
 298
 299    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 300    expanded.type = src.type;
 301    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 302    return src_reg(expanded);
 303 }
 304
 305 src_reg
 306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 307 {
 308    if (!src.abs && !src.negate)
 309       return src;
 310
 311    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 312    resolved.type = src.type;
 313    emit(MOV(resolved, src));
 314
 315    return src_reg(resolved);
 316 }
 317
 318 src_reg
 319 vec4_visitor::fix_math_operand(const src_reg &src)
 320 {
 321    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 322       return src;
 323
 324    /* The gen6 math instruction ignores the source modifiers --
 325     * swizzle, abs, negate, and at least some parts of the register
 326     * region description.
 327     *
 328     * Rather than trying to enumerate all these cases, *always* expand the
 329     * operand to a temp GRF for gen6.
 330     *
 331     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 332     * can't use.
 333     */
 334
 335    if (devinfo->gen == 7 && src.file != IMM)
 336       return src;
 337
 338    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 339    expanded.type = src.type;
 340    emit(MOV(expanded, src));
 341    return src_reg(expanded);
 342 }
 343
 344 vec4_instruction *
 345 vec4_visitor::emit_math(enum opcode opcode,
 346                         const dst_reg &dst,
 347                         const src_reg &src0, const src_reg &src1)
 348 {
 349    vec4_instruction *math =
 350       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 351
 352    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 353       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 354       math->dst = dst_reg(this, glsl_type::vec4_type);
 355       math->dst.type = dst.type;
 356       math = emit(MOV(dst, src_reg(math->dst)));
 357    } else if (devinfo->gen < 6) {
 358       math->base_mrf = 1;
 359       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 360    }
 361
 362    return math;
 363 }
 364
 365 void
 366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 367 {
 368    if (devinfo->gen < 7) {
 369       unreachable("ir_unop_pack_half_2x16 should be lowered");
 370    }
 371
 372    assert(dst.type == BRW_REGISTER_TYPE_UD);
 373    assert(src0.type == BRW_REGISTER_TYPE_F);
 374
 375    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 376     *
 377     *   Because this instruction does not have a 16-bit floating-point type,
 378     *   the destination data type must be Word (W).
 379     *
 380     *   The destination must be DWord-aligned and specify a horizontal stride
 381     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 382     *   each destination channel and the upper word is not modified.
 383     *
 384     * The above restriction implies that the f32to16 instruction must use
 385     * align1 mode, because only in align1 mode is it possible to specify
 386     * horizontal stride.  We choose here to defy the hardware docs and emit
 387     * align16 instructions.
 388     *
 389     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 390     * instructions. I was partially successful in that the code passed all
 391     * tests.  However, the code was dubiously correct and fragile, and the
 392     * tests were not harsh enough to probe that frailty. Not trusting the
 393     * code, I chose instead to remain in align16 mode in defiance of the hw
 394     * docs).
 395     *
 396     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 397     * simulator, emitting a f32to16 in align16 mode with UD as destination
 398     * data type is safe. The behavior differs from that specified in the PRM
 399     * in that the upper word of each destination channel is cleared to 0.
 400     */
 401
 402    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 403    src_reg tmp_src(tmp_dst);
 404
 405 #if 0
 406    /* Verify the undocumented behavior on which the following instructions
 407     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 408     * then the result of the bit-or instruction below will be incorrect.
 409     *
 410     * You should inspect the disasm output in order to verify that the MOV is
 411     * not optimized away.
 412     */
 413    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 414 #endif
 415
 416    /* Give tmp the form below, where "." means untouched.
 417     *
 418     *     w z          y          x w z          y          x
 419     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 420     *
 421     * That the upper word of each write-channel be 0 is required for the
 422     * following bit-shift and bit-or instructions to work. Note that this
 423     * relies on the undocumented hardware behavior mentioned above.
 424     */
 425    tmp_dst.writemask = WRITEMASK_XY;
 426    emit(F32TO16(tmp_dst, src0));
 427
 428    /* Give the write-channels of dst the form:
 429     *   0xhhhh0000
 430     */
 431    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 432    emit(SHL(dst, tmp_src, src_reg(16u)));
 433
 434    /* Finally, give the write-channels of dst the form of packHalf2x16's
 435     * output:
 436     *   0xhhhhllll
 437     */
 438    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 439    emit(OR(dst, src_reg(dst), tmp_src));
 440 }
 441
 442 void
 443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 444 {
 445    if (devinfo->gen < 7) {
 446       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 447    }
 448
 449    assert(dst.type == BRW_REGISTER_TYPE_F);
 450    assert(src0.type == BRW_REGISTER_TYPE_UD);
 451
 452    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 453     *
 454     *   Because this instruction does not have a 16-bit floating-point type,
 455     *   the source data type must be Word (W). The destination type must be
 456     *   F (Float).
 457     *
 458     * To use W as the source data type, we must adjust horizontal strides,
 459     * which is only possible in align1 mode. All my [chadv] attempts at
 460     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 461     * Piglit tests, so I gave up.
 462     *
 463     * I've verified that, on gen7 hardware and the simulator, it is safe to
 464     * emit f16to32 in align16 mode with UD as source data type.
 465     */
 466
 467    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 468    src_reg tmp_src(tmp_dst);
 469
 470    tmp_dst.writemask = WRITEMASK_X;
 471    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 472
 473    tmp_dst.writemask = WRITEMASK_Y;
 474    emit(SHR(tmp_dst, src0, src_reg(16u)));
 475
 476    dst.writemask = WRITEMASK_XY;
 477    emit(F16TO32(dst, tmp_src));
 478 }
 479
 480 void
 481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 482 {
 483    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 484     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 485     * is not suitable to generate the shift values, but we can use the packed
 486     * vector float and a type-converting MOV.
 487     */
 488    dst_reg shift(this, glsl_type::uvec4_type);
 489    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 490
 491    dst_reg shifted(this, glsl_type::uvec4_type);
 492    src0.swizzle = BRW_SWIZZLE_XXXX;
 493    emit(SHR(shifted, src0, src_reg(shift)));
 494
 495    shifted.type = BRW_REGISTER_TYPE_UB;
 496    dst_reg f(this, glsl_type::vec4_type);
 497    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 498
 499    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 500 }
 501
 502 void
 503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 504 {
 505    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 506     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 507     * is not suitable to generate the shift values, but we can use the packed
 508     * vector float and a type-converting MOV.
 509     */
 510    dst_reg shift(this, glsl_type::uvec4_type);
 511    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 512
 513    dst_reg shifted(this, glsl_type::uvec4_type);
 514    src0.swizzle = BRW_SWIZZLE_XXXX;
 515    emit(SHR(shifted, src0, src_reg(shift)));
 516
 517    shifted.type = BRW_REGISTER_TYPE_B;
 518    dst_reg f(this, glsl_type::vec4_type);
 519    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 520
 521    dst_reg scaled(this, glsl_type::vec4_type);
 522    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 523
 524    dst_reg max(this, glsl_type::vec4_type);
 525    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 526    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 527 }
 528
 529 void
 530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 531 {
 532    dst_reg saturated(this, glsl_type::vec4_type);
 533    vec4_instruction *inst = emit(MOV(saturated, src0));
 534    inst->saturate = true;
 535
 536    dst_reg scaled(this, glsl_type::vec4_type);
 537    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 538
 539    dst_reg rounded(this, glsl_type::vec4_type);
 540    emit(RNDE(rounded, src_reg(scaled)));
 541
 542    dst_reg u(this, glsl_type::uvec4_type);
 543    emit(MOV(u, src_reg(rounded)));
 544
 545    src_reg bytes(u);
 546    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 547 }
 548
 549 void
 550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 551 {
 552    dst_reg max(this, glsl_type::vec4_type);
 553    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 554
 555    dst_reg min(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 557
 558    dst_reg scaled(this, glsl_type::vec4_type);
 559    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 560
 561    dst_reg rounded(this, glsl_type::vec4_type);
 562    emit(RNDE(rounded, src_reg(scaled)));
 563
 564    dst_reg i(this, glsl_type::ivec4_type);
 565    emit(MOV(i, src_reg(rounded)));
 566
 567    src_reg bytes(i);
 568    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 569 }
 570
 571 /**
 572  * Returns the minimum number of vec4 elements needed to pack a type.
 573  *
 574  * For simple types, it will return 1 (a single vec4); for matrices, the
 575  * number of columns; for array and struct, the sum of the vec4_size of
 576  * each of its elements; and for sampler and atomic, zero.
 577  *
 578  * This method is useful to calculate how much register space is needed to
 579  * store a particular type.
 580  */
 581 extern "C" int
 582 type_size_vec4(const struct glsl_type *type)
 583 {
 584    unsigned int i;
 585    int size;
 586
 587    switch (type->base_type) {
 588    case GLSL_TYPE_UINT:
 589    case GLSL_TYPE_INT:
 590    case GLSL_TYPE_FLOAT:
 591    case GLSL_TYPE_BOOL:
 592       if (type->is_matrix()) {
 593          return type->matrix_columns;
 594       } else {
 595          /* Regardless of size of vector, it gets a vec4. This is bad
 596           * packing for things like floats, but otherwise arrays become a
 597           * mess.  Hopefully a later pass over the code can pack scalars
 598           * down if appropriate.
 599           */
 600          return 1;
 601       }
 602    case GLSL_TYPE_ARRAY:
 603       assert(type->length > 0);
 604       return type_size_vec4(type->fields.array) * type->length;
 605    case GLSL_TYPE_STRUCT:
 606       size = 0;
 607       for (i = 0; i < type->length; i++) {
 608          size += type_size_vec4(type->fields.structure[i].type);
 609       }
 610       return size;
 611    case GLSL_TYPE_SUBROUTINE:
 612       return 1;
 613
 614    case GLSL_TYPE_SAMPLER:
 615       /* Samplers take up no register space, since they're baked in at
 616        * link time.
 617        */
 618       return 0;
 619    case GLSL_TYPE_ATOMIC_UINT:
 620       return 0;
 621    case GLSL_TYPE_IMAGE:
 622       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 623    case GLSL_TYPE_VOID:
 624    case GLSL_TYPE_DOUBLE:
 625    case GLSL_TYPE_ERROR:
 626    case GLSL_TYPE_INTERFACE:
 627    case GLSL_TYPE_FUNCTION:
 628       unreachable("not reached");
 629    }
 630
 631    return 0;
 632 }
 633
 634 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 635 {
 636    init();
 637
 638    this->file = GRF;
 639    this->reg = v->alloc.allocate(type_size_vec4(type));
 640
 641    if (type->is_array() || type->is_record()) {
 642       this->swizzle = BRW_SWIZZLE_NOOP;
 643    } else {
 644       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 645    }
 646
 647    this->type = brw_type_for_base_type(type);
 648 }
 649
 650 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 651 {
 652    assert(size > 0);
 653
 654    init();
 655
 656    this->file = GRF;
 657    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 658
 659    this->swizzle = BRW_SWIZZLE_NOOP;
 660
 661    this->type = brw_type_for_base_type(type);
 662 }
 663
 664 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 665 {
 666    init();
 667
 668    this->file = GRF;
 669    this->reg = v->alloc.allocate(type_size_vec4(type));
 670
 671    if (type->is_array() || type->is_record()) {
 672       this->writemask = WRITEMASK_XYZW;
 673    } else {
 674       this->writemask = (1 << type->vector_elements) - 1;
 675    }
 676
 677    this->type = brw_type_for_base_type(type);
 678 }
 679
 680 vec4_instruction *
 681 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 682                           src_reg src0, src_reg src1)
 683 {
 684    vec4_instruction *inst;
 685
 686    if (devinfo->gen >= 6) {
 687       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 688       inst->conditional_mod = conditionalmod;
 689    } else {
 690       emit(CMP(dst, src0, src1, conditionalmod));
 691
 692       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 693       inst->predicate = BRW_PREDICATE_NORMAL;
 694    }
 695
 696    return inst;
 697 }
 698
 699 vec4_instruction *
 700 vec4_visitor::emit_lrp(const dst_reg &dst,
 701                        const src_reg &x, const src_reg &y, const src_reg &a)
 702 {
 703    if (devinfo->gen >= 6) {
 704       /* Note that the instruction's argument order is reversed from GLSL
 705        * and the IR.
 706        */
 707      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 708                      fix_3src_operand(x)));
 709    } else {
 710       /* Earlier generations don't support three source operations, so we
 711        * need to emit x*(1-a) + y*a.
 712        */
 713       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 714       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 715       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 716       y_times_a.writemask           = dst.writemask;
 717       one_minus_a.writemask         = dst.writemask;
 718       x_times_one_minus_a.writemask = dst.writemask;
 719
 720       emit(MUL(y_times_a, y, a));
 721       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
 722       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 723       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 724    }
 725 }
 726
 727 /**
 728  * Emits the instructions needed to perform a pull constant load. before_block
 729  * and before_inst can be NULL in which case the instruction will be appended
 730  * to the end of the instruction list.
 731  */
 732 void
 733 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 734                                           src_reg surf_index,
 735                                           src_reg offset_reg,
 736                                           bblock_t *before_block,
 737                                           vec4_instruction *before_inst)
 738 {
 739    assert((before_inst == NULL && before_block == NULL) ||
 740           (before_inst && before_block));
 741
 742    vec4_instruction *pull;
 743
 744    if (devinfo->gen >= 9) {
 745       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 746       src_reg header(this, glsl_type::uvec4_type, 2);
 747
 748       pull = new(mem_ctx)
 749          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 750                           dst_reg(header));
 751
 752       if (before_inst)
 753          emit_before(before_block, before_inst, pull);
 754       else
 755          emit(pull);
 756
 757       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 758                                  offset_reg.type);
 759       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 760
 761       if (before_inst)
 762          emit_before(before_block, before_inst, pull);
 763       else
 764          emit(pull);
 765
 766       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 767                                            dst,
 768                                            surf_index,
 769                                            header);
 770       pull->mlen = 2;
 771       pull->header_size = 1;
 772    } else if (devinfo->gen >= 7) {
 773       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 774
 775       grf_offset.type = offset_reg.type;
 776
 777       pull = MOV(grf_offset, offset_reg);
 778
 779       if (before_inst)
 780          emit_before(before_block, before_inst, pull);
 781       else
 782          emit(pull);
 783
 784       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 785                                            dst,
 786                                            surf_index,
 787                                            src_reg(grf_offset));
 788       pull->mlen = 1;
 789    } else {
 790       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 791                                            dst,
 792                                            surf_index,
 793                                            offset_reg);
 794       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 795       pull->mlen = 1;
 796    }
 797
 798    if (before_inst)
 799       emit_before(before_block, before_inst, pull);
 800    else
 801       emit(pull);
 802 }
 803
 804 src_reg
 805 vec4_visitor::emit_uniformize(const src_reg &src)
 806 {
 807    const src_reg chan_index(this, glsl_type::uint_type);
 808    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 809                               src.type);
 810
 811    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 812       ->force_writemask_all = true;
 813    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 814       ->force_writemask_all = true;
 815
 816    return src_reg(dst);
 817 }
 818
 819 src_reg
 820 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 821                              src_reg coordinate, src_reg sampler)
 822 {
 823    vec4_instruction *inst =
 824       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 825                                     dst_reg(this, glsl_type::uvec4_type));
 826    inst->base_mrf = 2;
 827    inst->src[1] = sampler;
 828
 829    int param_base;
 830
 831    if (devinfo->gen >= 9) {
 832       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 833       vec4_instruction *header_inst = new(mem_ctx)
 834          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 835                           dst_reg(MRF, inst->base_mrf));
 836
 837       emit(header_inst);
 838
 839       inst->mlen = 2;
 840       inst->header_size = 1;
 841       param_base = inst->base_mrf + 1;
 842    } else {
 843       inst->mlen = 1;
 844       param_base = inst->base_mrf;
 845    }
 846
 847    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 848    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 849    int zero_mask = 0xf & ~coord_mask;
 850
 851    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 852             coordinate));
 853
 854    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 855             src_reg(0)));
 856
 857    emit(inst);
 858    return src_reg(inst->dst);
 859 }
 860
 861 bool
 862 vec4_visitor::is_high_sampler(src_reg sampler)
 863 {
 864    if (devinfo->gen < 8 && !devinfo->is_haswell)
 865       return false;
 866
 867    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 868 }
 869
 870 void
 871 vec4_visitor::emit_texture(ir_texture_opcode op,
 872                            dst_reg dest,
 873                            const glsl_type *dest_type,
 874                            src_reg coordinate,
 875                            int coord_components,
 876                            src_reg shadow_comparitor,
 877                            src_reg lod, src_reg lod2,
 878                            src_reg sample_index,
 879                            uint32_t constant_offset,
 880                            src_reg offset_value,
 881                            src_reg mcs,
 882                            bool is_cube_array,
 883                            uint32_t sampler,
 884                            src_reg sampler_reg)
 885 {
 886    enum opcode opcode;
 887    switch (op) {
 888    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 889    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 890    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 891    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 892    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
 893    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 894    case ir_tg4: opcode = offset_value.file != BAD_FILE
 895                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 896    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 897    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 898    case ir_txb:
 899       unreachable("TXB is not valid for vertex shaders.");
 900    case ir_lod:
 901       unreachable("LOD is not valid for vertex shaders.");
 902    default:
 903       unreachable("Unrecognized tex op");
 904    }
 905
 906    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
 907       opcode, dst_reg(this, dest_type));
 908
 909    inst->offset = constant_offset;
 910
 911    /* The message header is necessary for:
 912     * - Gen4 (always)
 913     * - Gen9+ for selecting SIMD4x2
 914     * - Texel offsets
 915     * - Gather channel selection
 916     * - Sampler indices too large to fit in a 4-bit value.
 917     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 918     */
 919    inst->header_size =
 920       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 921        inst->offset != 0 || op == ir_tg4 ||
 922        op == ir_texture_samples ||
 923        is_high_sampler(sampler_reg)) ? 1 : 0;
 924    inst->base_mrf = 2;
 925    inst->mlen = inst->header_size;
 926    inst->dst.writemask = WRITEMASK_XYZW;
 927    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 928
 929    inst->src[1] = sampler_reg;
 930
 931    /* MRF for the first parameter */
 932    int param_base = inst->base_mrf + inst->header_size;
 933
 934    if (op == ir_txs || op == ir_query_levels) {
 935       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 936       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 937       inst->mlen++;
 938    } else if (op == ir_texture_samples) {
 939       inst->dst.writemask = WRITEMASK_X;
 940    } else {
 941       /* Load the coordinate */
 942       /* FINISHME: gl_clamp_mask and saturate */
 943       int coord_mask = (1 << coord_components) - 1;
 944       int zero_mask = 0xf & ~coord_mask;
 945
 946       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 947                coordinate));
 948       inst->mlen++;
 949
 950       if (zero_mask != 0) {
 951          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
 952                   src_reg(0)));
 953       }
 954       /* Load the shadow comparitor */
 955       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
 956          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
 957                           WRITEMASK_X),
 958                   shadow_comparitor));
 959          inst->mlen++;
 960       }
 961
 962       /* Load the LOD info */
 963       if (op == ir_tex || op == ir_txl) {
 964          int mrf, writemask;
 965          if (devinfo->gen >= 5) {
 966             mrf = param_base + 1;
 967             if (shadow_comparitor.file != BAD_FILE) {
 968                writemask = WRITEMASK_Y;
 969                /* mlen already incremented */
 970             } else {
 971                writemask = WRITEMASK_X;
 972                inst->mlen++;
 973             }
 974          } else /* devinfo->gen == 4 */ {
 975             mrf = param_base;
 976             writemask = WRITEMASK_W;
 977          }
 978          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
 979       } else if (op == ir_txf) {
 980          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
 981       } else if (op == ir_txf_ms) {
 982          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
 983                   sample_index));
 984          if (devinfo->gen >= 7) {
 985             /* MCS data is in the first channel of `mcs`, but we need to get it into
 986              * the .y channel of the second vec4 of params, so replicate .x across
 987              * the whole vec4 and then mask off everything except .y
 988              */
 989             mcs.swizzle = BRW_SWIZZLE_XXXX;
 990             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
 991                      mcs));
 992          }
 993          inst->mlen++;
 994       } else if (op == ir_txd) {
 995          const brw_reg_type type = lod.type;
 996
 997          if (devinfo->gen >= 5) {
 998             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
 999             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1000             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1001             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1002             inst->mlen++;
1003
1004             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1005                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1006                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1007                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1008                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1009                inst->mlen++;
1010
1011                if (shadow_comparitor.file != BAD_FILE) {
1012                   emit(MOV(dst_reg(MRF, param_base + 2,
1013                                    shadow_comparitor.type, WRITEMASK_Z),
1014                            shadow_comparitor));
1015                }
1016             }
1017          } else /* devinfo->gen == 4 */ {
1018             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1019             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1020             inst->mlen += 2;
1021          }
1022       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1023          if (shadow_comparitor.file != BAD_FILE) {
1024             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1025                      shadow_comparitor));
1026          }
1027
1028          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1029                   offset_value));
1030          inst->mlen++;
1031       }
1032    }
1033
1034    emit(inst);
1035
1036    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1037     * spec requires layers.
1038     */
1039    if (op == ir_txs && is_cube_array) {
1040       emit_math(SHADER_OPCODE_INT_QUOTIENT,
1041                 writemask(inst->dst, WRITEMASK_Z),
1042                 src_reg(inst->dst), src_reg(6));
1043    }
1044
1045    if (devinfo->gen == 6 && op == ir_tg4) {
1046       emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1047    }
1048
1049    swizzle_result(op, dest,
1050                   src_reg(inst->dst), sampler, dest_type);
1051 }
1052
1053 /**
1054  * Apply workarounds for Gen6 gather with UINT/SINT
1055  */
1056 void
1057 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1058 {
1059    if (!wa)
1060       return;
1061
1062    int width = (wa & WA_8BIT) ? 8 : 16;
1063    dst_reg dst_f = dst;
1064    dst_f.type = BRW_REGISTER_TYPE_F;
1065
1066    /* Convert from UNORM to UINT */
1067    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1068    emit(MOV(dst, src_reg(dst_f)));
1069
1070    if (wa & WA_SIGN) {
1071       /* Reinterpret the UINT value as a signed INT value by
1072        * shifting the sign bit into place, then shifting back
1073        * preserving sign.
1074        */
1075       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1076       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1077    }
1078 }
1079
1080 /**
1081  * Set up the gather channel based on the swizzle, for gather4.
1082  */
1083 uint32_t
1084 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1085 {
1086    int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1087    switch (swiz) {
1088       case SWIZZLE_X: return 0;
1089       case SWIZZLE_Y:
1090          /* gather4 sampler is broken for green channel on RG32F --
1091           * we must ask for blue instead.
1092           */
1093          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1094             return 2;
1095          return 1;
1096       case SWIZZLE_Z: return 2;
1097       case SWIZZLE_W: return 3;
1098       default:
1099          unreachable("Not reached"); /* zero, one swizzles handled already */
1100    }
1101 }
1102
1103 void
1104 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1105                              src_reg orig_val, uint32_t sampler,
1106                              const glsl_type *dest_type)
1107 {
1108    int s = key_tex->swizzles[sampler];
1109
1110    dst_reg swizzled_result = dest;
1111
1112    if (op == ir_query_levels) {
1113       /* # levels is in .w */
1114       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1115       emit(MOV(swizzled_result, orig_val));
1116       return;
1117    }
1118
1119    if (op == ir_txs || dest_type == glsl_type::float_type
1120                         || s == SWIZZLE_NOOP || op == ir_tg4) {
1121       emit(MOV(swizzled_result, orig_val));
1122       return;
1123    }
1124
1125
1126    int zero_mask = 0, one_mask = 0, copy_mask = 0;
1127    int swizzle[4] = {0};
1128
1129    for (int i = 0; i < 4; i++) {
1130       switch (GET_SWZ(s, i)) {
1131       case SWIZZLE_ZERO:
1132          zero_mask |= (1 << i);
1133          break;
1134       case SWIZZLE_ONE:
1135          one_mask |= (1 << i);
1136          break;
1137       default:
1138          copy_mask |= (1 << i);
1139          swizzle[i] = GET_SWZ(s, i);
1140          break;
1141       }
1142    }
1143
1144    if (copy_mask) {
1145       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1146       swizzled_result.writemask = copy_mask;
1147       emit(MOV(swizzled_result, orig_val));
1148    }
1149
1150    if (zero_mask) {
1151       swizzled_result.writemask = zero_mask;
1152       emit(MOV(swizzled_result, src_reg(0.0f)));
1153    }
1154
1155    if (one_mask) {
1156       swizzled_result.writemask = one_mask;
1157       emit(MOV(swizzled_result, src_reg(1.0f)));
1158    }
1159 }
1160
/**
 * Placeholder for emitting a vertex on stream \p stream_id.
 *
 * This implementation must never be called: it only marks the path as
 * unreachable and does not read \p stream_id.  (Presumably a
 * geometry-shader specific visitor provides the real implementation --
 * confirm against the class declaration.)
 */
void
vec4_visitor::gs_emit_vertex(int stream_id)
{
   unreachable("not reached");
}
1166
/**
 * Placeholder for ending the current primitive.
 *
 * This implementation must never be called: it only marks the path as
 * unreachable.  (Presumably a geometry-shader specific visitor provides the
 * real implementation -- confirm against the class declaration.)
 */
void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}
1172
1173 void
1174 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1175                                   dst_reg dst, src_reg offset,
1176                                   src_reg src0, src_reg src1)
1177 {
1178    unsigned mlen = 0;
1179
1180    /* Set the atomic operation offset. */
1181    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1182    mlen++;
1183
1184    /* Set the atomic operation arguments. */
1185    if (src0.file != BAD_FILE) {
1186       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1187       mlen++;
1188    }
1189
1190    if (src1.file != BAD_FILE) {
1191       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1192       mlen++;
1193    }
1194
1195    /* Emit the instruction.  Note that this maps to the normal SIMD8
1196     * untyped atomic message on Ivy Bridge, but that's OK because
1197     * unused channels will be masked out.
1198     */
1199    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1200                                  brw_message_reg(0),
1201                                  src_reg(surf_index), src_reg(atomic_op));
1202    inst->mlen = mlen;
1203 }
1204
1205 void
1206 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1207                                         src_reg offset)
1208 {
1209    /* Set the surface read offset. */
1210    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1211
1212    /* Emit the instruction.  Note that this maps to the normal SIMD8
1213     * untyped surface read message, but that's OK because unused
1214     * channels will be masked out.
1215     */
1216    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1217                                  brw_message_reg(0),
1218                                  src_reg(surf_index), src_reg(1));
1219    inst->mlen = 1;
1220 }
1221
1222 void
1223 vec4_visitor::emit_ndc_computation()
1224 {
1225    /* Get the position */
1226    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1227
1228    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1229    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1230    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1231
1232    current_annotation = "NDC";
1233    dst_reg ndc_w = ndc;
1234    ndc_w.writemask = WRITEMASK_W;
1235    src_reg pos_w = pos;
1236    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1237    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1238
1239    dst_reg ndc_xyz = ndc;
1240    ndc_xyz.writemask = WRITEMASK_XYZ;
1241
1242    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1243 }
1244
1245 void
1246 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1247 {
1248    if (devinfo->gen < 6 &&
1249        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1250         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1251         devinfo->has_negative_rhw_bug)) {
1252       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1253       dst_reg header1_w = header1;
1254       header1_w.writemask = WRITEMASK_W;
1255
1256       emit(MOV(header1, 0u));
1257
1258       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1259          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1260
1261          current_annotation = "Point size";
1262          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1263          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1264       }
1265
1266       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1267          current_annotation = "Clipping flags";
1268          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1269          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1270
1271          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1272          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1273          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1274
1275          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1276          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1277          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1278          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1279       }
1280
1281       /* i965 clipping workaround:
1282        * 1) Test for -ve rhw
1283        * 2) If set,
1284        *      set ndc = (0,0,0,0)
1285        *      set ucp[6] = 1
1286        *
1287        * Later, clipping will detect ucp[6] and ensure the primitive is
1288        * clipped against all fixed planes.
1289        */
1290       if (devinfo->has_negative_rhw_bug) {
1291          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1292          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1293          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1294          vec4_instruction *inst;
1295          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1296          inst->predicate = BRW_PREDICATE_NORMAL;
1297          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1298          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1299          inst->predicate = BRW_PREDICATE_NORMAL;
1300       }
1301
1302       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1303    } else if (devinfo->gen < 6) {
1304       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1305    } else {
1306       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1307       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1308          dst_reg reg_w = reg;
1309          reg_w.writemask = WRITEMASK_W;
1310          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1311          reg_as_src.type = reg_w.type;
1312          reg_as_src.swizzle = brw_swizzle_for_size(1);
1313          emit(MOV(reg_w, reg_as_src));
1314       }
1315       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1316          dst_reg reg_y = reg;
1317          reg_y.writemask = WRITEMASK_Y;
1318          reg_y.type = BRW_REGISTER_TYPE_D;
1319          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1320          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1321       }
1322       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1323          dst_reg reg_z = reg;
1324          reg_z.writemask = WRITEMASK_Z;
1325          reg_z.type = BRW_REGISTER_TYPE_D;
1326          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1327          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1328       }
1329    }
1330 }
1331
1332 vec4_instruction *
1333 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1334 {
1335    assert(varying < VARYING_SLOT_MAX);
1336    assert(output_reg[varying].type == reg.type);
1337    current_annotation = output_reg_annotation[varying];
1338    /* Copy the register, saturating if necessary */
1339    return emit(MOV(reg, src_reg(output_reg[varying])));
1340 }
1341
1342 void
1343 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1344 {
1345    reg.type = BRW_REGISTER_TYPE_F;
1346    output_reg[varying].type = reg.type;
1347
1348    switch (varying) {
1349    case VARYING_SLOT_PSIZ:
1350    {
1351       /* PSIZ is always in slot 0, and is coupled with other flags. */
1352       current_annotation = "indices, point width, clip flags";
1353       emit_psiz_and_flags(reg);
1354       break;
1355    }
1356    case BRW_VARYING_SLOT_NDC:
1357       current_annotation = "NDC";
1358       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1359       break;
1360    case VARYING_SLOT_POS:
1361       current_annotation = "gl_Position";
1362       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1363       break;
1364    case VARYING_SLOT_EDGE:
1365       /* This is present when doing unfilled polygons.  We're supposed to copy
1366        * the edge flag from the user-provided vertex array
1367        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1368        * of that attribute (starts as 1.0f).  This is then used in clipping to
1369        * determine which edges should be drawn as wireframe.
1370        */
1371       current_annotation = "edge flag";
1372       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1373                                     glsl_type::float_type, WRITEMASK_XYZW))));
1374       break;
1375    case BRW_VARYING_SLOT_PAD:
1376       /* No need to write to this slot */
1377       break;
1378    default:
1379       emit_generic_urb_slot(reg, varying);
1380       break;
1381    }
1382 }
1383
1384 static int
1385 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1386 {
1387    if (devinfo->gen >= 6) {
1388       /* URB data written (does not include the message header reg) must
1389        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1390        * section 5.4.3.2.2: URB_INTERLEAVED.
1391        *
1392        * URB entries are allocated on a multiple of 1024 bits, so an
1393        * extra 128 bits written here to make the end align to 256 is
1394        * no problem.
1395        */
1396       if ((mlen % 2) != 1)
1397          mlen++;
1398    }
1399
1400    return mlen;
1401 }
1402
1403
1404 /**
1405  * Generates the VUE payload plus the necessary URB write instructions to
1406  * output it.
1407  *
1408  * The VUE layout is documented in Volume 2a.
1409  */
1410 void
1411 vec4_visitor::emit_vertex()
1412 {
1413    /* MRF 0 is reserved for the debugger, so start with message header
1414     * in MRF 1.
1415     */
1416    int base_mrf = 1;
1417    int mrf = base_mrf;
1418    /* In the process of generating our URB write message contents, we
1419     * may need to unspill a register or load from an array.  Those
1420     * reads would use MRFs 14-15.
1421     */
1422    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1423
1424    /* The following assertion verifies that max_usable_mrf causes an
1425     * even-numbered amount of URB write data, which will meet gen6's
1426     * requirements for length alignment.
1427     */
1428    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1429
1430    /* First mrf is the g0-based message header containing URB handles and
1431     * such.
1432     */
1433    emit_urb_write_header(mrf++);
1434
1435    if (devinfo->gen < 6) {
1436       emit_ndc_computation();
1437    }
1438
1439    /* We may need to split this up into several URB writes, so do them in a
1440     * loop.
1441     */
1442    int slot = 0;
1443    bool complete = false;
1444    do {
1445       /* URB offset is in URB row increments, and each of our MRFs is half of
1446        * one of those, since we're doing interleaved writes.
1447        */
1448       int offset = slot / 2;
1449
1450       mrf = base_mrf + 1;
1451       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1452          emit_urb_slot(dst_reg(MRF, mrf++),
1453                        prog_data->vue_map.slot_to_varying[slot]);
1454
1455          /* If this was max_usable_mrf, we can't fit anything more into this
1456           * URB WRITE. Same thing if we reached the maximum length available.
1457           */
1458          if (mrf > max_usable_mrf ||
1459              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1460             slot++;
1461             break;
1462          }
1463       }
1464
1465       complete = slot >= prog_data->vue_map.num_slots;
1466       current_annotation = "URB write";
1467       vec4_instruction *inst = emit_urb_write_opcode(complete);
1468       inst->base_mrf = base_mrf;
1469       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1470       inst->offset += offset;
1471    } while(!complete);
1472 }
1473
1474
1475 src_reg
1476 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1477                                  src_reg *reladdr, int reg_offset)
1478 {
1479    /* Because we store the values to scratch interleaved like our
1480     * vertex data, we need to scale the vec4 index by 2.
1481     */
1482    int message_header_scale = 2;
1483
1484    /* Pre-gen6, the message header uses byte offsets instead of vec4
1485     * (16-byte) offset units.
1486     */
1487    if (devinfo->gen < 6)
1488       message_header_scale *= 16;
1489
1490    if (reladdr) {
1491       src_reg index = src_reg(this, glsl_type::int_type);
1492
1493       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1494                                    src_reg(reg_offset)));
1495       emit_before(block, inst, MUL(dst_reg(index), index,
1496                                    src_reg(message_header_scale)));
1497
1498       return index;
1499    } else {
1500       return src_reg(reg_offset * message_header_scale);
1501    }
1502 }
1503
1504 src_reg
1505 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1506                                        src_reg *reladdr, int reg_offset)
1507 {
1508    if (reladdr) {
1509       src_reg index = src_reg(this, glsl_type::int_type);
1510
1511       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1512                                    src_reg(reg_offset)));
1513
1514       /* Pre-gen6, the message header uses byte offsets instead of vec4
1515        * (16-byte) offset units.
1516        */
1517       if (devinfo->gen < 6) {
1518          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1519       }
1520
1521       return index;
1522    } else if (devinfo->gen >= 8) {
1523       /* Store the offset in a GRF so we can send-from-GRF. */
1524       src_reg offset = src_reg(this, glsl_type::int_type);
1525       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1526       return offset;
1527    } else {
1528       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1529       return src_reg(reg_offset * message_header_scale);
1530    }
1531 }
1532
1533 /**
1534  * Emits an instruction before @inst to load the value named by @orig_src
1535  * from scratch space at @base_offset to @temp.
1536  *
1537  * @base_offset is measured in 32-byte units (the size of a register).
1538  */
1539 void
1540 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1541                                 dst_reg temp, src_reg orig_src,
1542                                 int base_offset)
1543 {
1544    int reg_offset = base_offset + orig_src.reg_offset;
1545    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1546                                       reg_offset);
1547
1548    emit_before(block, inst, SCRATCH_READ(temp, index));
1549 }
1550
1551 /**
1552  * Emits an instruction after @inst to store the value to be written
1553  * to @orig_dst to scratch space at @base_offset, from @temp.
1554  *
1555  * @base_offset is measured in 32-byte units (the size of a register).
1556  */
1557 void
1558 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1559                                  int base_offset)
1560 {
1561    int reg_offset = base_offset + inst->dst.reg_offset;
1562    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1563                                       reg_offset);
1564
1565    /* Create a temporary register to store *inst's result in.
1566     *
1567     * We have to be careful in MOVing from our temporary result register in
1568     * the scratch write.  If we swizzle from channels of the temporary that
1569     * weren't initialized, it will confuse live interval analysis, which will
1570     * make spilling fail to make progress.
1571     */
1572    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1573                                        inst->dst.type),
1574                                 brw_swizzle_for_mask(inst->dst.writemask));
1575    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1576                                        inst->dst.writemask));
1577    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1578    if (inst->opcode != BRW_OPCODE_SEL)
1579       write->predicate = inst->predicate;
1580    write->ir = inst->ir;
1581    write->annotation = inst->annotation;
1582    inst->insert_after(block, write);
1583
1584    inst->dst.file = temp.file;
1585    inst->dst.reg = temp.reg;
1586    inst->dst.reg_offset = temp.reg_offset;
1587    inst->dst.reladdr = NULL;
1588 }
1589
1590 /**
1591  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1592  * adds the scratch read(s) before \p inst. The function also checks for
1593  * recursive reladdr scratch accesses, issuing the corresponding scratch
1594  * loads and rewriting reladdr references accordingly.
1595  *
1596  * \return \p src if it did not require a scratch load, otherwise, the
1597  * register holding the result of the scratch load that the caller should
1598  * use to rewrite src.
1599  */
1600 src_reg
1601 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1602                                    vec4_instruction *inst, src_reg src)
1603 {
1604    /* Resolve recursive reladdr scratch access by calling ourselves
1605     * with src.reladdr
1606     */
1607    if (src.reladdr)
1608       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1609                                           *src.reladdr);
1610
1611    /* Now handle scratch access on src */
1612    if (src.file == GRF && scratch_loc[src.reg] != -1) {
1613       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1614       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1615       src.reg = temp.reg;
1616       src.reg_offset = temp.reg_offset;
1617       src.reladdr = NULL;
1618    }
1619
1620    return src;
1621 }
1622
1623 /**
1624  * We can't generally support array access in GRF space, because a
1625  * single instruction's destination can only span 2 contiguous
1626  * registers.  So, we send all GRF arrays that get variable index
1627  * access to scratch space.
1628  */
1629 void
1630 vec4_visitor::move_grf_array_access_to_scratch()
1631 {
1632    int scratch_loc[this->alloc.count];
1633    memset(scratch_loc, -1, sizeof(scratch_loc));
1634
1635    /* First, calculate the set of virtual GRFs that need to be punted
1636     * to scratch due to having any array access on them, and where in
1637     * scratch.
1638     */
1639    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1640       if (inst->dst.file == GRF && inst->dst.reladdr) {
1641          if (scratch_loc[inst->dst.reg] == -1) {
1642             scratch_loc[inst->dst.reg] = last_scratch;
1643             last_scratch += this->alloc.sizes[inst->dst.reg];
1644          }
1645
1646          for (src_reg *iter = inst->dst.reladdr;
1647               iter->reladdr;
1648               iter = iter->reladdr) {
1649             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1650                scratch_loc[iter->reg] = last_scratch;
1651                last_scratch += this->alloc.sizes[iter->reg];
1652             }
1653          }
1654       }
1655
1656       for (int i = 0 ; i < 3; i++) {
1657          for (src_reg *iter = &inst->src[i];
1658               iter->reladdr;
1659               iter = iter->reladdr) {
1660             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1661                scratch_loc[iter->reg] = last_scratch;
1662                last_scratch += this->alloc.sizes[iter->reg];
1663             }
1664          }
1665       }
1666    }
1667
1668    /* Now, for anything that will be accessed through scratch, rewrite
1669     * it to load/store.  Note that this is a _safe list walk, because
1670     * we may generate a new scratch_write instruction after the one
1671     * we're processing.
1672     */
1673    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1674       /* Set up the annotation tracking for new generated instructions. */
1675       base_ir = inst->ir;
1676       current_annotation = inst->annotation;
1677
1678       /* First handle scratch access on the dst. Notice we have to handle
1679        * the case where the dst's reladdr also points to scratch space.
1680        */
1681       if (inst->dst.reladdr)
1682          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1683                                                    *inst->dst.reladdr);
1684
1685       /* Now that we have handled any (possibly recursive) reladdr scratch
1686        * accesses for dst we can safely do the scratch write for dst itself
1687        */
1688       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1689          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1690
1691       /* Now handle scratch access on any src. In this case, since inst->src[i]
1692        * already is a src_reg, we can just call emit_resolve_reladdr with
1693        * inst->src[i] and it will take care of handling scratch loads for
1694        * both src and src.reladdr (recursively).
1695        */
1696       for (int i = 0 ; i < 3; i++) {
1697          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1698                                              inst->src[i]);
1699       }
1700    }
1701 }
1702
1703 /**
1704  * Emits an instruction before @inst to load the value named by @orig_src
1705  * from the pull constant buffer (surface) at @base_offset to @temp.
1706  */
1707 void
1708 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1709                                       dst_reg temp, src_reg orig_src,
1710                                       int base_offset)
1711 {
1712    int reg_offset = base_offset + orig_src.reg_offset;
1713    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1714    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1715                                              reg_offset);
1716
1717    emit_pull_constant_load_reg(temp,
1718                                index,
1719                                offset,
1720                                block, inst);
1721 }
1722
1723 /**
1724  * Implements array access of uniforms by inserting a
1725  * PULL_CONSTANT_LOAD instruction.
1726  *
1727  * Unlike temporary GRF array access (where we don't support it due to
1728  * the difficulty of doing relative addressing on instruction
1729  * destinations), we could potentially do array access of uniforms
1730  * that were loaded in GRF space as push constants.  In real-world
1731  * usage we've seen, though, the arrays being used are always larger
1732  * than we could load as push constants, so just always move all
1733  * uniform array access out to a pull constant buffer.
1734  */
1735 void
1736 vec4_visitor::move_uniform_array_access_to_pull_constants()
1737 {
1738    int pull_constant_loc[this->uniforms];
1739    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1740    bool nested_reladdr;
1741
1742    /* Walk through and find array access of uniforms.  Put a copy of that
1743     * uniform in the pull constant buffer.
1744     *
1745     * Note that we don't move constant-indexed accesses to arrays.  No
1746     * testing has been done of the performance impact of this choice.
1747     */
1748    do {
1749       nested_reladdr = false;
1750
1751       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1752          for (int i = 0 ; i < 3; i++) {
1753             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1754                continue;
1755
1756             int uniform = inst->src[i].reg;
1757
1758             if (inst->src[i].reladdr->reladdr)
1759                nested_reladdr = true;  /* will need another pass */
1760
1761             /* If this array isn't already present in the pull constant buffer,
1762              * add it.
1763              */
1764             if (pull_constant_loc[uniform] == -1) {
1765                const gl_constant_value **values =
1766                   &stage_prog_data->param[uniform * 4];
1767
1768                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1769
1770                assert(uniform < uniform_array_size);
1771                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1772                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1773                      = values[j];
1774                }
1775             }
1776
1777             /* Set up the annotation tracking for new generated instructions. */
1778             base_ir = inst->ir;
1779             current_annotation = inst->annotation;
1780
1781             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1782
1783             emit_pull_constant_load(block, inst, temp, inst->src[i],
1784                                     pull_constant_loc[uniform]);
1785
1786             inst->src[i].file = temp.file;
1787             inst->src[i].reg = temp.reg;
1788             inst->src[i].reg_offset = temp.reg_offset;
1789             inst->src[i].reladdr = NULL;
1790          }
1791       }
1792    } while (nested_reladdr);
1793
1794    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1795     * no need to track them as larger-than-vec4 objects.  This will be
1796     * relied on in cutting out unused uniform vectors from push
1797     * constants.
1798     */
1799    split_uniform_registers();
1800 }
1801
1802 void
1803 vec4_visitor::resolve_ud_negate(src_reg *reg)
1804 {
1805    if (reg->type != BRW_REGISTER_TYPE_UD ||
1806        !reg->negate)
1807       return;
1808
1809    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1810    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1811    *reg = temp;
1812 }
1813
1814 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1815                            void *log_data,
1816                            const struct brw_sampler_prog_key_data *key_tex,
1817                            struct brw_vue_prog_data *prog_data,
1818                            const nir_shader *shader,
1819                            void *mem_ctx,
1820                            bool no_spills,
1821                            int shader_time_index)
1822    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1823      key_tex(key_tex),
1824      prog_data(prog_data),
1825      fail_msg(NULL),
1826      first_non_payload_grf(0),
1827      need_all_constants_in_pull_buffer(false),
1828      no_spills(no_spills),
1829      shader_time_index(shader_time_index),
1830      last_scratch(0)
1831 {
1832    this->failed = false;
1833
1834    this->base_ir = NULL;
1835    this->current_annotation = NULL;
1836    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1837
1838    this->virtual_grf_start = NULL;
1839    this->virtual_grf_end = NULL;
1840    this->live_intervals = NULL;
1841
1842    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1843
1844    this->uniforms = 0;
1845
1846    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1847     * at least one. See setup_uniforms() in brw_vec4.cpp.
1848     */
1849    this->uniform_array_size = 1;
1850    if (prog_data) {
1851       this->uniform_array_size =
1852          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1853    }
1854
1855    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1856 }
1857
1858 vec4_visitor::~vec4_visitor()
1859 {
1860 }
1861
1862
1863 void
1864 vec4_visitor::fail(const char *format, ...)
1865 {
1866    va_list va;
1867    char *msg;
1868
1869    if (failed)
1870       return;
1871
1872    failed = true;
1873
1874    va_start(va, format);
1875    msg = ralloc_vasprintf(mem_ctx, format, va);
1876    va_end(va);
1877    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1878
1879    this->fail_msg = msg;
1880
1881    if (debug_enabled) {
1882       fprintf(stderr, "%s",  msg);
1883    }
1884 }
1885
1886 } /* namespace brw */