src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_eu.h"
  27 #include "brw_program.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186 ALU1(DIM)
 187
 188 /** Gen4 predicated IF. */
 189 vec4_instruction *
 190 vec4_visitor::IF(enum brw_predicate predicate)
 191 {
 192    vec4_instruction *inst;
 193
 194    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 195    inst->predicate = predicate;
 196
 197    return inst;
 198 }
 199
 200 /** Gen6 IF with embedded comparison. */
 201 vec4_instruction *
 202 vec4_visitor::IF(src_reg src0, src_reg src1,
 203                  enum brw_conditional_mod condition)
 204 {
 205    assert(devinfo->gen == 6);
 206
 207    vec4_instruction *inst;
 208
 209    resolve_ud_negate(&src0);
 210    resolve_ud_negate(&src1);
 211
 212    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 213                                         src0, src1);
 214    inst->conditional_mod = condition;
 215
 216    return inst;
 217 }
 218
 219 /**
 220  * CMP: Sets the low bit of the destination channels with the result
 221  * of the comparison, while the upper bits are undefined, and updates
 222  * the flag register with the packed 16 bits of the result.
 223  */
 224 vec4_instruction *
 225 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 226                   enum brw_conditional_mod condition)
 227 {
 228    vec4_instruction *inst;
 229
 230    /* Take the instruction:
 231     *
 232     * CMP null<d> src0<f> src1<f>
 233     *
 234     * Original gen4 does type conversion to the destination type before
 235     * comparison, producing garbage results for floating point comparisons.
 236     *
 237     * The destination type doesn't matter on newer generations, so we set the
 238     * type to match src0 so we can compact the instruction.
 239     */
 240    dst.type = src0.type;
 241
 242    resolve_ud_negate(&src0);
 243    resolve_ud_negate(&src1);
 244
 245    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 246    inst->conditional_mod = condition;
 247
 248    return inst;
 249 }
 250
 251 vec4_instruction *
 252 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 253 {
 254    vec4_instruction *inst;
 255
 256    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 257                                         dst, index);
 258    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 259    inst->mlen = 2;
 260
 261    return inst;
 262 }
 263
 264 vec4_instruction *
 265 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 266                             const src_reg &index)
 267 {
 268    vec4_instruction *inst;
 269
 270    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 271                                         dst, src, index);
 272    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 273    inst->mlen = 3;
 274
 275    return inst;
 276 }
 277
 278 src_reg
 279 vec4_visitor::fix_3src_operand(const src_reg &src)
 280 {
 281    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 282     * able to use vertical stride of zero to replicate the vec4 uniform, like
 283     *
 284     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 285     *
 286     * But you can't, since vertical stride is always four in three-source
 287     * instructions. Instead, insert a MOV instruction to do the replication so
 288     * that the three-source instruction can consume it.
 289     */
 290
 291    /* The MOV is only needed if the source is a uniform or immediate. */
 292    if (src.file != UNIFORM && src.file != IMM)
 293       return src;
 294
 295    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 296       return src;
 297
 298    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 299    expanded.type = src.type;
 300    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 301    return src_reg(expanded);
 302 }
 303
 304 src_reg
 305 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 306 {
 307    if (!src.abs && !src.negate)
 308       return src;
 309
 310    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 311    resolved.type = src.type;
 312    emit(MOV(resolved, src));
 313
 314    return src_reg(resolved);
 315 }
 316
 317 src_reg
 318 vec4_visitor::fix_math_operand(const src_reg &src)
 319 {
 320    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 321       return src;
 322
 323    /* The gen6 math instruction ignores the source modifiers --
 324     * swizzle, abs, negate, and at least some parts of the register
 325     * region description.
 326     *
 327     * Rather than trying to enumerate all these cases, *always* expand the
 328     * operand to a temp GRF for gen6.
 329     *
 330     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 331     * can't use.
 332     */
 333
 334    if (devinfo->gen == 7 && src.file != IMM)
 335       return src;
 336
 337    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 338    expanded.type = src.type;
 339    emit(MOV(expanded, src));
 340    return src_reg(expanded);
 341 }
 342
 343 vec4_instruction *
 344 vec4_visitor::emit_math(enum opcode opcode,
 345                         const dst_reg &dst,
 346                         const src_reg &src0, const src_reg &src1)
 347 {
 348    vec4_instruction *math =
 349       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 350
 351    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 352       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 353       math->dst = dst_reg(this, glsl_type::vec4_type);
 354       math->dst.type = dst.type;
 355       math = emit(MOV(dst, src_reg(math->dst)));
 356    } else if (devinfo->gen < 6) {
 357       math->base_mrf = 1;
 358       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 359    }
 360
 361    return math;
 362 }
 363
 364 void
 365 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 366 {
 367    if (devinfo->gen < 7) {
 368       unreachable("ir_unop_pack_half_2x16 should be lowered");
 369    }
 370
 371    assert(dst.type == BRW_REGISTER_TYPE_UD);
 372    assert(src0.type == BRW_REGISTER_TYPE_F);
 373
 374    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 375     *
 376     *   Because this instruction does not have a 16-bit floating-point type,
 377     *   the destination data type must be Word (W).
 378     *
 379     *   The destination must be DWord-aligned and specify a horizontal stride
 380     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 381     *   each destination channel and the upper word is not modified.
 382     *
 383     * The above restriction implies that the f32to16 instruction must use
 384     * align1 mode, because only in align1 mode is it possible to specify
 385     * horizontal stride.  We choose here to defy the hardware docs and emit
 386     * align16 instructions.
 387     *
 388     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 389     * instructions. I was partially successful in that the code passed all
 390     * tests.  However, the code was dubiously correct and fragile, and the
 391     * tests were not harsh enough to probe that frailty. Not trusting the
 392     * code, I chose instead to remain in align16 mode in defiance of the hw
 393     * docs).
 394     *
 395     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 396     * simulator, emitting a f32to16 in align16 mode with UD as destination
 397     * data type is safe. The behavior differs from that specified in the PRM
 398     * in that the upper word of each destination channel is cleared to 0.
 399     */
 400
 401    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 402    src_reg tmp_src(tmp_dst);
 403
 404 #if 0
 405    /* Verify the undocumented behavior on which the following instructions
 406     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 407     * then the result of the bit-or instruction below will be incorrect.
 408     *
 409     * You should inspect the disasm output in order to verify that the MOV is
 410     * not optimized away.
 411     */
 412    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 413 #endif
 414
 415    /* Give tmp the form below, where "." means untouched.
 416     *
 417     *     w z          y          x w z          y          x
 418     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 419     *
 420     * That the upper word of each write-channel be 0 is required for the
 421     * following bit-shift and bit-or instructions to work. Note that this
 422     * relies on the undocumented hardware behavior mentioned above.
 423     */
 424    tmp_dst.writemask = WRITEMASK_XY;
 425    emit(F32TO16(tmp_dst, src0));
 426
 427    /* Give the write-channels of dst the form:
 428     *   0xhhhh0000
 429     */
 430    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 431    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 432
 433    /* Finally, give the write-channels of dst the form of packHalf2x16's
 434     * output:
 435     *   0xhhhhllll
 436     */
 437    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 438    emit(OR(dst, src_reg(dst), tmp_src));
 439 }
 440
 441 void
 442 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 443 {
 444    if (devinfo->gen < 7) {
 445       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 446    }
 447
 448    assert(dst.type == BRW_REGISTER_TYPE_F);
 449    assert(src0.type == BRW_REGISTER_TYPE_UD);
 450
 451    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 452     *
 453     *   Because this instruction does not have a 16-bit floating-point type,
 454     *   the source data type must be Word (W). The destination type must be
 455     *   F (Float).
 456     *
 457     * To use W as the source data type, we must adjust horizontal strides,
 458     * which is only possible in align1 mode. All my [chadv] attempts at
 459     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 460     * Piglit tests, so I gave up.
 461     *
 462     * I've verified that, on gen7 hardware and the simulator, it is safe to
 463     * emit f16to32 in align16 mode with UD as source data type.
 464     */
 465
 466    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 467    src_reg tmp_src(tmp_dst);
 468
 469    tmp_dst.writemask = WRITEMASK_X;
 470    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 471
 472    tmp_dst.writemask = WRITEMASK_Y;
 473    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 474
 475    dst.writemask = WRITEMASK_XY;
 476    emit(F16TO32(dst, tmp_src));
 477 }
 478
 479 void
 480 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 481 {
 482    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 483     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 484     * is not suitable to generate the shift values, but we can use the packed
 485     * vector float and a type-converting MOV.
 486     */
 487    dst_reg shift(this, glsl_type::uvec4_type);
 488    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 489
 490    dst_reg shifted(this, glsl_type::uvec4_type);
 491    src0.swizzle = BRW_SWIZZLE_XXXX;
 492    emit(SHR(shifted, src0, src_reg(shift)));
 493
 494    shifted.type = BRW_REGISTER_TYPE_UB;
 495    dst_reg f(this, glsl_type::vec4_type);
 496    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 497
 498    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 499 }
 500
 501 void
 502 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 503 {
 504    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 505     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 506     * is not suitable to generate the shift values, but we can use the packed
 507     * vector float and a type-converting MOV.
 508     */
 509    dst_reg shift(this, glsl_type::uvec4_type);
 510    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 511
 512    dst_reg shifted(this, glsl_type::uvec4_type);
 513    src0.swizzle = BRW_SWIZZLE_XXXX;
 514    emit(SHR(shifted, src0, src_reg(shift)));
 515
 516    shifted.type = BRW_REGISTER_TYPE_B;
 517    dst_reg f(this, glsl_type::vec4_type);
 518    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 519
 520    dst_reg scaled(this, glsl_type::vec4_type);
 521    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 522
 523    dst_reg max(this, glsl_type::vec4_type);
 524    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 525    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 526 }
 527
 528 void
 529 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 530 {
 531    dst_reg saturated(this, glsl_type::vec4_type);
 532    vec4_instruction *inst = emit(MOV(saturated, src0));
 533    inst->saturate = true;
 534
 535    dst_reg scaled(this, glsl_type::vec4_type);
 536    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 537
 538    dst_reg rounded(this, glsl_type::vec4_type);
 539    emit(RNDE(rounded, src_reg(scaled)));
 540
 541    dst_reg u(this, glsl_type::uvec4_type);
 542    emit(MOV(u, src_reg(rounded)));
 543
 544    src_reg bytes(u);
 545    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 546 }
 547
 548 void
 549 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 550 {
 551    dst_reg max(this, glsl_type::vec4_type);
 552    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 553
 554    dst_reg min(this, glsl_type::vec4_type);
 555    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 556
 557    dst_reg scaled(this, glsl_type::vec4_type);
 558    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 559
 560    dst_reg rounded(this, glsl_type::vec4_type);
 561    emit(RNDE(rounded, src_reg(scaled)));
 562
 563    dst_reg i(this, glsl_type::ivec4_type);
 564    emit(MOV(i, src_reg(rounded)));
 565
 566    src_reg bytes(i);
 567    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 568 }
 569
 570 /*
 571  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 572  * false) elements needed to pack a type.
 573  */
 574 static int
 575 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
 576 {
 577    unsigned int i;
 578    int size;
 579
 580    switch (type->base_type) {
 581    case GLSL_TYPE_UINT:
 582    case GLSL_TYPE_INT:
 583    case GLSL_TYPE_FLOAT:
 584    case GLSL_TYPE_BOOL:
 585    case GLSL_TYPE_DOUBLE:
 586       if (type->is_matrix()) {
 587          const glsl_type *col_type = type->column_type();
 588          unsigned col_slots =
 589             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
 590          return type->matrix_columns * col_slots;
 591       } else {
 592          /* Regardless of size of vector, it gets a vec4. This is bad
 593           * packing for things like floats, but otherwise arrays become a
 594           * mess.  Hopefully a later pass over the code can pack scalars
 595           * down if appropriate.
 596           */
 597          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
 598       }
 599    case GLSL_TYPE_ARRAY:
 600       assert(type->length > 0);
 601       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
 602    case GLSL_TYPE_STRUCT:
 603       size = 0;
 604       for (i = 0; i < type->length; i++) {
 605          size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
 606       }
 607       return size;
 608    case GLSL_TYPE_SUBROUTINE:
 609       return 1;
 610
 611    case GLSL_TYPE_SAMPLER:
 612       /* Samplers take up no register space, since they're baked in at
 613        * link time.
 614        */
 615       return 0;
 616    case GLSL_TYPE_ATOMIC_UINT:
 617       return 0;
 618    case GLSL_TYPE_IMAGE:
 619       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 620    case GLSL_TYPE_VOID:
 621    case GLSL_TYPE_ERROR:
 622    case GLSL_TYPE_INTERFACE:
 623    case GLSL_TYPE_FUNCTION:
 624       unreachable("not reached");
 625    }
 626
 627    return 0;
 628 }
 629
 630 /**
 631  * Returns the minimum number of vec4 elements needed to pack a type.
 632  *
 633  * For simple types, it will return 1 (a single vec4); for matrices, the
 634  * number of columns; for array and struct, the sum of the vec4_size of
 635  * each of its elements; and for sampler and atomic, zero.
 636  *
 637  * This method is useful to calculate how much register space is needed to
 638  * store a particular type.
 639  */
 640 extern "C" int
 641 type_size_vec4(const struct glsl_type *type)
 642 {
 643    return type_size_xvec4(type, true);
 644 }
 645
 646 /**
 647  * Returns the minimum number of dvec4 elements needed to pack a type.
 648  *
 649  * For simple types, it will return 1 (a single dvec4); for matrices, the
 650  * number of columns; for array and struct, the sum of the dvec4_size of
 651  * each of its elements; and for sampler and atomic, zero.
 652  *
 653  * This method is useful to calculate how much register space is needed to
 654  * store a particular type.
 655  *
 656  * Measuring double-precision vertex inputs as dvec4 is required because
 657  * ARB_vertex_attrib_64bit states that these uses the same number of locations
 658  * than the single-precision version. That is, two consecutives dvec4 would be
 659  * located in location "x" and location "x+1", not "x+2".
 660  *
 661  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 662  * remap_vs_attrs() will take in account both the location and also if the
 663  * type fits in one or two vec4 slots.
 664  */
 665 extern "C" int
 666 type_size_dvec4(const struct glsl_type *type)
 667 {
 668    return type_size_xvec4(type, false);
 669 }
 670
 671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 672 {
 673    init();
 674
 675    this->file = VGRF;
 676    this->nr = v->alloc.allocate(type_size_vec4(type));
 677
 678    if (type->is_array() || type->is_record()) {
 679       this->swizzle = BRW_SWIZZLE_NOOP;
 680    } else {
 681       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 682    }
 683
 684    this->type = brw_type_for_base_type(type);
 685 }
 686
 687 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 688 {
 689    assert(size > 0);
 690
 691    init();
 692
 693    this->file = VGRF;
 694    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
 695
 696    this->swizzle = BRW_SWIZZLE_NOOP;
 697
 698    this->type = brw_type_for_base_type(type);
 699 }
 700
 701 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 702 {
 703    init();
 704
 705    this->file = VGRF;
 706    this->nr = v->alloc.allocate(type_size_vec4(type));
 707
 708    if (type->is_array() || type->is_record()) {
 709       this->writemask = WRITEMASK_XYZW;
 710    } else {
 711       this->writemask = (1 << type->vector_elements) - 1;
 712    }
 713
 714    this->type = brw_type_for_base_type(type);
 715 }
 716
 717 vec4_instruction *
 718 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 719                           src_reg src0, src_reg src1)
 720 {
 721    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 722    inst->conditional_mod = conditionalmod;
 723    return inst;
 724 }
 725
 726 vec4_instruction *
 727 vec4_visitor::emit_lrp(const dst_reg &dst,
 728                        const src_reg &x, const src_reg &y, const src_reg &a)
 729 {
 730    if (devinfo->gen >= 6) {
 731       /* Note that the instruction's argument order is reversed from GLSL
 732        * and the IR.
 733        */
 734      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 735                      fix_3src_operand(x)));
 736    } else {
 737       /* Earlier generations don't support three source operations, so we
 738        * need to emit x*(1-a) + y*a.
 739        */
 740       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 741       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 742       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 743       y_times_a.writemask           = dst.writemask;
 744       one_minus_a.writemask         = dst.writemask;
 745       x_times_one_minus_a.writemask = dst.writemask;
 746
 747       emit(MUL(y_times_a, y, a));
 748       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
 749       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 750       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 751    }
 752 }
 753
 754 /**
 755  * Emits the instructions needed to perform a pull constant load. before_block
 756  * and before_inst can be NULL in which case the instruction will be appended
 757  * to the end of the instruction list.
 758  */
 759 void
 760 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 761                                           src_reg surf_index,
 762                                           src_reg offset_reg,
 763                                           bblock_t *before_block,
 764                                           vec4_instruction *before_inst)
 765 {
 766    assert((before_inst == NULL && before_block == NULL) ||
 767           (before_inst && before_block));
 768
 769    vec4_instruction *pull;
 770
 771    if (devinfo->gen >= 9) {
 772       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 773       src_reg header(this, glsl_type::uvec4_type, 2);
 774
 775       pull = new(mem_ctx)
 776          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 777                           dst_reg(header));
 778
 779       if (before_inst)
 780          emit_before(before_block, before_inst, pull);
 781       else
 782          emit(pull);
 783
 784       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 785                                  offset_reg.type);
 786       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 787
 788       if (before_inst)
 789          emit_before(before_block, before_inst, pull);
 790       else
 791          emit(pull);
 792
 793       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 794                                            dst,
 795                                            surf_index,
 796                                            header);
 797       pull->mlen = 2;
 798       pull->header_size = 1;
 799    } else if (devinfo->gen >= 7) {
 800       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
 801
 802       grf_offset.type = offset_reg.type;
 803
 804       pull = MOV(grf_offset, offset_reg);
 805
 806       if (before_inst)
 807          emit_before(before_block, before_inst, pull);
 808       else
 809          emit(pull);
 810
 811       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 812                                            dst,
 813                                            surf_index,
 814                                            src_reg(grf_offset));
 815       pull->mlen = 1;
 816    } else {
 817       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 818                                            dst,
 819                                            surf_index,
 820                                            offset_reg);
 821       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 822       pull->mlen = 1;
 823    }
 824
 825    if (before_inst)
 826       emit_before(before_block, before_inst, pull);
 827    else
 828       emit(pull);
 829 }
 830
 831 src_reg
 832 vec4_visitor::emit_uniformize(const src_reg &src)
 833 {
 834    const src_reg chan_index(this, glsl_type::uint_type);
 835    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 836                               src.type);
 837
 838    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 839       ->force_writemask_all = true;
 840    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 841       ->force_writemask_all = true;
 842
 843    return src_reg(dst);
 844 }
 845
 846 src_reg
 847 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 848                              src_reg coordinate, src_reg surface)
 849 {
 850    vec4_instruction *inst =
 851       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 852                                     dst_reg(this, glsl_type::uvec4_type));
 853    inst->base_mrf = 2;
 854    inst->src[1] = surface;
 855    inst->src[2] = surface;
 856
 857    int param_base;
 858
 859    if (devinfo->gen >= 9) {
 860       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 861       vec4_instruction *header_inst = new(mem_ctx)
 862          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 863                           dst_reg(MRF, inst->base_mrf));
 864
 865       emit(header_inst);
 866
 867       inst->mlen = 2;
 868       inst->header_size = 1;
 869       param_base = inst->base_mrf + 1;
 870    } else {
 871       inst->mlen = 1;
 872       param_base = inst->base_mrf;
 873    }
 874
 875    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 876    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 877    int zero_mask = 0xf & ~coord_mask;
 878
 879    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 880             coordinate));
 881
 882    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 883             brw_imm_d(0)));
 884
 885    emit(inst);
 886    return src_reg(inst->dst);
 887 }
 888
 889 bool
 890 vec4_visitor::is_high_sampler(src_reg sampler)
 891 {
 892    if (devinfo->gen < 8 && !devinfo->is_haswell)
 893       return false;
 894
 895    return sampler.file != IMM || sampler.ud >= 16;
 896 }
 897
 898 void
 899 vec4_visitor::emit_texture(ir_texture_opcode op,
 900                            dst_reg dest,
 901                            const glsl_type *dest_type,
 902                            src_reg coordinate,
 903                            int coord_components,
 904                            src_reg shadow_comparitor,
 905                            src_reg lod, src_reg lod2,
 906                            src_reg sample_index,
 907                            uint32_t constant_offset,
 908                            src_reg offset_value,
 909                            src_reg mcs,
 910                            uint32_t surface,
 911                            src_reg surface_reg,
 912                            uint32_t sampler,
 913                            src_reg sampler_reg)
 914 {
 915    /* The sampler can only meaningfully compute LOD for fragment shader
 916     * messages. For all other stages, we change the opcode to TXL and hardcode
 917     * the LOD to 0.
 918     *
 919     * textureQueryLevels() is implemented in terms of TXS so we need to pass a
 920     * valid LOD argument.
 921     */
 922    if (op == ir_tex || op == ir_query_levels) {
 923       assert(lod.file == BAD_FILE);
 924       lod = brw_imm_f(0.0f);
 925    }
 926
 927    enum opcode opcode;
 928    switch (op) {
 929    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 930    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 931    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 932    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 933    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
 934                              SHADER_OPCODE_TXF_CMS); break;
 935    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 936    case ir_tg4: opcode = offset_value.file != BAD_FILE
 937                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 938    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 939    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 940    case ir_txb:
 941       unreachable("TXB is not valid for vertex shaders.");
 942    case ir_lod:
 943       unreachable("LOD is not valid for vertex shaders.");
 944    case ir_samples_identical: {
 945       /* There are some challenges implementing this for vec4, and it seems
 946        * unlikely to be used anyway.  For now, just return false ways.
 947        */
 948       emit(MOV(dest, brw_imm_ud(0u)));
 949       return;
 950    }
 951    default:
 952       unreachable("Unrecognized tex op");
 953    }
 954
 955    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
 956
 957    inst->offset = constant_offset;
 958
 959    /* The message header is necessary for:
 960     * - Gen4 (always)
 961     * - Gen9+ for selecting SIMD4x2
 962     * - Texel offsets
 963     * - Gather channel selection
 964     * - Sampler indices too large to fit in a 4-bit value.
 965     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 966     */
 967    inst->header_size =
 968       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 969        inst->offset != 0 || op == ir_tg4 ||
 970        op == ir_texture_samples ||
 971        is_high_sampler(sampler_reg)) ? 1 : 0;
 972    inst->base_mrf = 2;
 973    inst->mlen = inst->header_size;
 974    inst->dst.writemask = WRITEMASK_XYZW;
 975    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 976
 977    inst->src[1] = surface_reg;
 978    inst->src[2] = sampler_reg;
 979
 980    /* MRF for the first parameter */
 981    int param_base = inst->base_mrf + inst->header_size;
 982
 983    if (op == ir_txs || op == ir_query_levels) {
 984       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 985       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 986       inst->mlen++;
 987    } else if (op == ir_texture_samples) {
 988       inst->dst.writemask = WRITEMASK_X;
 989    } else {
 990       /* Load the coordinate */
 991       /* FINISHME: gl_clamp_mask and saturate */
 992       int coord_mask = (1 << coord_components) - 1;
 993       int zero_mask = 0xf & ~coord_mask;
 994
 995       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 996                coordinate));
 997       inst->mlen++;
 998
 999       if (zero_mask != 0) {
1000          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001                   brw_imm_d(0)));
1002       }
1003       /* Load the shadow comparitor */
1004       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
1006                           WRITEMASK_X),
1007                   shadow_comparitor));
1008          inst->mlen++;
1009       }
1010
1011       /* Load the LOD info */
1012       if (op == ir_tex || op == ir_txl) {
1013          int mrf, writemask;
1014          if (devinfo->gen >= 5) {
1015             mrf = param_base + 1;
1016             if (shadow_comparitor.file != BAD_FILE) {
1017                writemask = WRITEMASK_Y;
1018                /* mlen already incremented */
1019             } else {
1020                writemask = WRITEMASK_X;
1021                inst->mlen++;
1022             }
1023          } else /* devinfo->gen == 4 */ {
1024             mrf = param_base;
1025             writemask = WRITEMASK_W;
1026          }
1027          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028       } else if (op == ir_txf) {
1029          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030       } else if (op == ir_txf_ms) {
1031          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032                   sample_index));
1033          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034             /* MCS data is stored in the first two channels of ‘mcs’, but we
1035              * need to get it into the .y and .z channels of the second vec4
1036              * of params.
1037              */
1038             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039             emit(MOV(dst_reg(MRF, param_base + 1,
1040                              glsl_type::uint_type, WRITEMASK_YZ),
1041                      mcs));
1042          } else if (devinfo->gen >= 7) {
1043             /* MCS data is in the first channel of `mcs`, but we need to get it into
1044              * the .y channel of the second vec4 of params, so replicate .x across
1045              * the whole vec4 and then mask off everything except .y
1046              */
1047             mcs.swizzle = BRW_SWIZZLE_XXXX;
1048             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049                      mcs));
1050          }
1051          inst->mlen++;
1052       } else if (op == ir_txd) {
1053          const brw_reg_type type = lod.type;
1054
1055          if (devinfo->gen >= 5) {
1056             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060             inst->mlen++;
1061
1062             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1063                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067                inst->mlen++;
1068
1069                if (shadow_comparitor.file != BAD_FILE) {
1070                   emit(MOV(dst_reg(MRF, param_base + 2,
1071                                    shadow_comparitor.type, WRITEMASK_Z),
1072                            shadow_comparitor));
1073                }
1074             }
1075          } else /* devinfo->gen == 4 */ {
1076             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078             inst->mlen += 2;
1079          }
1080       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081          if (shadow_comparitor.file != BAD_FILE) {
1082             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1083                      shadow_comparitor));
1084          }
1085
1086          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087                   offset_value));
1088          inst->mlen++;
1089       }
1090    }
1091
1092    emit(inst);
1093
1094    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095     * spec requires layers.
1096     */
1097    if (op == ir_txs && devinfo->gen < 7) {
1098       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1099       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1100                   src_reg(inst->dst), brw_imm_d(1));
1101    }
1102
1103    if (devinfo->gen == 6 && op == ir_tg4) {
1104       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1105    }
1106
1107    if (op == ir_query_levels) {
1108       /* # levels is in .w */
1109       src_reg swizzled(dest);
1110       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1111                                       SWIZZLE_W, SWIZZLE_W);
1112       emit(MOV(dest, swizzled));
1113    }
1114 }
1115
1116 /**
1117  * Apply workarounds for Gen6 gather with UINT/SINT
1118  */
1119 void
1120 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1121 {
1122    if (!wa)
1123       return;
1124
1125    int width = (wa & WA_8BIT) ? 8 : 16;
1126    dst_reg dst_f = dst;
1127    dst_f.type = BRW_REGISTER_TYPE_F;
1128
1129    /* Convert from UNORM to UINT */
1130    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1131    emit(MOV(dst, src_reg(dst_f)));
1132
1133    if (wa & WA_SIGN) {
1134       /* Reinterpret the UINT value as a signed INT value by
1135        * shifting the sign bit into place, then shifting back
1136        * preserving sign.
1137        */
1138       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1139       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1140    }
1141 }
1142
1143 void
1144 vec4_visitor::gs_emit_vertex(int stream_id)
1145 {
1146    unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::gs_end_primitive()
1151 {
1152    unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::emit_ndc_computation()
1157 {
1158    if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1159       return;
1160
1161    /* Get the position */
1162    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1163
1164    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1165    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1166    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1167
1168    current_annotation = "NDC";
1169    dst_reg ndc_w = ndc;
1170    ndc_w.writemask = WRITEMASK_W;
1171    src_reg pos_w = pos;
1172    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1173    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1174
1175    dst_reg ndc_xyz = ndc;
1176    ndc_xyz.writemask = WRITEMASK_XYZ;
1177
1178    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1179 }
1180
1181 void
1182 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1183 {
1184    if (devinfo->gen < 6 &&
1185        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1186         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1187         devinfo->has_negative_rhw_bug)) {
1188       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1189       dst_reg header1_w = header1;
1190       header1_w.writemask = WRITEMASK_W;
1191
1192       emit(MOV(header1, brw_imm_ud(0u)));
1193
1194       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1195          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1196
1197          current_annotation = "Point size";
1198          emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1199          emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1200       }
1201
1202       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1203          current_annotation = "Clipping flags";
1204          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1205          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1206
1207          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1209          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1210
1211          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1212          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1213          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1214          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1215       }
1216
1217       /* i965 clipping workaround:
1218        * 1) Test for -ve rhw
1219        * 2) If set,
1220        *      set ndc = (0,0,0,0)
1221        *      set ucp[6] = 1
1222        *
1223        * Later, clipping will detect ucp[6] and ensure the primitive is
1224        * clipped against all fixed planes.
1225        */
1226       if (devinfo->has_negative_rhw_bug &&
1227           output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1228          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1229          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1230          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1231          vec4_instruction *inst;
1232          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1233          inst->predicate = BRW_PREDICATE_NORMAL;
1234          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1235          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1236          inst->predicate = BRW_PREDICATE_NORMAL;
1237       }
1238
1239       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1240    } else if (devinfo->gen < 6) {
1241       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1242    } else {
1243       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1244       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1245          dst_reg reg_w = reg;
1246          reg_w.writemask = WRITEMASK_W;
1247          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1248          reg_as_src.type = reg_w.type;
1249          reg_as_src.swizzle = brw_swizzle_for_size(1);
1250          emit(MOV(reg_w, reg_as_src));
1251       }
1252       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1253          dst_reg reg_y = reg;
1254          reg_y.writemask = WRITEMASK_Y;
1255          reg_y.type = BRW_REGISTER_TYPE_D;
1256          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1257          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1258       }
1259       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1260          dst_reg reg_z = reg;
1261          reg_z.writemask = WRITEMASK_Z;
1262          reg_z.type = BRW_REGISTER_TYPE_D;
1263          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1264          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1265       }
1266    }
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1271 {
1272    assert(varying < VARYING_SLOT_MAX);
1273    assert(output_reg[varying].type == reg.type);
1274    current_annotation = output_reg_annotation[varying];
1275    if (output_reg[varying].file != BAD_FILE) {
1276       return emit(MOV(reg, src_reg(output_reg[varying])));
1277    } else
1278       return NULL;
1279 }
1280
1281 void
1282 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1283 {
1284    assert(varying < VARYING_SLOT_MAX);
1285    assert(varying >= VARYING_SLOT_VAR0);
1286    varying = varying - VARYING_SLOT_VAR0;
1287
1288    unsigned num_comps = output_generic_num_components[varying][component];
1289    if (num_comps == 0)
1290       return;
1291
1292    assert(output_generic_reg[varying][component].type == reg.type);
1293    current_annotation = output_reg_annotation[varying];
1294    if (output_generic_reg[varying][component].file != BAD_FILE) {
1295       src_reg src = src_reg(output_generic_reg[varying][component]);
1296       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1297       reg.writemask =
1298          brw_writemask_for_component_packing(num_comps, component);
1299       emit(MOV(reg, src));
1300    }
1301 }
1302
1303 void
1304 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1305 {
1306    reg.type = BRW_REGISTER_TYPE_F;
1307    output_reg[varying].type = reg.type;
1308
1309    switch (varying) {
1310    case VARYING_SLOT_PSIZ:
1311    {
1312       /* PSIZ is always in slot 0, and is coupled with other flags. */
1313       current_annotation = "indices, point width, clip flags";
1314       emit_psiz_and_flags(reg);
1315       break;
1316    }
1317    case BRW_VARYING_SLOT_NDC:
1318       current_annotation = "NDC";
1319       if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1320          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1321       break;
1322    case VARYING_SLOT_POS:
1323       current_annotation = "gl_Position";
1324       if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1325          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1326       break;
1327    case VARYING_SLOT_EDGE:
1328       /* This is present when doing unfilled polygons.  We're supposed to copy
1329        * the edge flag from the user-provided vertex array
1330        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1331        * of that attribute (starts as 1.0f).  This is then used in clipping to
1332        * determine which edges should be drawn as wireframe.
1333        */
1334       current_annotation = "edge flag";
1335       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1336                                     glsl_type::float_type, WRITEMASK_XYZW))));
1337       break;
1338    case BRW_VARYING_SLOT_PAD:
1339       /* No need to write to this slot */
1340       break;
1341    default:
1342       if (varying >= VARYING_SLOT_VAR0) {
1343          for (int i = 0; i < 4; i++) {
1344             emit_generic_urb_slot(reg, varying, i);
1345          }
1346       } else {
1347          emit_generic_urb_slot(reg, varying);
1348       }
1349       break;
1350    }
1351 }
1352
1353 static int
1354 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1355 {
1356    if (devinfo->gen >= 6) {
1357       /* URB data written (does not include the message header reg) must
1358        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1359        * section 5.4.3.2.2: URB_INTERLEAVED.
1360        *
1361        * URB entries are allocated on a multiple of 1024 bits, so an
1362        * extra 128 bits written here to make the end align to 256 is
1363        * no problem.
1364        */
1365       if ((mlen % 2) != 1)
1366          mlen++;
1367    }
1368
1369    return mlen;
1370 }
1371
1372
1373 /**
1374  * Generates the VUE payload plus the necessary URB write instructions to
1375  * output it.
1376  *
1377  * The VUE layout is documented in Volume 2a.
1378  */
1379 void
1380 vec4_visitor::emit_vertex()
1381 {
1382    /* MRF 0 is reserved for the debugger, so start with message header
1383     * in MRF 1.
1384     */
1385    int base_mrf = 1;
1386    int mrf = base_mrf;
1387    /* In the process of generating our URB write message contents, we
1388     * may need to unspill a register or load from an array.  Those
1389     * reads would use MRFs 14-15.
1390     */
1391    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1392
1393    /* The following assertion verifies that max_usable_mrf causes an
1394     * even-numbered amount of URB write data, which will meet gen6's
1395     * requirements for length alignment.
1396     */
1397    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1398
1399    /* First mrf is the g0-based message header containing URB handles and
1400     * such.
1401     */
1402    emit_urb_write_header(mrf++);
1403
1404    if (devinfo->gen < 6) {
1405       emit_ndc_computation();
1406    }
1407
1408    /* We may need to split this up into several URB writes, so do them in a
1409     * loop.
1410     */
1411    int slot = 0;
1412    bool complete = false;
1413    do {
1414       /* URB offset is in URB row increments, and each of our MRFs is half of
1415        * one of those, since we're doing interleaved writes.
1416        */
1417       int offset = slot / 2;
1418
1419       mrf = base_mrf + 1;
1420       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1421          emit_urb_slot(dst_reg(MRF, mrf++),
1422                        prog_data->vue_map.slot_to_varying[slot]);
1423
1424          /* If this was max_usable_mrf, we can't fit anything more into this
1425           * URB WRITE. Same thing if we reached the maximum length available.
1426           */
1427          if (mrf > max_usable_mrf ||
1428              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1429             slot++;
1430             break;
1431          }
1432       }
1433
1434       complete = slot >= prog_data->vue_map.num_slots;
1435       current_annotation = "URB write";
1436       vec4_instruction *inst = emit_urb_write_opcode(complete);
1437       inst->base_mrf = base_mrf;
1438       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1439       inst->offset += offset;
1440    } while(!complete);
1441 }
1442
1443
1444 src_reg
1445 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1446                                  src_reg *reladdr, int reg_offset)
1447 {
1448    /* Because we store the values to scratch interleaved like our
1449     * vertex data, we need to scale the vec4 index by 2.
1450     */
1451    int message_header_scale = 2;
1452
1453    /* Pre-gen6, the message header uses byte offsets instead of vec4
1454     * (16-byte) offset units.
1455     */
1456    if (devinfo->gen < 6)
1457       message_header_scale *= 16;
1458
1459    if (reladdr) {
1460       src_reg index = src_reg(this, glsl_type::int_type);
1461
1462       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1463                                    brw_imm_d(reg_offset)));
1464       emit_before(block, inst, MUL(dst_reg(index), index,
1465                                    brw_imm_d(message_header_scale)));
1466
1467       return index;
1468    } else {
1469       return brw_imm_d(reg_offset * message_header_scale);
1470    }
1471 }
1472
1473 /**
1474  * Emits an instruction before @inst to load the value named by @orig_src
1475  * from scratch space at @base_offset to @temp.
1476  *
1477  * @base_offset is measured in 32-byte units (the size of a register).
1478  */
1479 void
1480 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1481                                 dst_reg temp, src_reg orig_src,
1482                                 int base_offset)
1483 {
1484    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1485    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1486                                       reg_offset);
1487
1488    emit_before(block, inst, SCRATCH_READ(temp, index));
1489 }
1490
1491 /**
1492  * Emits an instruction after @inst to store the value to be written
1493  * to @orig_dst to scratch space at @base_offset, from @temp.
1494  *
1495  * @base_offset is measured in 32-byte units (the size of a register).
1496  */
1497 void
1498 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1499                                  int base_offset)
1500 {
1501    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1502    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1503                                       reg_offset);
1504
1505    /* Create a temporary register to store *inst's result in.
1506     *
1507     * We have to be careful in MOVing from our temporary result register in
1508     * the scratch write.  If we swizzle from channels of the temporary that
1509     * weren't initialized, it will confuse live interval analysis, which will
1510     * make spilling fail to make progress.
1511     */
1512    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1513                                        inst->dst.type),
1514                                 brw_swizzle_for_mask(inst->dst.writemask));
1515    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1516                                        inst->dst.writemask));
1517    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1518    if (inst->opcode != BRW_OPCODE_SEL)
1519       write->predicate = inst->predicate;
1520    write->ir = inst->ir;
1521    write->annotation = inst->annotation;
1522    inst->insert_after(block, write);
1523
1524    inst->dst.file = temp.file;
1525    inst->dst.nr = temp.nr;
1526    inst->dst.offset %= REG_SIZE;
1527    inst->dst.reladdr = NULL;
1528 }
1529
1530 /**
1531  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1532  * adds the scratch read(s) before \p inst. The function also checks for
1533  * recursive reladdr scratch accesses, issuing the corresponding scratch
1534  * loads and rewriting reladdr references accordingly.
1535  *
1536  * \return \p src if it did not require a scratch load, otherwise, the
1537  * register holding the result of the scratch load that the caller should
1538  * use to rewrite src.
1539  */
1540 src_reg
1541 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1542                                    vec4_instruction *inst, src_reg src)
1543 {
1544    /* Resolve recursive reladdr scratch access by calling ourselves
1545     * with src.reladdr
1546     */
1547    if (src.reladdr)
1548       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1549                                           *src.reladdr);
1550
1551    /* Now handle scratch access on src */
1552    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1553       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1554       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1555       src.nr = temp.nr;
1556       src.offset %= REG_SIZE;
1557       src.reladdr = NULL;
1558    }
1559
1560    return src;
1561 }
1562
1563 /**
1564  * We can't generally support array access in GRF space, because a
1565  * single instruction's destination can only span 2 contiguous
1566  * registers.  So, we send all GRF arrays that get variable index
1567  * access to scratch space.
1568  */
1569 void
1570 vec4_visitor::move_grf_array_access_to_scratch()
1571 {
1572    int scratch_loc[this->alloc.count];
1573    memset(scratch_loc, -1, sizeof(scratch_loc));
1574
1575    /* First, calculate the set of virtual GRFs that need to be punted
1576     * to scratch due to having any array access on them, and where in
1577     * scratch.
1578     */
1579    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1580       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1581          if (scratch_loc[inst->dst.nr] == -1) {
1582             scratch_loc[inst->dst.nr] = last_scratch;
1583             last_scratch += this->alloc.sizes[inst->dst.nr];
1584          }
1585
1586          for (src_reg *iter = inst->dst.reladdr;
1587               iter->reladdr;
1588               iter = iter->reladdr) {
1589             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1590                scratch_loc[iter->nr] = last_scratch;
1591                last_scratch += this->alloc.sizes[iter->nr];
1592             }
1593          }
1594       }
1595
1596       for (int i = 0 ; i < 3; i++) {
1597          for (src_reg *iter = &inst->src[i];
1598               iter->reladdr;
1599               iter = iter->reladdr) {
1600             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1601                scratch_loc[iter->nr] = last_scratch;
1602                last_scratch += this->alloc.sizes[iter->nr];
1603             }
1604          }
1605       }
1606    }
1607
1608    /* Now, for anything that will be accessed through scratch, rewrite
1609     * it to load/store.  Note that this is a _safe list walk, because
1610     * we may generate a new scratch_write instruction after the one
1611     * we're processing.
1612     */
1613    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1614       /* Set up the annotation tracking for new generated instructions. */
1615       base_ir = inst->ir;
1616       current_annotation = inst->annotation;
1617
1618       /* First handle scratch access on the dst. Notice we have to handle
1619        * the case where the dst's reladdr also points to scratch space.
1620        */
1621       if (inst->dst.reladdr)
1622          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1623                                                    *inst->dst.reladdr);
1624
1625       /* Now that we have handled any (possibly recursive) reladdr scratch
1626        * accesses for dst we can safely do the scratch write for dst itself
1627        */
1628       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1629          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1630
1631       /* Now handle scratch access on any src. In this case, since inst->src[i]
1632        * already is a src_reg, we can just call emit_resolve_reladdr with
1633        * inst->src[i] and it will take care of handling scratch loads for
1634        * both src and src.reladdr (recursively).
1635        */
1636       for (int i = 0 ; i < 3; i++) {
1637          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1638                                              inst->src[i]);
1639       }
1640    }
1641 }
1642
1643 /**
1644  * Emits an instruction before @inst to load the value named by @orig_src
1645  * from the pull constant buffer (surface) at @base_offset to @temp.
1646  */
1647 void
1648 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1649                                       dst_reg temp, src_reg orig_src,
1650                                       int base_offset, src_reg indirect)
1651 {
1652    int reg_offset = base_offset + orig_src.offset / 16;
1653    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1654
1655    src_reg offset;
1656    if (indirect.file != BAD_FILE) {
1657       offset = src_reg(this, glsl_type::uint_type);
1658
1659       emit_before(block, inst, ADD(dst_reg(offset), indirect,
1660                                    brw_imm_ud(reg_offset * 16)));
1661    } else if (devinfo->gen >= 8) {
1662       /* Store the offset in a GRF so we can send-from-GRF. */
1663       offset = src_reg(this, glsl_type::uint_type);
1664       emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1665    } else {
1666       offset = brw_imm_d(reg_offset * 16);
1667    }
1668
1669    emit_pull_constant_load_reg(temp,
1670                                brw_imm_ud(index),
1671                                offset,
1672                                block, inst);
1673
1674    brw_mark_surface_used(&prog_data->base, index);
1675 }
1676
1677 /**
1678  * Implements array access of uniforms by inserting a
1679  * PULL_CONSTANT_LOAD instruction.
1680  *
1681  * Unlike temporary GRF array access (where we don't support it due to
1682  * the difficulty of doing relative addressing on instruction
1683  * destinations), we could potentially do array access of uniforms
1684  * that were loaded in GRF space as push constants.  In real-world
1685  * usage we've seen, though, the arrays being used are always larger
1686  * than we could load as push constants, so just always move all
1687  * uniform array access out to a pull constant buffer.
1688  */
1689 void
1690 vec4_visitor::move_uniform_array_access_to_pull_constants()
1691 {
1692    /* The vulkan dirver doesn't support pull constants other than UBOs so
1693     * everything has to be pushed regardless.
1694     */
1695    if (stage_prog_data->pull_param == NULL) {
1696       split_uniform_registers();
1697       return;
1698    }
1699
1700    int pull_constant_loc[this->uniforms];
1701    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1702
1703    /* First, walk through the instructions and determine which things need to
1704     * be pulled.  We mark something as needing to be pulled by setting
1705     * pull_constant_loc to 0.
1706     */
1707    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1708       /* We only care about MOV_INDIRECT of a uniform */
1709       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1710           inst->src[0].file != UNIFORM)
1711          continue;
1712
1713       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1714
1715       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1716          pull_constant_loc[uniform_nr + j] = 0;
1717    }
1718
1719    /* Next, we walk the list of uniforms and assign real pull constant
1720     * locations and set their corresponding entries in pull_param.
1721     */
1722    for (int j = 0; j < this->uniforms; j++) {
1723       if (pull_constant_loc[j] < 0)
1724          continue;
1725
1726       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1727
1728       for (int i = 0; i < 4; i++) {
1729          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1730             = stage_prog_data->param[j * 4 + i];
1731       }
1732    }
1733
1734    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1735     * instructions to actual uniform pulls.
1736     */
1737    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1738       /* We only care about MOV_INDIRECT of a uniform */
1739       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1740           inst->src[0].file != UNIFORM)
1741          continue;
1742
1743       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1744
1745       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1746
1747       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1748                               pull_constant_loc[uniform_nr], inst->src[1]);
1749       inst->remove(block);
1750    }
1751
1752    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1753     * no need to track them as larger-than-vec4 objects.  This will be
1754     * relied on in cutting out unused uniform vectors from push
1755     * constants.
1756     */
1757    split_uniform_registers();
1758 }
1759
1760 void
1761 vec4_visitor::resolve_ud_negate(src_reg *reg)
1762 {
1763    if (reg->type != BRW_REGISTER_TYPE_UD ||
1764        !reg->negate)
1765       return;
1766
1767    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1768    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1769    *reg = temp;
1770 }
1771
1772 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1773                            void *log_data,
1774                            const struct brw_sampler_prog_key_data *key_tex,
1775                            struct brw_vue_prog_data *prog_data,
1776                            const nir_shader *shader,
1777                            void *mem_ctx,
1778                            bool no_spills,
1779                            int shader_time_index)
1780    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1781      key_tex(key_tex),
1782      prog_data(prog_data),
1783      fail_msg(NULL),
1784      first_non_payload_grf(0),
1785      need_all_constants_in_pull_buffer(false),
1786      no_spills(no_spills),
1787      shader_time_index(shader_time_index),
1788      last_scratch(0)
1789 {
1790    this->failed = false;
1791
1792    this->base_ir = NULL;
1793    this->current_annotation = NULL;
1794    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1795
1796    memset(this->output_generic_num_components, 0,
1797           sizeof(this->output_generic_num_components));
1798
1799    this->virtual_grf_start = NULL;
1800    this->virtual_grf_end = NULL;
1801    this->live_intervals = NULL;
1802
1803    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1804
1805    this->uniforms = 0;
1806 }
1807
1808 vec4_visitor::~vec4_visitor()
1809 {
1810 }
1811
1812
1813 void
1814 vec4_visitor::fail(const char *format, ...)
1815 {
1816    va_list va;
1817    char *msg;
1818
1819    if (failed)
1820       return;
1821
1822    failed = true;
1823
1824    va_start(va, format);
1825    msg = ralloc_vasprintf(mem_ctx, format, va);
1826    va_end(va);
1827    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1828
1829    this->fail_msg = msg;
1830
1831    if (debug_enabled) {
1832       fprintf(stderr, "%s",  msg);
1833    }
1834 }
1835
1836 } /* namespace brw */