[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->size_written = (dst.file == BAD_FILE ? 0 : REG_SIZE);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186 ALU1(DIM)
187
188 /** Gen4 predicated IF. */
189 vec4_instruction *
190 vec4_visitor::IF(enum brw_predicate predicate)
191 {
192 vec4_instruction *inst;
193
194 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 vec4_instruction *
202 vec4_visitor::IF(src_reg src0, src_reg src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(devinfo->gen == 6);
206
207 vec4_instruction *inst;
208
209 resolve_ud_negate(&src0);
210 resolve_ud_negate(&src1);
211
212 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
213 src0, src1);
214 inst->conditional_mod = condition;
215
216 return inst;
217 }
218
219 /**
220 * CMP: Sets the low bit of the destination channels with the result
221 * of the comparison, while the upper bits are undefined, and updates
222 * the flag register with the packed 16 bits of the result.
223 */
224 vec4_instruction *
225 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
226 enum brw_conditional_mod condition)
227 {
228 vec4_instruction *inst;
229
230 /* Take the instruction:
231 *
232 * CMP null<d> src0<f> src1<f>
233 *
234 * Original gen4 does type conversion to the destination type before
235 * comparison, producing garbage results for floating point comparisons.
236 *
237 * The destination type doesn't matter on newer generations, so we set the
238 * type to match src0 so we can compact the instruction.
239 */
240 dst.type = src0.type;
241
242 resolve_ud_negate(&src0);
243 resolve_ud_negate(&src1);
244
245 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
246 inst->conditional_mod = condition;
247
248 return inst;
249 }
250
251 vec4_instruction *
252 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
253 {
254 vec4_instruction *inst;
255
256 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
257 dst, index);
258 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
259 inst->mlen = 2;
260
261 return inst;
262 }
263
264 vec4_instruction *
265 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
266 const src_reg &index)
267 {
268 vec4_instruction *inst;
269
270 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
271 dst, src, index);
272 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
273 inst->mlen = 3;
274
275 return inst;
276 }
277
278 src_reg
279 vec4_visitor::fix_3src_operand(const src_reg &src)
280 {
281 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
282 * able to use vertical stride of zero to replicate the vec4 uniform, like
283 *
284 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
285 *
286 * But you can't, since vertical stride is always four in three-source
287 * instructions. Instead, insert a MOV instruction to do the replication so
288 * that the three-source instruction can consume it.
289 */
290
291 /* The MOV is only needed if the source is a uniform or immediate. */
292 if (src.file != UNIFORM && src.file != IMM)
293 return src;
294
295 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
296 return src;
297
298 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
299 expanded.type = src.type;
300 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
301 return src_reg(expanded);
302 }
303
304 src_reg
305 vec4_visitor::resolve_source_modifiers(const src_reg &src)
306 {
307 if (!src.abs && !src.negate)
308 return src;
309
310 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
311 resolved.type = src.type;
312 emit(MOV(resolved, src));
313
314 return src_reg(resolved);
315 }
316
317 src_reg
318 vec4_visitor::fix_math_operand(const src_reg &src)
319 {
320 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
321 return src;
322
323 /* The gen6 math instruction ignores the source modifiers --
324 * swizzle, abs, negate, and at least some parts of the register
325 * region description.
326 *
327 * Rather than trying to enumerate all these cases, *always* expand the
328 * operand to a temp GRF for gen6.
329 *
330 * For gen7, keep the operand as-is, except if immediate, which gen7 still
331 * can't use.
332 */
333
334 if (devinfo->gen == 7 && src.file != IMM)
335 return src;
336
337 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
338 expanded.type = src.type;
339 emit(MOV(expanded, src));
340 return src_reg(expanded);
341 }
342
343 vec4_instruction *
344 vec4_visitor::emit_math(enum opcode opcode,
345 const dst_reg &dst,
346 const src_reg &src0, const src_reg &src1)
347 {
348 vec4_instruction *math =
349 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
350
351 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
352 /* MATH on Gen6 must be align1, so we can't do writemasks. */
353 math->dst = dst_reg(this, glsl_type::vec4_type);
354 math->dst.type = dst.type;
355 math = emit(MOV(dst, src_reg(math->dst)));
356 } else if (devinfo->gen < 6) {
357 math->base_mrf = 1;
358 math->mlen = src1.file == BAD_FILE ? 1 : 2;
359 }
360
361 return math;
362 }
363
364 void
365 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
366 {
367 if (devinfo->gen < 7) {
368 unreachable("ir_unop_pack_half_2x16 should be lowered");
369 }
370
371 assert(dst.type == BRW_REGISTER_TYPE_UD);
372 assert(src0.type == BRW_REGISTER_TYPE_F);
373
374 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
375 *
376 * Because this instruction does not have a 16-bit floating-point type,
377 * the destination data type must be Word (W).
378 *
379 * The destination must be DWord-aligned and specify a horizontal stride
380 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
381 * each destination channel and the upper word is not modified.
382 *
383 * The above restriction implies that the f32to16 instruction must use
384 * align1 mode, because only in align1 mode is it possible to specify
385 * horizontal stride. We choose here to defy the hardware docs and emit
386 * align16 instructions.
387 *
388 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
389 * instructions. I was partially successful in that the code passed all
390 * tests. However, the code was dubiously correct and fragile, and the
391 * tests were not harsh enough to probe that frailty. Not trusting the
392 * code, I chose instead to remain in align16 mode in defiance of the hw
393 * docs).
394 *
395 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
396 * simulator, emitting a f32to16 in align16 mode with UD as destination
397 * data type is safe. The behavior differs from that specified in the PRM
398 * in that the upper word of each destination channel is cleared to 0.
399 */
400
401 dst_reg tmp_dst(this, glsl_type::uvec2_type);
402 src_reg tmp_src(tmp_dst);
403
404 #if 0
405 /* Verify the undocumented behavior on which the following instructions
406 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
407 * then the result of the bit-or instruction below will be incorrect.
408 *
409 * You should inspect the disasm output in order to verify that the MOV is
410 * not optimized away.
411 */
412 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
413 #endif
414
415 /* Give tmp the form below, where "." means untouched.
416 *
417 * w z y x w z y x
418 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
419 *
420 * That the upper word of each write-channel be 0 is required for the
421 * following bit-shift and bit-or instructions to work. Note that this
422 * relies on the undocumented hardware behavior mentioned above.
423 */
424 tmp_dst.writemask = WRITEMASK_XY;
425 emit(F32TO16(tmp_dst, src0));
426
427 /* Give the write-channels of dst the form:
428 * 0xhhhh0000
429 */
430 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
431 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
432
433 /* Finally, give the write-channels of dst the form of packHalf2x16's
434 * output:
435 * 0xhhhhllll
436 */
437 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
438 emit(OR(dst, src_reg(dst), tmp_src));
439 }
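/* Editorial sketch (not driver code): the scalar computation that the
 * F32TO16/SHL/OR sequence above performs, matching GLSL packHalf2x16()
 * with x in the low word and y in the high word.  float_to_half() is a
 * hypothetical helper returning the 16-bit half-float encoding.
 */
static uint32_t
pack_half_2x16_reference(float x, float y)
{
   const uint32_t lo = float_to_half(x);   /* tmp.x = 0x0000llll */
   const uint32_t hi = float_to_half(y);   /* tmp.y = 0x0000hhhh */
   return (hi << 16) | lo;                 /* dst   = 0xhhhhllll */
}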
440
441 void
442 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
443 {
444 if (devinfo->gen < 7) {
445 unreachable("ir_unop_unpack_half_2x16 should be lowered");
446 }
447
448 assert(dst.type == BRW_REGISTER_TYPE_F);
449 assert(src0.type == BRW_REGISTER_TYPE_UD);
450
451 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
452 *
453 * Because this instruction does not have a 16-bit floating-point type,
454 * the source data type must be Word (W). The destination type must be
455 * F (Float).
456 *
457 * To use W as the source data type, we must adjust horizontal strides,
458 * which is only possible in align1 mode. All my [chadv] attempts at
459 * emitting align1 instructions for unpackHalf2x16 failed to pass the
460 * Piglit tests, so I gave up.
461 *
462 * I've verified that, on gen7 hardware and the simulator, it is safe to
463 * emit f16to32 in align16 mode with UD as source data type.
464 */
465
466 dst_reg tmp_dst(this, glsl_type::uvec2_type);
467 src_reg tmp_src(tmp_dst);
468
469 tmp_dst.writemask = WRITEMASK_X;
470 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
471
472 tmp_dst.writemask = WRITEMASK_Y;
473 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
474
475 dst.writemask = WRITEMASK_XY;
476 emit(F16TO32(dst, tmp_src));
477 }
478
479 void
480 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
481 {
482 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
483 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
484 * is not suitable to generate the shift values, but we can use the packed
485 * vector float and a type-converting MOV.
486 */
487 dst_reg shift(this, glsl_type::uvec4_type);
488 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
489
490 dst_reg shifted(this, glsl_type::uvec4_type);
491 src0.swizzle = BRW_SWIZZLE_XXXX;
492 emit(SHR(shifted, src0, src_reg(shift)));
493
494 shifted.type = BRW_REGISTER_TYPE_UB;
495 dst_reg f(this, glsl_type::vec4_type);
496 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
497
498 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
499 }
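/* Editorial sketch (not driver code): why brw_imm_vf4(0x00, 0x60, 0x70, 0x78)
 * above becomes the shift counts <0, 8, 16, 24>.  Each byte of a VF immediate
 * is a restricted 8-bit float (1 sign bit, 3 exponent bits with bias 3,
 * 4 mantissa bits, with ±0.0 special-cased), so those bytes decode to 0.0,
 * 8.0, 16.0 and 24.0, and the type-converting MOV turns them into integers.
 * Assumes <math.h> for exp2f() and <stdint.h> for uint8_t.
 */
static float
decode_vf_byte(uint8_t vf)
{
   if ((vf & 0x7f) == 0)
      return (vf & 0x80) ? -0.0f : 0.0f;

   const unsigned sign = vf >> 7;
   const unsigned exponent = (vf >> 4) & 0x7;
   const unsigned mantissa = vf & 0xf;
   const float magnitude = (1.0f + mantissa / 16.0f) * exp2f((int)exponent - 3);

   return sign ? -magnitude : magnitude;
}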
500
501 void
502 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
503 {
504 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
505 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
506 * is not suitable to generate the shift values, but we can use the packed
507 * vector float and a type-converting MOV.
508 */
509 dst_reg shift(this, glsl_type::uvec4_type);
510 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
511
512 dst_reg shifted(this, glsl_type::uvec4_type);
513 src0.swizzle = BRW_SWIZZLE_XXXX;
514 emit(SHR(shifted, src0, src_reg(shift)));
515
516 shifted.type = BRW_REGISTER_TYPE_B;
517 dst_reg f(this, glsl_type::vec4_type);
518 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
519
520 dst_reg scaled(this, glsl_type::vec4_type);
521 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
522
523 dst_reg max(this, glsl_type::vec4_type);
524 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
525 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
526 }
527
528 void
529 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
530 {
531 dst_reg saturated(this, glsl_type::vec4_type);
532 vec4_instruction *inst = emit(MOV(saturated, src0));
533 inst->saturate = true;
534
535 dst_reg scaled(this, glsl_type::vec4_type);
536 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
537
538 dst_reg rounded(this, glsl_type::vec4_type);
539 emit(RNDE(rounded, src_reg(scaled)));
540
541 dst_reg u(this, glsl_type::uvec4_type);
542 emit(MOV(u, src_reg(rounded)));
543
544 src_reg bytes(u);
545 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
546 }
547
548 void
549 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
550 {
551 dst_reg max(this, glsl_type::vec4_type);
552 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
553
554 dst_reg min(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
556
557 dst_reg scaled(this, glsl_type::vec4_type);
558 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
559
560 dst_reg rounded(this, glsl_type::vec4_type);
561 emit(RNDE(rounded, src_reg(scaled)));
562
563 dst_reg i(this, glsl_type::ivec4_type);
564 emit(MOV(i, src_reg(rounded)));
565
566 src_reg bytes(i);
567 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
568 }
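/* Editorial sketch (not driver code): scalar reference for the two pack
 * helpers above, matching GLSL packUnorm4x8() -- clamp to [0, 1], scale by
 * 255, round to even, pack into bytes.  The snorm variant clamps to [-1, 1]
 * and scales by 127 instead.  Assumes <math.h> and <stdint.h>.
 */
static uint32_t
pack_unorm_4x8_reference(const float v[4])
{
   uint32_t packed = 0;
   for (int i = 0; i < 4; i++) {
      const float clamped = fminf(fmaxf(v[i], 0.0f), 1.0f);
      const uint32_t byte = (uint32_t)rintf(clamped * 255.0f);
      packed |= byte << (8 * i);
   }
   return packed;
}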
569
570 /*
571 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
572 * false) elements needed to pack a type.
573 */
574 static int
575 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
576 {
577 unsigned int i;
578 int size;
579
580 switch (type->base_type) {
581 case GLSL_TYPE_UINT:
582 case GLSL_TYPE_INT:
583 case GLSL_TYPE_FLOAT:
584 case GLSL_TYPE_BOOL:
585 case GLSL_TYPE_DOUBLE:
586 if (type->is_matrix()) {
587 const glsl_type *col_type = type->column_type();
588 unsigned col_slots =
589 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
590 return type->matrix_columns * col_slots;
591 } else {
592 /* Regardless of size of vector, it gets a vec4. This is bad
593 * packing for things like floats, but otherwise arrays become a
594 * mess. Hopefully a later pass over the code can pack scalars
595 * down if appropriate.
596 */
597 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
598 }
599 case GLSL_TYPE_ARRAY:
600 assert(type->length > 0);
601 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
602 case GLSL_TYPE_STRUCT:
603 size = 0;
604 for (i = 0; i < type->length; i++) {
605 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
606 }
607 return size;
608 case GLSL_TYPE_SUBROUTINE:
609 return 1;
610
611 case GLSL_TYPE_SAMPLER:
612 /* Samplers take up no register space, since they're baked in at
613 * link time.
614 */
615 return 0;
616 case GLSL_TYPE_ATOMIC_UINT:
617 return 0;
618 case GLSL_TYPE_IMAGE:
619 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
620 case GLSL_TYPE_VOID:
621 case GLSL_TYPE_ERROR:
622 case GLSL_TYPE_INTERFACE:
623 case GLSL_TYPE_FUNCTION:
624 unreachable("not reached");
625 }
626
627 return 0;
628 }
629
630 /**
631 * Returns the minimum number of vec4 elements needed to pack a type.
632 *
633 * For simple types, it will return 1 (a single vec4); for matrices, the
634 * number of columns; for array and struct, the sum of the vec4_size of
635 * each of its elements; and for sampler and atomic, zero.
636 *
637 * This method is useful to calculate how much register space is needed to
638 * store a particular type.
639 */
640 extern "C" int
641 type_size_vec4(const struct glsl_type *type)
642 {
643 return type_size_xvec4(type, true);
644 }
645
646 /**
647 * Returns the minimum number of dvec4 elements needed to pack a type.
648 *
649 * For simple types, it will return 1 (a single dvec4); for matrices, the
650 * number of columns; for array and struct, the sum of the dvec4_size of
651 * each of its elements; and for sampler and atomic, zero.
652 *
653 * This method is useful to calculate how much register space is needed to
654 * store a particular type.
655 *
656 * Measuring double-precision vertex inputs as dvec4 is required because
657 * ARB_vertex_attrib_64bit states that they use the same number of locations
658 * as the single-precision version. That is, two consecutive dvec4s would be
659 * located in locations "x" and "x+1", not "x" and "x+2".
660 *
661 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
662 * remap_vs_attrs() takes into account both the location and whether the
663 * type fits in one or two vec4 slots.
664 */
665 extern "C" int
666 type_size_dvec4(const struct glsl_type *type)
667 {
668 return type_size_xvec4(type, false);
669 }
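/* Worked examples for the two helpers above (editorial, not exhaustive):
 * float, vec3 and vec4 each take 1 slot under either measure; mat4 takes 4
 * (one per column); vec4[3] takes 3; dvec4 is dual-slot, so it counts as
 * 2 vec4 slots but only 1 dvec4 slot; dmat3 (three dual-slot dvec3 columns)
 * counts as 6 vec4 slots but 3 dvec4 slots.
 */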
670
671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
672 {
673 init();
674
675 this->file = VGRF;
676 this->nr = v->alloc.allocate(type_size_vec4(type));
677
678 if (type->is_array() || type->is_record()) {
679 this->swizzle = BRW_SWIZZLE_NOOP;
680 } else {
681 this->swizzle = brw_swizzle_for_size(type->vector_elements);
682 }
683
684 this->type = brw_type_for_base_type(type);
685 }
686
687 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
688 {
689 assert(size > 0);
690
691 init();
692
693 this->file = VGRF;
694 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
695
696 this->swizzle = BRW_SWIZZLE_NOOP;
697
698 this->type = brw_type_for_base_type(type);
699 }
700
701 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
702 {
703 init();
704
705 this->file = VGRF;
706 this->nr = v->alloc.allocate(type_size_vec4(type));
707
708 if (type->is_array() || type->is_record()) {
709 this->writemask = WRITEMASK_XYZW;
710 } else {
711 this->writemask = (1 << type->vector_elements) - 1;
712 }
713
714 this->type = brw_type_for_base_type(type);
715 }
716
717 vec4_instruction *
718 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
719 src_reg src0, src_reg src1)
720 {
721 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
722 inst->conditional_mod = conditionalmod;
723 return inst;
724 }
725
726 vec4_instruction *
727 vec4_visitor::emit_lrp(const dst_reg &dst,
728 const src_reg &x, const src_reg &y, const src_reg &a)
729 {
730 if (devinfo->gen >= 6) {
731 /* Note that the instruction's argument order is reversed from GLSL
732 * and the IR.
733 */
734 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
735 fix_3src_operand(x)));
736 } else {
737 /* Earlier generations don't support three source operations, so we
738 * need to emit x*(1-a) + y*a.
739 */
740 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
741 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
742 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
743 y_times_a.writemask = dst.writemask;
744 one_minus_a.writemask = dst.writemask;
745 x_times_one_minus_a.writemask = dst.writemask;
746
747 emit(MUL(y_times_a, y, a));
748 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
749 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
750 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
751 }
752 }
753
754 /**
755 * Emits the instructions needed to perform a pull constant load. before_block
756 * and before_inst can be NULL in which case the instruction will be appended
757 * to the end of the instruction list.
758 */
759 void
760 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
761 src_reg surf_index,
762 src_reg offset_reg,
763 bblock_t *before_block,
764 vec4_instruction *before_inst)
765 {
766 assert((before_inst == NULL && before_block == NULL) ||
767 (before_inst && before_block));
768
769 vec4_instruction *pull;
770
771 if (devinfo->gen >= 9) {
772 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
773 src_reg header(this, glsl_type::uvec4_type, 2);
774
775 pull = new(mem_ctx)
776 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
777 dst_reg(header));
778
779 if (before_inst)
780 emit_before(before_block, before_inst, pull);
781 else
782 emit(pull);
783
784 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
785 offset_reg.type);
786 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
787
788 if (before_inst)
789 emit_before(before_block, before_inst, pull);
790 else
791 emit(pull);
792
793 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
794 dst,
795 surf_index,
796 header);
797 pull->mlen = 2;
798 pull->header_size = 1;
799 } else if (devinfo->gen >= 7) {
800 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
801
802 grf_offset.type = offset_reg.type;
803
804 pull = MOV(grf_offset, offset_reg);
805
806 if (before_inst)
807 emit_before(before_block, before_inst, pull);
808 else
809 emit(pull);
810
811 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
812 dst,
813 surf_index,
814 src_reg(grf_offset));
815 pull->mlen = 1;
816 } else {
817 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
818 dst,
819 surf_index,
820 offset_reg);
821 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
822 pull->mlen = 1;
823 }
824
825 if (before_inst)
826 emit_before(before_block, before_inst, pull);
827 else
828 emit(pull);
829 }
830
831 src_reg
832 vec4_visitor::emit_uniformize(const src_reg &src)
833 {
834 const src_reg chan_index(this, glsl_type::uint_type);
835 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
836 src.type);
837
838 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
839 ->force_writemask_all = true;
840 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
841 ->force_writemask_all = true;
842
843 return src_reg(dst);
844 }
845
846 src_reg
847 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
848 src_reg coordinate, src_reg surface)
849 {
850 vec4_instruction *inst =
851 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
852 dst_reg(this, glsl_type::uvec4_type));
853 inst->base_mrf = 2;
854 inst->src[1] = surface;
855 inst->src[2] = surface;
856
857 int param_base;
858
859 if (devinfo->gen >= 9) {
860 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
861 vec4_instruction *header_inst = new(mem_ctx)
862 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
863 dst_reg(MRF, inst->base_mrf));
864
865 emit(header_inst);
866
867 inst->mlen = 2;
868 inst->header_size = 1;
869 param_base = inst->base_mrf + 1;
870 } else {
871 inst->mlen = 1;
872 param_base = inst->base_mrf;
873 }
874
875 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
876 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
877 int zero_mask = 0xf & ~coord_mask;
878
879 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
880 coordinate));
881
882 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
883 brw_imm_d(0)));
884
885 emit(inst);
886 return src_reg(inst->dst);
887 }
888
889 bool
890 vec4_visitor::is_high_sampler(src_reg sampler)
891 {
892 if (devinfo->gen < 8 && !devinfo->is_haswell)
893 return false;
894
895 return sampler.file != IMM || sampler.ud >= 16;
896 }
897
898 void
899 vec4_visitor::emit_texture(ir_texture_opcode op,
900 dst_reg dest,
901 const glsl_type *dest_type,
902 src_reg coordinate,
903 int coord_components,
904 src_reg shadow_comparitor,
905 src_reg lod, src_reg lod2,
906 src_reg sample_index,
907 uint32_t constant_offset,
908 src_reg offset_value,
909 src_reg mcs,
910 uint32_t surface,
911 src_reg surface_reg,
912 src_reg sampler_reg)
913 {
914 /* The sampler can only meaningfully compute LOD for fragment shader
915 * messages. For all other stages, we change the opcode to TXL and hardcode
916 * the LOD to 0.
917 *
918 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
919 * valid LOD argument.
920 */
921 if (op == ir_tex || op == ir_query_levels) {
922 assert(lod.file == BAD_FILE);
923 lod = brw_imm_f(0.0f);
924 }
925
926 enum opcode opcode;
927 switch (op) {
928 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
929 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
930 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
931 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
932 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
933 SHADER_OPCODE_TXF_CMS); break;
934 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
935 case ir_tg4: opcode = offset_value.file != BAD_FILE
936 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
937 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
938 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
939 case ir_txb:
940 unreachable("TXB is not valid for vertex shaders.");
941 case ir_lod:
942 unreachable("LOD is not valid for vertex shaders.");
943 case ir_samples_identical: {
944 /* There are some challenges implementing this for vec4, and it seems
945 * unlikely to be used anyway. For now, just always return false.
946 */
947 emit(MOV(dest, brw_imm_ud(0u)));
948 return;
949 }
950 default:
951 unreachable("Unrecognized tex op");
952 }
953
954 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
955
956 inst->offset = constant_offset;
957
958 /* The message header is necessary for:
959 * - Gen4 (always)
960 * - Gen9+ for selecting SIMD4x2
961 * - Texel offsets
962 * - Gather channel selection
963 * - Sampler indices too large to fit in a 4-bit value.
964 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
965 */
966 inst->header_size =
967 (devinfo->gen < 5 || devinfo->gen >= 9 ||
968 inst->offset != 0 || op == ir_tg4 ||
969 op == ir_texture_samples ||
970 is_high_sampler(sampler_reg)) ? 1 : 0;
971 inst->base_mrf = 2;
972 inst->mlen = inst->header_size;
973 inst->dst.writemask = WRITEMASK_XYZW;
974 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
975
976 inst->src[1] = surface_reg;
977 inst->src[2] = sampler_reg;
978
979 /* MRF for the first parameter */
980 int param_base = inst->base_mrf + inst->header_size;
981
982 if (op == ir_txs || op == ir_query_levels) {
983 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
984 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
985 inst->mlen++;
986 } else if (op == ir_texture_samples) {
987 inst->dst.writemask = WRITEMASK_X;
988 } else {
989 /* Load the coordinate */
990 /* FINISHME: gl_clamp_mask and saturate */
991 int coord_mask = (1 << coord_components) - 1;
992 int zero_mask = 0xf & ~coord_mask;
993
994 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
995 coordinate));
996 inst->mlen++;
997
998 if (zero_mask != 0) {
999 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1000 brw_imm_d(0)));
1001 }
1002 /* Load the shadow comparitor */
1003 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1004 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
1005 WRITEMASK_X),
1006 shadow_comparitor));
1007 inst->mlen++;
1008 }
1009
1010 /* Load the LOD info */
1011 if (op == ir_tex || op == ir_txl) {
1012 int mrf, writemask;
1013 if (devinfo->gen >= 5) {
1014 mrf = param_base + 1;
1015 if (shadow_comparitor.file != BAD_FILE) {
1016 writemask = WRITEMASK_Y;
1017 /* mlen already incremented */
1018 } else {
1019 writemask = WRITEMASK_X;
1020 inst->mlen++;
1021 }
1022 } else /* devinfo->gen == 4 */ {
1023 mrf = param_base;
1024 writemask = WRITEMASK_W;
1025 }
1026 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1027 } else if (op == ir_txf) {
1028 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1029 } else if (op == ir_txf_ms) {
1030 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1031 sample_index));
1032 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1033 /* MCS data is stored in the first two channels of `mcs`, but we
1034 * need to get it into the .y and .z channels of the second vec4
1035 * of params.
1036 */
1037 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1038 emit(MOV(dst_reg(MRF, param_base + 1,
1039 glsl_type::uint_type, WRITEMASK_YZ),
1040 mcs));
1041 } else if (devinfo->gen >= 7) {
1042 /* MCS data is in the first channel of `mcs`, but we need to get it into
1043 * the .y channel of the second vec4 of params, so replicate .x across
1044 * the whole vec4 and then mask off everything except .y
1045 */
1046 mcs.swizzle = BRW_SWIZZLE_XXXX;
1047 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1048 mcs));
1049 }
1050 inst->mlen++;
1051 } else if (op == ir_txd) {
1052 const brw_reg_type type = lod.type;
1053
1054 if (devinfo->gen >= 5) {
1055 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1056 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1059 inst->mlen++;
1060
1061 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1062 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1063 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1064 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1066 inst->mlen++;
1067
1068 if (shadow_comparitor.file != BAD_FILE) {
1069 emit(MOV(dst_reg(MRF, param_base + 2,
1070 shadow_comparitor.type, WRITEMASK_Z),
1071 shadow_comparitor));
1072 }
1073 }
1074 } else /* devinfo->gen == 4 */ {
1075 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1076 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1077 inst->mlen += 2;
1078 }
1079 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1080 if (shadow_comparitor.file != BAD_FILE) {
1081 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1082 shadow_comparitor));
1083 }
1084
1085 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1086 offset_value));
1087 inst->mlen++;
1088 }
1089 }
1090
1091 emit(inst);
1092
1093 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1094 * spec requires layers.
1095 */
1096 if (op == ir_txs && devinfo->gen < 7) {
1097 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1098 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1099 src_reg(inst->dst), brw_imm_d(1));
1100 }
1101
1102 if (devinfo->gen == 6 && op == ir_tg4) {
1103 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1104 }
1105
1106 if (op == ir_query_levels) {
1107 /* # levels is in .w */
1108 src_reg swizzled(dest);
1109 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1110 SWIZZLE_W, SWIZZLE_W);
1111 emit(MOV(dest, swizzled));
1112 }
1113 }
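/* Worked example of the marshalling above (editorial): a gen7 TXL with a
 * shadow comparator and no texel offsets needs no header, so base_mrf = 2 is
 * already the first parameter register -- m2 holds the coordinate, m3.x the
 * comparator and m3.y the LOD -- for a final mlen of 2.
 */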
1114
1115 /**
1116 * Apply workarounds for Gen6 gather with UINT/SINT
1117 */
1118 void
1119 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1120 {
1121 if (!wa)
1122 return;
1123
1124 int width = (wa & WA_8BIT) ? 8 : 16;
1125 dst_reg dst_f = dst;
1126 dst_f.type = BRW_REGISTER_TYPE_F;
1127
1128 /* Convert from UNORM to UINT */
1129 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1130 emit(MOV(dst, src_reg(dst_f)));
1131
1132 if (wa & WA_SIGN) {
1133 /* Reinterpret the UINT value as a signed INT value by
1134 * shifting the sign bit into place, then shifting back
1135 * preserving sign.
1136 */
1137 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1138 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1139 }
1140 }
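/* Editorial sketch (not driver code) of the 8-bit WA_SIGN case above: the
 * gathered value comes back as UNORM, so rescale it to the integer range and
 * sign-extend the low 8 bits with the same shift pair the driver emits.
 */
static int32_t
gen6_gather_wa_reference(float gathered_unorm)
{
   const int width = 8;
   int32_t v = (int32_t)(gathered_unorm * (float)((1 << width) - 1));

   /* Mirrors the SHL/ASR pair; relies on arithmetic right shift of a
    * negative value, as the hardware instructions do.
    */
   v = (int32_t)((uint32_t)v << (32 - width)) >> (32 - width);
   return v;
}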
1141
1142 void
1143 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1144 {
1145 unreachable("not reached");
1146 }
1147
1148 void
1149 vec4_visitor::gs_end_primitive()
1150 {
1151 unreachable("not reached");
1152 }
1153
1154 void
1155 vec4_visitor::emit_ndc_computation()
1156 {
1157 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1158 return;
1159
1160 /* Get the position */
1161 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1162
1163 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1164 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1165 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1166
1167 current_annotation = "NDC";
1168 dst_reg ndc_w = ndc;
1169 ndc_w.writemask = WRITEMASK_W;
1170 src_reg pos_w = pos;
1171 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1172 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1173
1174 dst_reg ndc_xyz = ndc;
1175 ndc_xyz.writemask = WRITEMASK_XYZ;
1176
1177 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1178 }
1179
1180 void
1181 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1182 {
1183 if (devinfo->gen < 6 &&
1184 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1185 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1186 devinfo->has_negative_rhw_bug)) {
1187 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1188 dst_reg header1_w = header1;
1189 header1_w.writemask = WRITEMASK_W;
1190
1191 emit(MOV(header1, brw_imm_ud(0u)));
1192
1193 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1194 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1195
1196 current_annotation = "Point size";
1197 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1198 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1199 }
1200
1201 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1202 current_annotation = "Clipping flags";
1203 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1204 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1205
1206 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1207 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1208 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1209
1210 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1211 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1212 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1213 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1214 }
1215
1216 /* i965 clipping workaround:
1217 * 1) Test for -ve rhw
1218 * 2) If set,
1219 * set ndc = (0,0,0,0)
1220 * set ucp[6] = 1
1221 *
1222 * Later, clipping will detect ucp[6] and ensure the primitive is
1223 * clipped against all fixed planes.
1224 */
1225 if (devinfo->has_negative_rhw_bug &&
1226 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1227 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1228 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1229 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1230 vec4_instruction *inst;
1231 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1232 inst->predicate = BRW_PREDICATE_NORMAL;
1233 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1234 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236 }
1237
1238 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1239 } else if (devinfo->gen < 6) {
1240 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1241 } else {
1242 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1243 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1244 dst_reg reg_w = reg;
1245 reg_w.writemask = WRITEMASK_W;
1246 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1247 reg_as_src.type = reg_w.type;
1248 reg_as_src.swizzle = brw_swizzle_for_size(1);
1249 emit(MOV(reg_w, reg_as_src));
1250 }
1251 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1252 dst_reg reg_y = reg;
1253 reg_y.writemask = WRITEMASK_Y;
1254 reg_y.type = BRW_REGISTER_TYPE_D;
1255 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1256 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1257 }
1258 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1259 dst_reg reg_z = reg;
1260 reg_z.writemask = WRITEMASK_Z;
1261 reg_z.type = BRW_REGISTER_TYPE_D;
1262 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1263 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1264 }
1265 }
1266 }
1267
1268 vec4_instruction *
1269 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1270 {
1271 assert(varying < VARYING_SLOT_MAX);
1272 assert(output_reg[varying].type == reg.type);
1273 current_annotation = output_reg_annotation[varying];
1274 if (output_reg[varying].file != BAD_FILE) {
1275 return emit(MOV(reg, src_reg(output_reg[varying])));
1276 } else
1277 return NULL;
1278 }
1279
1280 void
1281 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1282 {
1283 assert(varying < VARYING_SLOT_MAX);
1284 assert(varying >= VARYING_SLOT_VAR0);
1285 varying = varying - VARYING_SLOT_VAR0;
1286
1287 unsigned num_comps = output_generic_num_components[varying][component];
1288 if (num_comps == 0)
1289 return;
1290
1291 assert(output_generic_reg[varying][component].type == reg.type);
1292 current_annotation = output_reg_annotation[varying];
1293 if (output_generic_reg[varying][component].file != BAD_FILE) {
1294 src_reg src = src_reg(output_generic_reg[varying][component]);
1295 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1296 reg.writemask =
1297 brw_writemask_for_component_packing(num_comps, component);
1298 emit(MOV(reg, src));
1299 }
1300 }
1301
1302 void
1303 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1304 {
1305 reg.type = BRW_REGISTER_TYPE_F;
1306 output_reg[varying].type = reg.type;
1307
1308 switch (varying) {
1309 case VARYING_SLOT_PSIZ:
1310 {
1311 /* PSIZ is always in slot 0, and is coupled with other flags. */
1312 current_annotation = "indices, point width, clip flags";
1313 emit_psiz_and_flags(reg);
1314 break;
1315 }
1316 case BRW_VARYING_SLOT_NDC:
1317 current_annotation = "NDC";
1318 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1319 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1320 break;
1321 case VARYING_SLOT_POS:
1322 current_annotation = "gl_Position";
1323 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1324 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1325 break;
1326 case VARYING_SLOT_EDGE:
1327 /* This is present when doing unfilled polygons. We're supposed to copy
1328 * the edge flag from the user-provided vertex array
1329 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1330 * of that attribute (starts as 1.0f). This is then used in clipping to
1331 * determine which edges should be drawn as wireframe.
1332 */
1333 current_annotation = "edge flag";
1334 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1335 glsl_type::float_type, WRITEMASK_XYZW))));
1336 break;
1337 case BRW_VARYING_SLOT_PAD:
1338 /* No need to write to this slot */
1339 break;
1340 default:
1341 if (varying >= VARYING_SLOT_VAR0) {
1342 for (int i = 0; i < 4; i++) {
1343 emit_generic_urb_slot(reg, varying, i);
1344 }
1345 } else {
1346 emit_generic_urb_slot(reg, varying);
1347 }
1348 break;
1349 }
1350 }
1351
1352 static int
1353 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1354 {
1355 if (devinfo->gen >= 6) {
1356 /* URB data written (does not include the message header reg) must
1357 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1358 * section 5.4.3.2.2: URB_INTERLEAVED.
1359 *
1360 * URB entries are allocated on a multiple of 1024 bits, so an
1361 * extra 128 bits written here to make the end align to 256 is
1362 * no problem.
1363 */
1364 if ((mlen % 2) != 1)
1365 mlen++;
1366 }
1367
1368 return mlen;
1369 }
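/* Worked example (editorial): mlen here includes the message header, so it
 * must end up odd for the data after the header to be a whole number of
 * register pairs -- mlen 3 (header + 2 data regs) is left alone, while
 * mlen 4 (header + 3 data regs) is padded to 5.
 */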
1370
1371
1372 /**
1373 * Generates the VUE payload plus the necessary URB write instructions to
1374 * output it.
1375 *
1376 * The VUE layout is documented in Volume 2a.
1377 */
1378 void
1379 vec4_visitor::emit_vertex()
1380 {
1381 /* MRF 0 is reserved for the debugger, so start with message header
1382 * in MRF 1.
1383 */
1384 int base_mrf = 1;
1385 int mrf = base_mrf;
1386 /* In the process of generating our URB write message contents, we
1387 * may need to unspill a register or load from an array. Those
1388 * reads would use MRFs 14-15.
1389 */
1390 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1391
1392 /* The following assertion verifies that max_usable_mrf causes an
1393 * even-numbered amount of URB write data, which will meet gen6's
1394 * requirements for length alignment.
1395 */
1396 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1397
1398 /* First mrf is the g0-based message header containing URB handles and
1399 * such.
1400 */
1401 emit_urb_write_header(mrf++);
1402
1403 if (devinfo->gen < 6) {
1404 emit_ndc_computation();
1405 }
1406
1407 /* We may need to split this up into several URB writes, so do them in a
1408 * loop.
1409 */
1410 int slot = 0;
1411 bool complete = false;
1412 do {
1413 /* URB offset is in URB row increments, and each of our MRFs is half of
1414 * one of those, since we're doing interleaved writes.
1415 */
1416 int offset = slot / 2;
1417
1418 mrf = base_mrf + 1;
1419 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1420 emit_urb_slot(dst_reg(MRF, mrf++),
1421 prog_data->vue_map.slot_to_varying[slot]);
1422
1423 /* If this was max_usable_mrf, we can't fit anything more into this
1424 * URB WRITE. Same thing if we reached the maximum length available.
1425 */
1426 if (mrf > max_usable_mrf ||
1427 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1428 slot++;
1429 break;
1430 }
1431 }
1432
1433 complete = slot >= prog_data->vue_map.num_slots;
1434 current_annotation = "URB write";
1435 vec4_instruction *inst = emit_urb_write_opcode(complete);
1436 inst->base_mrf = base_mrf;
1437 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1438 inst->offset += offset;
1439 } while(!complete);
1440 }
1441
1442
1443 src_reg
1444 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1445 src_reg *reladdr, int reg_offset)
1446 {
1447 /* Because we store the values to scratch interleaved like our
1448 * vertex data, we need to scale the vec4 index by 2.
1449 */
1450 int message_header_scale = 2;
1451
1452 /* Pre-gen6, the message header uses byte offsets instead of vec4
1453 * (16-byte) offset units.
1454 */
1455 if (devinfo->gen < 6)
1456 message_header_scale *= 16;
1457
1458 if (reladdr) {
1459 src_reg index = src_reg(this, glsl_type::int_type);
1460
1461 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1462 brw_imm_d(reg_offset)));
1463 emit_before(block, inst, MUL(dst_reg(index), index,
1464 brw_imm_d(message_header_scale)));
1465
1466 return index;
1467 } else {
1468 return brw_imm_d(reg_offset * message_header_scale);
1469 }
1470 }
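/* Worked example (editorial): for reg_offset 3 with no reladdr, this returns
 * an immediate of 6 on gen6+ (16-byte units, two rows per register because of
 * interleaving) or 96 on gen4/5, where the message header takes byte offsets.
 */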
1471
1472 /**
1473 * Emits an instruction before @inst to load the value named by @orig_src
1474 * from scratch space at @base_offset to @temp.
1475 *
1476 * @base_offset is measured in 32-byte units (the size of a register).
1477 */
1478 void
1479 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1480 dst_reg temp, src_reg orig_src,
1481 int base_offset)
1482 {
1483 assert(orig_src.offset % REG_SIZE == 0);
1484 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1485 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1486 reg_offset);
1487
1488 emit_before(block, inst, SCRATCH_READ(temp, index));
1489 }
1490
1491 /**
1492 * Emits an instruction after @inst to store the value to be written
1493 * to @orig_dst to scratch space at @base_offset, from @temp.
1494 *
1495 * @base_offset is measured in 32-byte units (the size of a register).
1496 */
1497 void
1498 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1499 int base_offset)
1500 {
1501 assert(inst->dst.offset % REG_SIZE == 0);
1502 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1503 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1504 reg_offset);
1505
1506 /* Create a temporary register to store *inst's result in.
1507 *
1508 * We have to be careful in MOVing from our temporary result register in
1509 * the scratch write. If we swizzle from channels of the temporary that
1510 * weren't initialized, it will confuse live interval analysis, which will
1511 * make spilling fail to make progress.
1512 */
1513 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1514 inst->dst.type),
1515 brw_swizzle_for_mask(inst->dst.writemask));
1516 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1517 inst->dst.writemask));
1518 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1519 if (inst->opcode != BRW_OPCODE_SEL)
1520 write->predicate = inst->predicate;
1521 write->ir = inst->ir;
1522 write->annotation = inst->annotation;
1523 inst->insert_after(block, write);
1524
1525 inst->dst.file = temp.file;
1526 inst->dst.nr = temp.nr;
1527 inst->dst.offset %= REG_SIZE;
1528 inst->dst.reladdr = NULL;
1529 }
1530
1531 /**
1532 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1533 * adds the scratch read(s) before \p inst. The function also checks for
1534 * recursive reladdr scratch accesses, issuing the corresponding scratch
1535 * loads and rewriting reladdr references accordingly.
1536 *
1537 * \return \p src if it did not require a scratch load, otherwise, the
1538 * register holding the result of the scratch load that the caller should
1539 * use to rewrite src.
1540 */
1541 src_reg
1542 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1543 vec4_instruction *inst, src_reg src)
1544 {
1545 /* Resolve recursive reladdr scratch access by calling ourselves
1546 * with src.reladdr
1547 */
1548 if (src.reladdr)
1549 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1550 *src.reladdr);
1551
1552 /* Now handle scratch access on src */
1553 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1554 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1555 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1556 src.nr = temp.nr;
1557 src.offset %= REG_SIZE;
1558 src.reladdr = NULL;
1559 }
1560
1561 return src;
1562 }
1563
1564 /**
1565 * We can't generally support array access in GRF space, because a
1566 * single instruction's destination can only span 2 contiguous
1567 * registers. So, we send all GRF arrays that get variable index
1568 * access to scratch space.
1569 */
1570 void
1571 vec4_visitor::move_grf_array_access_to_scratch()
1572 {
1573 int scratch_loc[this->alloc.count];
1574 memset(scratch_loc, -1, sizeof(scratch_loc));
1575
1576 /* First, calculate the set of virtual GRFs that need to be punted
1577 * to scratch due to having any array access on them, and where in
1578 * scratch.
1579 */
1580 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1581 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1582 if (scratch_loc[inst->dst.nr] == -1) {
1583 scratch_loc[inst->dst.nr] = last_scratch;
1584 last_scratch += this->alloc.sizes[inst->dst.nr];
1585 }
1586
1587 for (src_reg *iter = inst->dst.reladdr;
1588 iter->reladdr;
1589 iter = iter->reladdr) {
1590 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1591 scratch_loc[iter->nr] = last_scratch;
1592 last_scratch += this->alloc.sizes[iter->nr];
1593 }
1594 }
1595 }
1596
1597 for (int i = 0 ; i < 3; i++) {
1598 for (src_reg *iter = &inst->src[i];
1599 iter->reladdr;
1600 iter = iter->reladdr) {
1601 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1602 scratch_loc[iter->nr] = last_scratch;
1603 last_scratch += this->alloc.sizes[iter->nr];
1604 }
1605 }
1606 }
1607 }
1608
1609 /* Now, for anything that will be accessed through scratch, rewrite
1610 * it to load/store. Note that this is a _safe list walk, because
1611 * we may generate a new scratch_write instruction after the one
1612 * we're processing.
1613 */
1614 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1615 /* Set up the annotation tracking for new generated instructions. */
1616 base_ir = inst->ir;
1617 current_annotation = inst->annotation;
1618
1619 /* First handle scratch access on the dst. Notice we have to handle
1620 * the case where the dst's reladdr also points to scratch space.
1621 */
1622 if (inst->dst.reladdr)
1623 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1624 *inst->dst.reladdr);
1625
1626 /* Now that we have handled any (possibly recursive) reladdr scratch
1627 * accesses for dst we can safely do the scratch write for dst itself
1628 */
1629 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1630 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1631
1632 /* Now handle scratch access on any src. In this case, since inst->src[i]
1633 * already is a src_reg, we can just call emit_resolve_reladdr with
1634 * inst->src[i] and it will take care of handling scratch loads for
1635 * both src and src.reladdr (recursively).
1636 */
1637 for (int i = 0 ; i < 3; i++) {
1638 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1639 inst->src[i]);
1640 }
1641 }
1642 }
1643
1644 /**
1645 * Emits an instruction before @inst to load the value named by @orig_src
1646 * from the pull constant buffer (surface) at @base_offset to @temp.
1647 */
1648 void
1649 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1650 dst_reg temp, src_reg orig_src,
1651 int base_offset, src_reg indirect)
1652 {
1653 assert(orig_src.offset % 16 == 0);
1654 int reg_offset = base_offset + orig_src.offset / 16;
1655 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1656
1657 src_reg offset;
1658 if (indirect.file != BAD_FILE) {
1659 offset = src_reg(this, glsl_type::uint_type);
1660
1661 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1662 brw_imm_ud(reg_offset * 16)));
1663 } else if (devinfo->gen >= 8) {
1664 /* Store the offset in a GRF so we can send-from-GRF. */
1665 offset = src_reg(this, glsl_type::uint_type);
1666 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1667 } else {
1668 offset = brw_imm_d(reg_offset * 16);
1669 }
1670
1671 emit_pull_constant_load_reg(temp,
1672 brw_imm_ud(index),
1673 offset,
1674 block, inst);
1675
1676 brw_mark_surface_used(&prog_data->base, index);
1677 }
1678
1679 /**
1680 * Implements array access of uniforms by inserting a
1681 * PULL_CONSTANT_LOAD instruction.
1682 *
1683 * Unlike temporary GRF array access (where we don't support it due to
1684 * the difficulty of doing relative addressing on instruction
1685 * destinations), we could potentially do array access of uniforms
1686 * that were loaded in GRF space as push constants. In real-world
1687 * usage we've seen, though, the arrays being used are always larger
1688 * than we could load as push constants, so just always move all
1689 * uniform array access out to a pull constant buffer.
1690 */
1691 void
1692 vec4_visitor::move_uniform_array_access_to_pull_constants()
1693 {
1694 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1695 * everything has to be pushed regardless.
1696 */
1697 if (stage_prog_data->pull_param == NULL) {
1698 split_uniform_registers();
1699 return;
1700 }
1701
1702 int pull_constant_loc[this->uniforms];
1703 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1704
1705 /* First, walk through the instructions and determine which things need to
1706 * be pulled. We mark something as needing to be pulled by setting
1707 * pull_constant_loc to 0.
1708 */
1709 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1710 /* We only care about MOV_INDIRECT of a uniform */
1711 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1712 inst->src[0].file != UNIFORM)
1713 continue;
1714
1715 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1716
1717 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1718 pull_constant_loc[uniform_nr + j] = 0;
1719 }
1720
1721 /* Next, we walk the list of uniforms and assign real pull constant
1722 * locations and set their corresponding entries in pull_param.
1723 */
1724 for (int j = 0; j < this->uniforms; j++) {
1725 if (pull_constant_loc[j] < 0)
1726 continue;
1727
1728 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1729
1730 for (int i = 0; i < 4; i++) {
1731 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1732 = stage_prog_data->param[j * 4 + i];
1733 }
1734 }
1735
1736 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1737 * instructions to actual uniform pulls.
1738 */
1739 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1740 /* We only care about MOV_INDIRECT of a uniform */
1741 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1742 inst->src[0].file != UNIFORM)
1743 continue;
1744
1745 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1746
1747 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1748
1749 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1750 pull_constant_loc[uniform_nr], inst->src[1]);
1751 inst->remove(block);
1752 }
1753
1754 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1755 * no need to track them as larger-than-vec4 objects. This will be
1756 * relied on in cutting out unused uniform vectors from push
1757 * constants.
1758 */
1759 split_uniform_registers();
1760 }
1761
1762 void
1763 vec4_visitor::resolve_ud_negate(src_reg *reg)
1764 {
1765 if (reg->type != BRW_REGISTER_TYPE_UD ||
1766 !reg->negate)
1767 return;
1768
1769 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1770 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1771 *reg = temp;
1772 }
1773
1774 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1775 void *log_data,
1776 const struct brw_sampler_prog_key_data *key_tex,
1777 struct brw_vue_prog_data *prog_data,
1778 const nir_shader *shader,
1779 void *mem_ctx,
1780 bool no_spills,
1781 int shader_time_index)
1782 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1783 key_tex(key_tex),
1784 prog_data(prog_data),
1785 fail_msg(NULL),
1786 first_non_payload_grf(0),
1787 need_all_constants_in_pull_buffer(false),
1788 no_spills(no_spills),
1789 shader_time_index(shader_time_index),
1790 last_scratch(0)
1791 {
1792 this->failed = false;
1793
1794 this->base_ir = NULL;
1795 this->current_annotation = NULL;
1796 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1797
1798 memset(this->output_generic_num_components, 0,
1799 sizeof(this->output_generic_num_components));
1800
1801 this->virtual_grf_start = NULL;
1802 this->virtual_grf_end = NULL;
1803 this->live_intervals = NULL;
1804
1805 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1806
1807 this->uniforms = 0;
1808 }
1809
1810 vec4_visitor::~vec4_visitor()
1811 {
1812 }
1813
1814
1815 void
1816 vec4_visitor::fail(const char *format, ...)
1817 {
1818 va_list va;
1819 char *msg;
1820
1821 if (failed)
1822 return;
1823
1824 failed = true;
1825
1826 va_start(va, format);
1827 msg = ralloc_vasprintf(mem_ctx, format, va);
1828 va_end(va);
1829 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1830
1831 this->fail_msg = msg;
1832
1833 if (debug_enabled) {
1834 fprintf(stderr, "%s", msg);
1835 }
1836 }
1837
1838 } /* namespace brw */