intel/compiler: Move the destructor from vec4_visitor to backend_shader
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 namespace brw {
29
30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31 const src_reg &src0, const src_reg &src1,
32 const src_reg &src2)
33 {
34 this->opcode = opcode;
35 this->dst = dst;
36 this->src[0] = src0;
37 this->src[1] = src1;
38 this->src[2] = src2;
39 this->saturate = false;
40 this->force_writemask_all = false;
41 this->no_dd_clear = false;
42 this->no_dd_check = false;
43 this->writes_accumulator = false;
44 this->conditional_mod = BRW_CONDITIONAL_NONE;
45 this->predicate = BRW_PREDICATE_NONE;
46 this->predicate_inverse = false;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->eot = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
58 this->group = 0;
59 this->size_written = (dst.file == BAD_FILE ?
60 0 : this->exec_size * type_sz(dst.type));
61 this->annotation = NULL;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67 inst->ir = this->base_ir;
68 inst->annotation = this->current_annotation;
69
70 this->instructions.push_tail(inst);
71
72 return inst;
73 }
74
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77 vec4_instruction *new_inst)
78 {
79 new_inst->ir = inst->ir;
80 new_inst->annotation = inst->annotation;
81
82 inst->insert_before(block, new_inst);
83
84 return inst;
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1, const src_reg &src2)
90 {
91 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97 const src_reg &src1)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113
114 vec4_instruction *
115 vec4_visitor::emit(enum opcode opcode)
116 {
117 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119
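/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out one builder helper per
 * opcode (e.g. vec4_visitor::ADD()).  Note that these helpers only allocate
 * the vec4_instruction in mem_ctx; the caller still has to hand the result
 * to emit() or emit_before() to append it to the instruction stream.
 */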
120 #define ALU1(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
123 { \
124 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125 }
126
127 #define ALU2(op) \
128 vec4_instruction * \
129 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
130 const src_reg &src1) \
131 { \
132 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
133 src0, src1); \
134 }
135
136 #define ALU2_ACC(op) \
137 vec4_instruction * \
138 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
139 const src_reg &src1) \
140 { \
141 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
142 BRW_OPCODE_##op, dst, src0, src1); \
143 inst->writes_accumulator = true; \
144 return inst; \
145 }
146
147 #define ALU3(op) \
148 vec4_instruction * \
149 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
150 const src_reg &src1, const src_reg &src2) \
151 { \
152 assert(devinfo->gen >= 6); \
153 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
154 src0, src1, src2); \
155 }
156
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195 vec4_instruction *inst;
196
197 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198 inst->predicate = predicate;
199
200 return inst;
201 }
202
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206 enum brw_conditional_mod condition)
207 {
208 assert(devinfo->gen == 6);
209
210 vec4_instruction *inst;
211
212 resolve_ud_negate(&src0);
213 resolve_ud_negate(&src1);
214
215 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 /**
223 * CMP: Sets the low bit of the destination channels with the result
224 * of the comparison, while the upper bits are undefined, and updates
225 * the flag register with the packed 16 bits of the result.
226 */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229 enum brw_conditional_mod condition)
230 {
231 vec4_instruction *inst;
232
233 /* Take the instruction:
234 *
235 * CMP null<d> src0<f> src1<f>
236 *
237 * Original gen4 does type conversion to the destination type before
238 * comparison, producing garbage results for floating point comparisons.
239 *
240 * The destination type doesn't matter on newer generations, so we set the
241 * type to match src0 so we can compact the instruction.
242 */
243 dst.type = src0.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310 if (!src.abs && !src.negate)
311 return src;
312
313 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314 resolved.type = src.type;
315 emit(MOV(resolved, src));
316
317 return src_reg(resolved);
318 }
319
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324 return src;
325
326 /* The gen6 math instruction ignores the source modifiers --
327 * swizzle, abs, negate, and at least some parts of the register
328 * region description.
329 *
330 * Rather than trying to enumerate all these cases, *always* expand the
331 * operand to a temp GRF for gen6.
332 *
333 * For gen7, keep the operand as-is, except if immediate, which gen7 still
334 * can't use.
335 */
336
337 if (devinfo->gen == 7 && src.file != IMM)
338 return src;
339
340 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341 expanded.type = src.type;
342 emit(MOV(expanded, src));
343 return src_reg(expanded);
344 }
345
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348 const dst_reg &dst,
349 const src_reg &src0, const src_reg &src1)
350 {
351 vec4_instruction *math =
352 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353
354 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355 /* MATH on Gen6 must be align1, so we can't do writemasks. */
356 math->dst = dst_reg(this, glsl_type::vec4_type);
357 math->dst.type = dst.type;
358 math = emit(MOV(dst, src_reg(math->dst)));
359 } else if (devinfo->gen < 6) {
360 math->base_mrf = 1;
361 math->mlen = src1.file == BAD_FILE ? 1 : 2;
362 }
363
364 return math;
365 }
366
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370 if (devinfo->gen < 7) {
371 unreachable("ir_unop_pack_half_2x16 should be lowered");
372 }
373
374 assert(dst.type == BRW_REGISTER_TYPE_UD);
375 assert(src0.type == BRW_REGISTER_TYPE_F);
376
377 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378 *
379 * Because this instruction does not have a 16-bit floating-point type,
380 * the destination data type must be Word (W).
381 *
382 * The destination must be DWord-aligned and specify a horizontal stride
383 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
384 * each destination channel and the upper word is not modified.
385 *
386 * The above restriction implies that the f32to16 instruction must use
387 * align1 mode, because only in align1 mode is it possible to specify
388 * horizontal stride. We choose here to defy the hardware docs and emit
389 * align16 instructions.
390 *
391 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392 * instructions. I was partially successful in that the code passed all
393 * tests. However, the code was dubiously correct and fragile, and the
394 * tests were not harsh enough to probe that frailty. Not trusting the
395 * code, I chose instead to remain in align16 mode in defiance of the hw
396 * docs).
397 *
398 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399 * simulator, emitting a f32to16 in align16 mode with UD as destination
400 * data type is safe. The behavior differs from that specified in the PRM
401 * in that the upper word of each destination channel is cleared to 0.
402 */
403
404 dst_reg tmp_dst(this, glsl_type::uvec2_type);
405 src_reg tmp_src(tmp_dst);
406
407 #if 0
408 /* Verify the undocumented behavior on which the following instructions
409 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
410 * then the result of the bit-or instruction below will be incorrect.
411 *
412 * You should inspect the disasm output in order to verify that the MOV is
413 * not optimized away.
414 */
415 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417
418 /* Give tmp the form below, where "." means untouched.
419 *
420 * w z y x w z y x
421 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422 *
423 * That the upper word of each write-channel be 0 is required for the
424 * following bit-shift and bit-or instructions to work. Note that this
425 * relies on the undocumented hardware behavior mentioned above.
426 */
427 tmp_dst.writemask = WRITEMASK_XY;
428 emit(F32TO16(tmp_dst, src0));
429
430 /* Give the write-channels of dst the form:
431 * 0xhhhh0000
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435
436 /* Finally, give the write-channels of dst the form of packHalf2x16's
437 * output:
438 * 0xhhhhllll
439 */
440 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441 emit(OR(dst, src_reg(dst), tmp_src));
442 }
443
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447 if (devinfo->gen < 7) {
448 unreachable("ir_unop_unpack_half_2x16 should be lowered");
449 }
450
451 assert(dst.type == BRW_REGISTER_TYPE_F);
452 assert(src0.type == BRW_REGISTER_TYPE_UD);
453
454 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455 *
456 * Because this instruction does not have a 16-bit floating-point type,
457 * the source data type must be Word (W). The destination type must be
458 * F (Float).
459 *
460 * To use W as the source data type, we must adjust horizontal strides,
461 * which is only possible in align1 mode. All my [chadv] attempts at
462 * emitting align1 instructions for unpackHalf2x16 failed to pass the
463 * Piglit tests, so I gave up.
464 *
465 * I've verified that, on gen7 hardware and the simulator, it is safe to
466 * emit f16to32 in align16 mode with UD as source data type.
467 */
468
469 dst_reg tmp_dst(this, glsl_type::uvec2_type);
470 src_reg tmp_src(tmp_dst);
471
472 tmp_dst.writemask = WRITEMASK_X;
473 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474
475 tmp_dst.writemask = WRITEMASK_Y;
476 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477
478 dst.writemask = WRITEMASK_XY;
479 emit(F16TO32(dst, tmp_src));
480 }
481
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487 * is not suitable to generate the shift values, but we can use the packed
488 * vector float and a type-converting MOV.
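 *
 * A sketch of what the immediate below encodes (assuming the usual 8-bit
 * restricted-float VF layout of 1 sign, 3 exponent and 4 mantissa bits):
 * 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0, which the
 * type-converting MOV into the UD register turns into the shift counts
 * <0, 8, 16, 24>.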
489 */
490 dst_reg shift(this, glsl_type::uvec4_type);
491 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492
493 dst_reg shifted(this, glsl_type::uvec4_type);
494 src0.swizzle = BRW_SWIZZLE_XXXX;
495 emit(SHR(shifted, src0, src_reg(shift)));
496
497 shifted.type = BRW_REGISTER_TYPE_UB;
498 dst_reg f(this, glsl_type::vec4_type);
499 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500
501 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509 * is not suitable to generate the shift values, but we can use the packed
510 * vector float and a type-converting MOV.
511 */
512 dst_reg shift(this, glsl_type::uvec4_type);
513 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514
515 dst_reg shifted(this, glsl_type::uvec4_type);
516 src0.swizzle = BRW_SWIZZLE_XXXX;
517 emit(SHR(shifted, src0, src_reg(shift)));
518
519 shifted.type = BRW_REGISTER_TYPE_B;
520 dst_reg f(this, glsl_type::vec4_type);
521 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522
523 dst_reg scaled(this, glsl_type::vec4_type);
524 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525
526 dst_reg max(this, glsl_type::vec4_type);
527 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534 dst_reg saturated(this, glsl_type::vec4_type);
535 vec4_instruction *inst = emit(MOV(saturated, src0));
536 inst->saturate = true;
537
538 dst_reg scaled(this, glsl_type::vec4_type);
539 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540
541 dst_reg rounded(this, glsl_type::vec4_type);
542 emit(RNDE(rounded, src_reg(scaled)));
543
544 dst_reg u(this, glsl_type::uvec4_type);
545 emit(MOV(u, src_reg(rounded)));
546
547 src_reg bytes(u);
548 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554 dst_reg max(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556
557 dst_reg min(this, glsl_type::vec4_type);
558 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559
560 dst_reg scaled(this, glsl_type::vec4_type);
561 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562
563 dst_reg rounded(this, glsl_type::vec4_type);
564 emit(RNDE(rounded, src_reg(scaled)));
565
566 dst_reg i(this, glsl_type::ivec4_type);
567 emit(MOV(i, src_reg(rounded)));
568
569 src_reg bytes(i);
570 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572
573 /*
574 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575 * false) elements needed to pack a type.
576 */
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580 unsigned int i;
581 int size;
582
583 switch (type->base_type) {
584 case GLSL_TYPE_UINT:
585 case GLSL_TYPE_INT:
586 case GLSL_TYPE_FLOAT:
587 case GLSL_TYPE_BOOL:
588 case GLSL_TYPE_DOUBLE:
589 case GLSL_TYPE_UINT64:
590 case GLSL_TYPE_INT64:
591 if (type->is_matrix()) {
592 const glsl_type *col_type = type->column_type();
593 unsigned col_slots =
594 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
595 return type->matrix_columns * col_slots;
596 } else {
597 /* Regardless of size of vector, it gets a vec4. This is bad
598 * packing for things like floats, but otherwise arrays become a
599 * mess. Hopefully a later pass over the code can pack scalars
600 * down if appropriate.
601 */
602 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
603 }
604 case GLSL_TYPE_ARRAY:
605 assert(type->length > 0);
606 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
607 case GLSL_TYPE_STRUCT:
608 size = 0;
609 for (i = 0; i < type->length; i++) {
610 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
611 }
612 return size;
613 case GLSL_TYPE_SUBROUTINE:
614 return 1;
615
616 case GLSL_TYPE_SAMPLER:
617 /* Samplers take up no register space, since they're baked in at
618 * link time.
619 */
620 return 0;
621 case GLSL_TYPE_ATOMIC_UINT:
622 return 0;
623 case GLSL_TYPE_IMAGE:
624 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
625 case GLSL_TYPE_VOID:
626 case GLSL_TYPE_ERROR:
627 case GLSL_TYPE_INTERFACE:
628 case GLSL_TYPE_FUNCTION:
629 unreachable("not reached");
630 }
631
632 return 0;
633 }
634
635 /**
636 * Returns the minimum number of vec4 elements needed to pack a type.
637 *
638 * For simple types, it will return 1 (a single vec4); for matrices, the
639 * number of columns; for array and struct, the sum of the vec4_size of
640 * each of its elements; and for sampler and atomic, zero.
641 *
642 * This method is useful to calculate how much register space is needed to
643 * store a particular type.
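 *
 * For example (an illustrative reading of the rules in type_size_xvec4()):
 * a float, bool or vec3 still takes one vec4 slot, a mat3 takes three (one
 * per column), a float[4] array takes four, and a dvec4 takes two because
 * it is dual-slot.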
644 */
645 extern "C" int
646 type_size_vec4(const struct glsl_type *type)
647 {
648 return type_size_xvec4(type, true);
649 }
650
651 /**
652 * Returns the minimum number of dvec4 elements needed to pack a type.
653 *
654 * For simple types, it will return 1 (a single dvec4); for matrices, the
655 * number of columns; for array and struct, the sum of the dvec4_size of
656 * each of its elements; and for sampler and atomic, zero.
657 *
658 * This method is useful to calculate how much register space is needed to
659 * store a particular type.
660 *
661 * Measuring double-precision vertex inputs as dvec4 is required because
662 * ARB_vertex_attrib_64bit states that they use the same number of locations
663 * as the single-precision version. That is, two consecutive dvec4s would be
664 * located in location "x" and location "x+1", not "x+2".
665 *
666 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
667 * remap_vs_attrs() will take into account both the location and whether the
668 * type fits in one or two vec4 slots.
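 *
 * As a concrete sketch of the difference: a dvec4[2] input measures as 2
 * here (one dvec4 slot per element, i.e. locations "x" and "x+1"), whereas
 * type_size_vec4() would report 4 for the same type.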
669 */
670 extern "C" int
671 type_size_dvec4(const struct glsl_type *type)
672 {
673 return type_size_xvec4(type, false);
674 }
675
676 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
677 {
678 init();
679
680 this->file = VGRF;
681 this->nr = v->alloc.allocate(type_size_vec4(type));
682
683 if (type->is_array() || type->is_record()) {
684 this->swizzle = BRW_SWIZZLE_NOOP;
685 } else {
686 this->swizzle = brw_swizzle_for_size(type->vector_elements);
687 }
688
689 this->type = brw_type_for_base_type(type);
690 }
691
692 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
693 {
694 assert(size > 0);
695
696 init();
697
698 this->file = VGRF;
699 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
700
701 this->swizzle = BRW_SWIZZLE_NOOP;
702
703 this->type = brw_type_for_base_type(type);
704 }
705
706 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
707 {
708 init();
709
710 this->file = VGRF;
711 this->nr = v->alloc.allocate(type_size_vec4(type));
712
713 if (type->is_array() || type->is_record()) {
714 this->writemask = WRITEMASK_XYZW;
715 } else {
716 this->writemask = (1 << type->vector_elements) - 1;
717 }
718
719 this->type = brw_type_for_base_type(type);
720 }
721
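/* SEL with a conditional modifier implements min/max: BRW_CONDITIONAL_L
 * yields min(src0, src1) and BRW_CONDITIONAL_GE yields max(src0, src1),
 * which is how the pack/unpack helpers above clamp snorm values to [-1, 1].
 */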
722 vec4_instruction *
723 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
724 src_reg src0, src_reg src1)
725 {
726 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
727 inst->conditional_mod = conditionalmod;
728 return inst;
729 }
730
731 vec4_instruction *
732 vec4_visitor::emit_lrp(const dst_reg &dst,
733 const src_reg &x, const src_reg &y, const src_reg &a)
734 {
735 if (devinfo->gen >= 6) {
736 /* Note that the instruction's argument order is reversed from GLSL
737 * and the IR.
738 */
739 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
740 fix_3src_operand(x)));
741 } else {
742 /* Earlier generations don't support three source operations, so we
743 * need to emit x*(1-a) + y*a.
744 */
745 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
746 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
747 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
748 y_times_a.writemask = dst.writemask;
749 one_minus_a.writemask = dst.writemask;
750 x_times_one_minus_a.writemask = dst.writemask;
751
752 emit(MUL(y_times_a, y, a));
753 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
754 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
755 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
756 }
757 }
758
759 /**
760 * Emits the instructions needed to perform a pull constant load. before_block
761 * and before_inst can be NULL in which case the instruction will be appended
762 * to the end of the instruction list.
763 */
764 void
765 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
766 src_reg surf_index,
767 src_reg offset_reg,
768 bblock_t *before_block,
769 vec4_instruction *before_inst)
770 {
771 assert((before_inst == NULL && before_block == NULL) ||
772 (before_inst && before_block));
773
774 vec4_instruction *pull;
775
776 if (devinfo->gen >= 9) {
777 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
778 src_reg header(this, glsl_type::uvec4_type, 2);
779
780 pull = new(mem_ctx)
781 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
782 dst_reg(header));
783
784 if (before_inst)
785 emit_before(before_block, before_inst, pull);
786 else
787 emit(pull);
788
789 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
790 offset_reg.type);
791 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
792
793 if (before_inst)
794 emit_before(before_block, before_inst, pull);
795 else
796 emit(pull);
797
798 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
799 dst,
800 surf_index,
801 header);
802 pull->mlen = 2;
803 pull->header_size = 1;
804 } else if (devinfo->gen >= 7) {
805 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
806
807 grf_offset.type = offset_reg.type;
808
809 pull = MOV(grf_offset, offset_reg);
810
811 if (before_inst)
812 emit_before(before_block, before_inst, pull);
813 else
814 emit(pull);
815
816 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
817 dst,
818 surf_index,
819 src_reg(grf_offset));
820 pull->mlen = 1;
821 } else {
822 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
823 dst,
824 surf_index,
825 offset_reg);
826 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
827 pull->mlen = 1;
828 }
829
830 if (before_inst)
831 emit_before(before_block, before_inst, pull);
832 else
833 emit(pull);
834 }
835
836 src_reg
837 vec4_visitor::emit_uniformize(const src_reg &src)
838 {
839 const src_reg chan_index(this, glsl_type::uint_type);
840 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
841 src.type);
842
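   /* FIND_LIVE_CHANNEL computes the index of an enabled channel and
    * BROADCAST then replicates that channel of src across every channel of
    * dst, so the returned register reads the same in all channels.
    */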
843 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
844 ->force_writemask_all = true;
845 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
846 ->force_writemask_all = true;
847
848 return src_reg(dst);
849 }
850
851 src_reg
852 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
853 src_reg coordinate, src_reg surface)
854 {
855 vec4_instruction *inst =
856 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
857 dst_reg(this, glsl_type::uvec4_type));
858 inst->base_mrf = 2;
859 inst->src[1] = surface;
860 inst->src[2] = surface;
861
862 int param_base;
863
864 if (devinfo->gen >= 9) {
865 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
866 vec4_instruction *header_inst = new(mem_ctx)
867 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
868 dst_reg(MRF, inst->base_mrf));
869
870 emit(header_inst);
871
872 inst->mlen = 2;
873 inst->header_size = 1;
874 param_base = inst->base_mrf + 1;
875 } else {
876 inst->mlen = 1;
877 param_base = inst->base_mrf;
878 }
879
880 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
881 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
882 int zero_mask = 0xf & ~coord_mask;
883
884 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
885 coordinate));
886
887 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
888 brw_imm_d(0)));
889
890 emit(inst);
891 return src_reg(inst->dst);
892 }
893
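/* A sampler index that is not an immediate, or is 16 or greater, cannot be
 * encoded in the 4-bit sampler field of the message descriptor, so
 * emit_texture() routes it through the message-header path instead (only
 * relevant on Haswell and gen8+ here).
 */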
894 bool
895 vec4_visitor::is_high_sampler(src_reg sampler)
896 {
897 if (devinfo->gen < 8 && !devinfo->is_haswell)
898 return false;
899
900 return sampler.file != IMM || sampler.ud >= 16;
901 }
902
903 void
904 vec4_visitor::emit_texture(ir_texture_opcode op,
905 dst_reg dest,
906 const glsl_type *dest_type,
907 src_reg coordinate,
908 int coord_components,
909 src_reg shadow_comparator,
910 src_reg lod, src_reg lod2,
911 src_reg sample_index,
912 uint32_t constant_offset,
913 src_reg offset_value,
914 src_reg mcs,
915 uint32_t surface,
916 src_reg surface_reg,
917 src_reg sampler_reg)
918 {
919 enum opcode opcode;
920 switch (op) {
921 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
922 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
923 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
924 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
925 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
926 SHADER_OPCODE_TXF_CMS); break;
927 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
928 case ir_tg4: opcode = offset_value.file != BAD_FILE
929 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
930 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
931 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
932 case ir_txb:
933 unreachable("TXB is not valid for vertex shaders.");
934 case ir_lod:
935 unreachable("LOD is not valid for vertex shaders.");
936 case ir_samples_identical: {
937 /* There are some challenges implementing this for vec4, and it seems
938 * unlikely to be used anyway. For now, just always return false.
939 */
940 emit(MOV(dest, brw_imm_ud(0u)));
941 return;
942 }
943 default:
944 unreachable("Unrecognized tex op");
945 }
946
947 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
948
949 inst->offset = constant_offset;
950
951 /* The message header is necessary for:
952 * - Gen4 (always)
953 * - Gen9+ for selecting SIMD4x2
954 * - Texel offsets
955 * - Gather channel selection
956 * - Sampler indices too large to fit in a 4-bit value.
957 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
958 */
959 inst->header_size =
960 (devinfo->gen < 5 || devinfo->gen >= 9 ||
961 inst->offset != 0 || op == ir_tg4 ||
962 op == ir_texture_samples ||
963 is_high_sampler(sampler_reg)) ? 1 : 0;
964 inst->base_mrf = 2;
965 inst->mlen = inst->header_size;
966 inst->dst.writemask = WRITEMASK_XYZW;
967 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
968
969 inst->src[1] = surface_reg;
970 inst->src[2] = sampler_reg;
971
972 /* MRF for the first parameter */
973 int param_base = inst->base_mrf + inst->header_size;
974
975 if (op == ir_txs || op == ir_query_levels) {
976 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
977 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
978 inst->mlen++;
979 } else if (op == ir_texture_samples) {
980 inst->dst.writemask = WRITEMASK_X;
981 } else {
982 /* Load the coordinate */
983 /* FINISHME: gl_clamp_mask and saturate */
984 int coord_mask = (1 << coord_components) - 1;
985 int zero_mask = 0xf & ~coord_mask;
986
987 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
988 coordinate));
989 inst->mlen++;
990
991 if (zero_mask != 0) {
992 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
993 brw_imm_d(0)));
994 }
995 /* Load the shadow comparator */
996 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
997 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
998 WRITEMASK_X),
999 shadow_comparator));
1000 inst->mlen++;
1001 }
1002
1003 /* Load the LOD info */
1004 if (op == ir_tex || op == ir_txl) {
1005 int mrf, writemask;
1006 if (devinfo->gen >= 5) {
1007 mrf = param_base + 1;
1008 if (shadow_comparator.file != BAD_FILE) {
1009 writemask = WRITEMASK_Y;
1010 /* mlen already incremented */
1011 } else {
1012 writemask = WRITEMASK_X;
1013 inst->mlen++;
1014 }
1015 } else /* devinfo->gen == 4 */ {
1016 mrf = param_base;
1017 writemask = WRITEMASK_W;
1018 }
1019 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1020 } else if (op == ir_txf) {
1021 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1022 } else if (op == ir_txf_ms) {
1023 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1024 sample_index));
1025 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1026 /* MCS data is stored in the first two channels of `mcs`, but we
1027 * need to get it into the .y and .z channels of the second vec4
1028 * of params.
1029 */
1030 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1031 emit(MOV(dst_reg(MRF, param_base + 1,
1032 glsl_type::uint_type, WRITEMASK_YZ),
1033 mcs));
1034 } else if (devinfo->gen >= 7) {
1035 /* MCS data is in the first channel of `mcs`, but we need to get it into
1036 * the .y channel of the second vec4 of params, so replicate .x across
1037 * the whole vec4 and then mask off everything except .y
1038 */
1039 mcs.swizzle = BRW_SWIZZLE_XXXX;
1040 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1041 mcs));
1042 }
1043 inst->mlen++;
1044 } else if (op == ir_txd) {
1045 const brw_reg_type type = lod.type;
1046
1047 if (devinfo->gen >= 5) {
1048 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1049 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1050 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1051 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1052 inst->mlen++;
1053
1054 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1055 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1056 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1057 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1058 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1059 inst->mlen++;
1060
1061 if (shadow_comparator.file != BAD_FILE) {
1062 emit(MOV(dst_reg(MRF, param_base + 2,
1063 shadow_comparator.type, WRITEMASK_Z),
1064 shadow_comparator));
1065 }
1066 }
1067 } else /* devinfo->gen == 4 */ {
1068 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1069 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1070 inst->mlen += 2;
1071 }
1072 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1073 if (shadow_comparator.file != BAD_FILE) {
1074 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1075 shadow_comparator));
1076 }
1077
1078 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1079 offset_value));
1080 inst->mlen++;
1081 }
1082 }
1083
1084 emit(inst);
1085
1086 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1087 * spec requires layers.
1088 */
1089 if (op == ir_txs && devinfo->gen < 7) {
1090 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1091 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1092 src_reg(inst->dst), brw_imm_d(1));
1093 }
1094
1095 if (devinfo->gen == 6 && op == ir_tg4) {
1096 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1097 }
1098
1099 if (op == ir_query_levels) {
1100 /* # levels is in .w */
1101 src_reg swizzled(dest);
1102 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1103 SWIZZLE_W, SWIZZLE_W);
1104 emit(MOV(dest, swizzled));
1105 }
1106 }
1107
1108 /**
1109 * Apply workarounds for Gen6 gather with UINT/SINT
1110 */
1111 void
1112 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1113 {
1114 if (!wa)
1115 return;
1116
1117 int width = (wa & WA_8BIT) ? 8 : 16;
1118 dst_reg dst_f = dst;
1119 dst_f.type = BRW_REGISTER_TYPE_F;
1120
1121 /* Convert from UNORM to UINT */
1122 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1123 emit(MOV(dst, src_reg(dst_f)));
1124
1125 if (wa & WA_SIGN) {
1126 /* Reinterpret the UINT value as a signed INT value by
1127 * shifting the sign bit into place, then shifting back
1128 * preserving sign.
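 *
 * Worked example for an 8-bit signed format: a gathered value of 1.0
 * becomes 255 after the UNORM->UINT multiply above, the SHL by 24 gives
 * 0xff000000, and the ASR by 24 sign-extends that back to -1.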
1129 */
1130 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1131 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1132 }
1133 }
1134
1135 void
1136 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1137 {
1138 unreachable("not reached");
1139 }
1140
1141 void
1142 vec4_visitor::gs_end_primitive()
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::emit_ndc_computation()
1149 {
1150 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1151 return;
1152
1153 /* Get the position */
1154 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1155
1156 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1157 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1158 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1159 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1160
1161 current_annotation = "NDC";
1162 dst_reg ndc_w = ndc;
1163 ndc_w.writemask = WRITEMASK_W;
1164 src_reg pos_w = pos;
1165 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1166 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1167
1168 dst_reg ndc_xyz = ndc;
1169 ndc_xyz.writemask = WRITEMASK_XYZ;
1170
1171 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1172 }
1173
1174 void
1175 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1176 {
1177 if (devinfo->gen < 6 &&
1178 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1179 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1180 devinfo->has_negative_rhw_bug)) {
1181 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1182 dst_reg header1_w = header1;
1183 header1_w.writemask = WRITEMASK_W;
1184
1185 emit(MOV(header1, brw_imm_ud(0u)));
1186
1187 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1188 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1189
1190 current_annotation = "Point size";
1191 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1192 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1193 }
1194
1195 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1196 current_annotation = "Clipping flags";
1197 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1198 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1199
1200 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1201 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1202 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1203
1204 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1205 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1206 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1207 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1208 }
1209
1210 /* i965 clipping workaround:
1211 * 1) Test for -ve rhw
1212 * 2) If set,
1213 * set ndc = (0,0,0,0)
1214 * set ucp[6] = 1
1215 *
1216 * Later, clipping will detect ucp[6] and ensure the primitive is
1217 * clipped against all fixed planes.
1218 */
1219 if (devinfo->has_negative_rhw_bug &&
1220 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1221 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1222 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1223 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1224 vec4_instruction *inst;
1225 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1226 inst->predicate = BRW_PREDICATE_NORMAL;
1227 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1228 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1229 inst->predicate = BRW_PREDICATE_NORMAL;
1230 }
1231
1232 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1233 } else if (devinfo->gen < 6) {
1234 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1235 } else {
1236 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1237 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1238 dst_reg reg_w = reg;
1239 reg_w.writemask = WRITEMASK_W;
1240 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1241 reg_as_src.type = reg_w.type;
1242 reg_as_src.swizzle = brw_swizzle_for_size(1);
1243 emit(MOV(reg_w, reg_as_src));
1244 }
1245 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1246 dst_reg reg_y = reg;
1247 reg_y.writemask = WRITEMASK_Y;
1248 reg_y.type = BRW_REGISTER_TYPE_D;
1249 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1250 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1251 }
1252 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1253 dst_reg reg_z = reg;
1254 reg_z.writemask = WRITEMASK_Z;
1255 reg_z.type = BRW_REGISTER_TYPE_D;
1256 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1257 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1258 }
1259 }
1260 }
1261
1262 vec4_instruction *
1263 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1264 {
1265 assert(varying < VARYING_SLOT_MAX);
1266
1267 unsigned num_comps = output_num_components[varying][component];
1268 if (num_comps == 0)
1269 return NULL;
1270
1271 assert(output_reg[varying][component].type == reg.type);
1272 current_annotation = output_reg_annotation[varying];
1273 if (output_reg[varying][component].file != BAD_FILE) {
1274 src_reg src = src_reg(output_reg[varying][component]);
1275 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1276 reg.writemask =
1277 brw_writemask_for_component_packing(num_comps, component);
1278 return emit(MOV(reg, src));
1279 }
1280 return NULL;
1281 }
1282
1283 void
1284 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1285 {
1286 reg.type = BRW_REGISTER_TYPE_F;
1287 output_reg[varying][0].type = reg.type;
1288
1289 switch (varying) {
1290 case VARYING_SLOT_PSIZ:
1291 {
1292 /* PSIZ is always in slot 0, and is coupled with other flags. */
1293 current_annotation = "indices, point width, clip flags";
1294 emit_psiz_and_flags(reg);
1295 break;
1296 }
1297 case BRW_VARYING_SLOT_NDC:
1298 current_annotation = "NDC";
1299 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1300 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1301 break;
1302 case VARYING_SLOT_POS:
1303 current_annotation = "gl_Position";
1304 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1305 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1306 break;
1307 case VARYING_SLOT_EDGE: {
1308 /* This is present when doing unfilled polygons. We're supposed to copy
1309 * the edge flag from the user-provided vertex array
1310 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1311 * of that attribute (starts as 1.0f). This is then used in clipping to
1312 * determine which edges should be drawn as wireframe.
1313 */
1314 current_annotation = "edge flag";
1315 int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1316 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1317 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1318 glsl_type::float_type, WRITEMASK_XYZW))));
1319 break;
1320 }
1321 case BRW_VARYING_SLOT_PAD:
1322 /* No need to write to this slot */
1323 break;
1324 default:
1325 for (int i = 0; i < 4; i++) {
1326 emit_generic_urb_slot(reg, varying, i);
1327 }
1328 break;
1329 }
1330 }
1331
1332 static int
1333 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1334 {
1335 if (devinfo->gen >= 6) {
1336 /* URB data written (does not include the message header reg) must
1337 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1338 * section 5.4.3.2.2: URB_INTERLEAVED.
1339 *
1340 * URB entries are allocated on a multiple of 1024 bits, so an
1341 * extra 128 bits written here to make the end align to 256 is
1342 * no problem.
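 *
 * For example, an mlen of 4 (one header register plus three data
 * registers) gets bumped to 5 so that the data portion becomes four
 * registers, i.e. a multiple of the required two-register granularity.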
1343 */
1344 if ((mlen % 2) != 1)
1345 mlen++;
1346 }
1347
1348 return mlen;
1349 }
1350
1351
1352 /**
1353 * Generates the VUE payload plus the necessary URB write instructions to
1354 * output it.
1355 *
1356 * The VUE layout is documented in Volume 2a.
1357 */
1358 void
1359 vec4_visitor::emit_vertex()
1360 {
1361 /* MRF 0 is reserved for the debugger, so start with message header
1362 * in MRF 1.
1363 */
1364 int base_mrf = 1;
1365 int mrf = base_mrf;
1366 /* In the process of generating our URB write message contents, we
1367 * may need to unspill a register or load from an array. Those
1368 * reads would use MRFs 14-15.
1369 */
1370 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1371
1372 /* The following assertion verifies that max_usable_mrf causes an
1373 * even-numbered amount of URB write data, which will meet gen6's
1374 * requirements for length alignment.
1375 */
1376 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1377
1378 /* First mrf is the g0-based message header containing URB handles and
1379 * such.
1380 */
1381 emit_urb_write_header(mrf++);
1382
1383 if (devinfo->gen < 6) {
1384 emit_ndc_computation();
1385 }
1386
1387 /* We may need to split this up into several URB writes, so do them in a
1388 * loop.
1389 */
1390 int slot = 0;
1391 bool complete = false;
1392 do {
1393 /* URB offset is in URB row increments, and each of our MRFs is half of
1394 * one of those, since we're doing interleaved writes.
1395 */
1396 int offset = slot / 2;
1397
1398 mrf = base_mrf + 1;
1399 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1400 emit_urb_slot(dst_reg(MRF, mrf++),
1401 prog_data->vue_map.slot_to_varying[slot]);
1402
1403 /* If this was max_usable_mrf, we can't fit anything more into this
1404 * URB WRITE. Same thing if we reached the maximum length available.
1405 */
1406 if (mrf > max_usable_mrf ||
1407 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1408 slot++;
1409 break;
1410 }
1411 }
1412
1413 complete = slot >= prog_data->vue_map.num_slots;
1414 current_annotation = "URB write";
1415 vec4_instruction *inst = emit_urb_write_opcode(complete);
1416 inst->base_mrf = base_mrf;
1417 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1418 inst->offset += offset;
1419 } while(!complete);
1420 }
1421
1422
1423 src_reg
1424 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1425 src_reg *reladdr, int reg_offset)
1426 {
1427 /* Because we store the values to scratch interleaved like our
1428 * vertex data, we need to scale the vec4 index by 2.
1429 */
1430 int message_header_scale = 2;
1431
1432 /* Pre-gen6, the message header uses byte offsets instead of vec4
1433 * (16-byte) offset units.
1434 */
1435 if (devinfo->gen < 6)
1436 message_header_scale *= 16;
1437
1438 if (reladdr) {
1439 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1440 * to multiply the reladdr by 2. Notice that the reg_offset part
1441 * is in units of 16 bytes and is used to select the low/high 16-byte
1442 * chunk of a full dvec4, so we don't want to multiply that part.
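 *
 * As a sketch of the resulting arithmetic: on gen6+ a vec4 access becomes
 * (reladdr + reg_offset) * 2, while a dvec4 access becomes
 * reladdr * 4 + reg_offset * 2; pre-gen6 the same values are additionally
 * scaled into byte units (a further factor of 16).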
1443 */
1444 src_reg index = src_reg(this, glsl_type::int_type);
1445 if (type_sz(inst->dst.type) < 8) {
1446 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1447 brw_imm_d(reg_offset)));
1448 emit_before(block, inst, MUL(dst_reg(index), index,
1449 brw_imm_d(message_header_scale)));
1450 } else {
1451 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1452 brw_imm_d(message_header_scale * 2)));
1453 emit_before(block, inst, ADD(dst_reg(index), index,
1454 brw_imm_d(reg_offset * message_header_scale)));
1455 }
1456 return index;
1457 } else {
1458 return brw_imm_d(reg_offset * message_header_scale);
1459 }
1460 }
1461
1462 /**
1463 * Emits an instruction before @inst to load the value named by @orig_src
1464 * from scratch space at @base_offset to @temp.
1465 *
1466 * @base_offset is measured in 32-byte units (the size of a register).
1467 */
1468 void
1469 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1470 dst_reg temp, src_reg orig_src,
1471 int base_offset)
1472 {
1473 assert(orig_src.offset % REG_SIZE == 0);
1474 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1475 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1476 reg_offset);
1477
1478 if (type_sz(orig_src.type) < 8) {
1479 emit_before(block, inst, SCRATCH_READ(temp, index));
1480 } else {
1481 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1482 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1483 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1484 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1485 vec4_instruction *last_read =
1486 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1487 emit_before(block, inst, last_read);
1488 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1489 }
1490 }
1491
1492 /**
1493 * Emits an instruction after @inst to store the value to be written
1494 * to @orig_dst to scratch space at @base_offset, from @temp.
1495 *
1496 * @base_offset is measured in 32-byte units (the size of a register).
1497 */
1498 void
1499 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1500 int base_offset)
1501 {
1502 assert(inst->dst.offset % REG_SIZE == 0);
1503 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1504 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1505 reg_offset);
1506
1507 /* Create a temporary register to store *inst's result in.
1508 *
1509 * We have to be careful in MOVing from our temporary result register in
1510 * the scratch write. If we swizzle from channels of the temporary that
1511 * weren't initialized, it will confuse live interval analysis, which will
1512 * make spilling fail to make progress.
1513 */
1514 bool is_64bit = type_sz(inst->dst.type) == 8;
1515 const glsl_type *alloc_type =
1516 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1517 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1518 inst->dst.type),
1519 brw_swizzle_for_mask(inst->dst.writemask));
1520
1521 if (!is_64bit) {
1522 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1523 inst->dst.writemask));
1524 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1525 if (inst->opcode != BRW_OPCODE_SEL)
1526 write->predicate = inst->predicate;
1527 write->ir = inst->ir;
1528 write->annotation = inst->annotation;
1529 inst->insert_after(block, write);
1530 } else {
1531 dst_reg shuffled = dst_reg(this, alloc_type);
1532 vec4_instruction *last =
1533 shuffle_64bit_data(shuffled, temp, true, block, inst);
1534 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1535
1536 uint8_t mask = 0;
1537 if (inst->dst.writemask & WRITEMASK_X)
1538 mask |= WRITEMASK_XY;
1539 if (inst->dst.writemask & WRITEMASK_Y)
1540 mask |= WRITEMASK_ZW;
1541 if (mask) {
1542 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1543
1544 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1545 if (inst->opcode != BRW_OPCODE_SEL)
1546 write->predicate = inst->predicate;
1547 write->ir = inst->ir;
1548 write->annotation = inst->annotation;
1549 last->insert_after(block, write);
1550 }
1551
1552 mask = 0;
1553 if (inst->dst.writemask & WRITEMASK_Z)
1554 mask |= WRITEMASK_XY;
1555 if (inst->dst.writemask & WRITEMASK_W)
1556 mask |= WRITEMASK_ZW;
1557 if (mask) {
1558 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1559
1560 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1561 reg_offset + 1);
1562 vec4_instruction *write =
1563 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1564 if (inst->opcode != BRW_OPCODE_SEL)
1565 write->predicate = inst->predicate;
1566 write->ir = inst->ir;
1567 write->annotation = inst->annotation;
1568 last->insert_after(block, write);
1569 }
1570 }
1571
1572 inst->dst.file = temp.file;
1573 inst->dst.nr = temp.nr;
1574 inst->dst.offset %= REG_SIZE;
1575 inst->dst.reladdr = NULL;
1576 }
1577
1578 /**
1579 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1580 * adds the scratch read(s) before \p inst. The function also checks for
1581 * recursive reladdr scratch accesses, issuing the corresponding scratch
1582 * loads and rewriting reladdr references accordingly.
1583 *
1584 * \return \p src if it did not require a scratch load, otherwise, the
1585 * register holding the result of the scratch load that the caller should
1586 * use to rewrite src.
1587 */
1588 src_reg
1589 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1590 vec4_instruction *inst, src_reg src)
1591 {
1592 /* Resolve recursive reladdr scratch access by calling ourselves
1593 * with src.reladdr
1594 */
1595 if (src.reladdr)
1596 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1597 *src.reladdr);
1598
1599 /* Now handle scratch access on src */
1600 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1601 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1602 glsl_type::dvec4_type : glsl_type::vec4_type);
1603 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1604 src.nr = temp.nr;
1605 src.offset %= REG_SIZE;
1606 src.reladdr = NULL;
1607 }
1608
1609 return src;
1610 }
1611
1612 /**
1613 * We can't generally support array access in GRF space, because a
1614 * single instruction's destination can only span 2 contiguous
1615 * registers. So, we send all GRF arrays that get variable index
1616 * access to scratch space.
1617 */
1618 void
1619 vec4_visitor::move_grf_array_access_to_scratch()
1620 {
1621 int scratch_loc[this->alloc.count];
1622 memset(scratch_loc, -1, sizeof(scratch_loc));
1623
1624 /* First, calculate the set of virtual GRFs that need to be punted
1625 * to scratch due to having any array access on them, and where in
1626 * scratch.
1627 */
1628 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1629 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1630 if (scratch_loc[inst->dst.nr] == -1) {
1631 scratch_loc[inst->dst.nr] = last_scratch;
1632 last_scratch += this->alloc.sizes[inst->dst.nr];
1633 }
1634
1635 for (src_reg *iter = inst->dst.reladdr;
1636 iter->reladdr;
1637 iter = iter->reladdr) {
1638 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1639 scratch_loc[iter->nr] = last_scratch;
1640 last_scratch += this->alloc.sizes[iter->nr];
1641 }
1642 }
1643 }
1644
1645 for (int i = 0 ; i < 3; i++) {
1646 for (src_reg *iter = &inst->src[i];
1647 iter->reladdr;
1648 iter = iter->reladdr) {
1649 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1650 scratch_loc[iter->nr] = last_scratch;
1651 last_scratch += this->alloc.sizes[iter->nr];
1652 }
1653 }
1654 }
1655 }
1656
1657 /* Now, for anything that will be accessed through scratch, rewrite
1658 * it to load/store. Note that this is a _safe list walk, because
1659 * we may generate a new scratch_write instruction after the one
1660 * we're processing.
1661 */
1662 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1663 /* Set up the annotation tracking for new generated instructions. */
1664 base_ir = inst->ir;
1665 current_annotation = inst->annotation;
1666
1667 /* First handle scratch access on the dst. Notice we have to handle
1668 * the case where the dst's reladdr also points to scratch space.
1669 */
1670 if (inst->dst.reladdr)
1671 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1672 *inst->dst.reladdr);
1673
1674 /* Now that we have handled any (possibly recursive) reladdr scratch
1675 * accesses for dst, we can safely do the scratch write for dst itself.
1676 */
1677 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1678 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1679
1680 /* Now handle scratch access on any src. In this case, since inst->src[i]
1681 * already is a src_reg, we can just call emit_resolve_reladdr with
1682 * inst->src[i] and it will take care of handling scratch loads for
1683 * both src and src.reladdr (recursively).
1684 */
1685 for (int i = 0 ; i < 3; i++) {
1686 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1687 inst->src[i]);
1688 }
1689 }
1690 }
1691
1692 /**
1693 * Emits an instruction before @inst to load the value named by @orig_src
1694 * from the pull constant buffer (surface) at @base_offset to @temp.
1695 */
1696 void
1697 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1698 dst_reg temp, src_reg orig_src,
1699 int base_offset, src_reg indirect)
1700 {
1701 assert(orig_src.offset % 16 == 0);
1702 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1703
1704 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1705 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1706 * that we emit the 32-bit loads into a temporary and we shuffle the result
1707 * into the original destination.
1708 */
1709 dst_reg orig_temp = temp;
1710 bool is_64bit = type_sz(orig_src.type) == 8;
1711 if (is_64bit) {
1712 assert(type_sz(temp.type) == 8);
1713 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1714 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1715 }
1716
1717 src_reg src = orig_src;
1718 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1719 int reg_offset = base_offset + src.offset / 16;
1720
1721 src_reg offset;
1722 if (indirect.file != BAD_FILE) {
1723 offset = src_reg(this, glsl_type::uint_type);
1724 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1725 brw_imm_ud(reg_offset * 16)));
1726 } else if (devinfo->gen >= 8) {
1727 /* Store the offset in a GRF so we can send-from-GRF. */
1728 offset = src_reg(this, glsl_type::uint_type);
1729 emit_before(block, inst, MOV(dst_reg(offset),
1730 brw_imm_ud(reg_offset * 16)));
1731 } else {
1732 offset = brw_imm_d(reg_offset * 16);
1733 }
1734
1735 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1736 brw_imm_ud(index),
1737 offset,
1738 block, inst);
1739
1740 src = byte_offset(src, 16);
1741 }
1742
1743 brw_mark_surface_used(&prog_data->base, index);
1744
1745 if (is_64bit) {
1746 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1747 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1748 }
1749 }
1750
1751 /**
1752 * Implements array access of uniforms by inserting a
1753 * PULL_CONSTANT_LOAD instruction.
1754 *
1755 * Unlike temporary GRF array access (where we don't support it due to
1756 * the difficulty of doing relative addressing on instruction
1757 * destinations), we could potentially do array access of uniforms
1758 * that were loaded in GRF space as push constants. In real-world
1759 * usage we've seen, though, the arrays being used are always larger
1760 * than we could load as push constants, so just always move all
1761 * uniform array access out to a pull constant buffer.
1762 */
1763 void
1764 vec4_visitor::move_uniform_array_access_to_pull_constants()
1765 {
1766 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1767 * everything has to be pushed regardless.
1768 */
1769 if (!compiler->supports_pull_constants) {
1770 split_uniform_registers();
1771 return;
1772 }
1773
1774 /* Allocate the pull_params array */
1775 assert(stage_prog_data->nr_pull_params == 0);
1776 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1777 this->uniforms * 4);
1778
1779 int pull_constant_loc[this->uniforms];
1780 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1781
1782 /* First, walk through the instructions and determine which things need to
1783 * be pulled. We mark something as needing to be pulled by setting
1784 * pull_constant_loc to 0.
1785 */
1786 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1787 /* We only care about MOV_INDIRECT of a uniform */
1788 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1789 inst->src[0].file != UNIFORM)
1790 continue;
1791
1792 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1793
1794 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1795 pull_constant_loc[uniform_nr + j] = 0;
1796 }
1797
1798 /* Next, we walk the list of uniforms and assign real pull constant
1799 * locations and set their corresponding entries in pull_param.
1800 */
1801 for (int j = 0; j < this->uniforms; j++) {
1802 if (pull_constant_loc[j] < 0)
1803 continue;
1804
1805 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1806
1807 for (int i = 0; i < 4; i++) {
1808 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1809 = stage_prog_data->param[j * 4 + i];
1810 }
1811 }
1812
1813 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1814 * instructions to actual uniform pulls.
1815 */
1816 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1817 /* We only care about MOV_INDIRECT of a uniform */
1818 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1819 inst->src[0].file != UNIFORM)
1820 continue;
1821
1822 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1823
1824 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1825
1826 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1827 pull_constant_loc[uniform_nr], inst->src[1]);
1828 inst->remove(block);
1829 }
1830
1831 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1832 * no need to track them as larger-than-vec4 objects. This will be
1833 * relied on in cutting out unused uniform vectors from push
1834 * constants.
1835 */
1836 split_uniform_registers();
1837 }
1838
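/* Replace a negated UD source with a temporary produced by an explicit MOV,
 * so that callers such as the gen6 IF and CMP helpers above read a plain
 * register instead of relying on the negate modifier on a UD operand.
 */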
1839 void
1840 vec4_visitor::resolve_ud_negate(src_reg *reg)
1841 {
1842 if (reg->type != BRW_REGISTER_TYPE_UD ||
1843 !reg->negate)
1844 return;
1845
1846 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1847 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1848 *reg = temp;
1849 }
1850
1851 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1852 void *log_data,
1853 const struct brw_sampler_prog_key_data *key_tex,
1854 struct brw_vue_prog_data *prog_data,
1855 const nir_shader *shader,
1856 void *mem_ctx,
1857 bool no_spills,
1858 int shader_time_index)
1859 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1860 key_tex(key_tex),
1861 prog_data(prog_data),
1862 fail_msg(NULL),
1863 first_non_payload_grf(0),
1864 need_all_constants_in_pull_buffer(false),
1865 no_spills(no_spills),
1866 shader_time_index(shader_time_index),
1867 last_scratch(0)
1868 {
1869 this->failed = false;
1870
1871 this->base_ir = NULL;
1872 this->current_annotation = NULL;
1873 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1874
1875 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1876
1877 this->virtual_grf_start = NULL;
1878 this->virtual_grf_end = NULL;
1879 this->live_intervals = NULL;
1880
1881 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1882
1883 this->uniforms = 0;
1884 }
1885
1886
1887 void
1888 vec4_visitor::fail(const char *format, ...)
1889 {
1890 va_list va;
1891 char *msg;
1892
1893 if (failed)
1894 return;
1895
1896 failed = true;
1897
1898 va_start(va, format);
1899 msg = ralloc_vasprintf(mem_ctx, format, va);
1900 va_end(va);
1901 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1902
1903 this->fail_msg = msg;
1904
1905 if (debug_enabled) {
1906 fprintf(stderr, "%s", msg);
1907 }
1908 }
1909
1910 } /* namespace brw */