src/intel/compiler/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_eu.h"
  27
  28 namespace brw {
  29
  30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  31                                    const src_reg &src0, const src_reg &src1,
  32                                    const src_reg &src2)
  33 {
  34    this->opcode = opcode;
  35    this->dst = dst;
  36    this->src[0] = src0;
  37    this->src[1] = src1;
  38    this->src[2] = src2;
  39    this->saturate = false;
  40    this->force_writemask_all = false;
  41    this->no_dd_clear = false;
  42    this->no_dd_check = false;
  43    this->writes_accumulator = false;
  44    this->conditional_mod = BRW_CONDITIONAL_NONE;
  45    this->predicate = BRW_PREDICATE_NONE;
  46    this->predicate_inverse = false;
  47    this->target = 0;
  48    this->shadow_compare = false;
  49    this->eot = false;
  50    this->ir = NULL;
  51    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  52    this->header_size = 0;
  53    this->flag_subreg = 0;
  54    this->mlen = 0;
  55    this->base_mrf = 0;
  56    this->offset = 0;
  57    this->exec_size = 8;
  58    this->group = 0;
  59    this->size_written = (dst.file == BAD_FILE ?
  60                          0 : this->exec_size * type_sz(dst.type));
  61    this->annotation = NULL;
  62 }
  63
  64 vec4_instruction *
  65 vec4_visitor::emit(vec4_instruction *inst)
  66 {
  67    inst->ir = this->base_ir;
  68    inst->annotation = this->current_annotation;
  69
  70    this->instructions.push_tail(inst);
  71
  72    return inst;
  73 }
  74
  75 vec4_instruction *
  76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  77                           vec4_instruction *new_inst)
  78 {
  79    new_inst->ir = inst->ir;
  80    new_inst->annotation = inst->annotation;
  81
  82    inst->insert_before(block, new_inst);
  83
  84    return inst;
  85 }
  86
  87 vec4_instruction *
  88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  89                    const src_reg &src1, const src_reg &src2)
  90 {
  91    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  92 }
  93
  94
  95 vec4_instruction *
  96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  97                    const src_reg &src1)
  98 {
  99    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
 100 }
 101
 102 vec4_instruction *
 103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 104 {
 105    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 106 }
 107
 108 vec4_instruction *
 109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 110 {
 111    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 112 }
 113
 114 vec4_instruction *
 115 vec4_visitor::emit(enum opcode opcode)
 116 {
 117    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 118 }
 119
 120 #define ALU1(op)                                                        \
 121    vec4_instruction *                                                   \
 122    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 123    {                                                                    \
 124       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 125    }
 126
 127 #define ALU2(op)                                                        \
 128    vec4_instruction *                                                   \
 129    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 130                     const src_reg &src1)                                \
 131    {                                                                    \
 132       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 133                                            src0, src1);                 \
 134    }
 135
 136 #define ALU2_ACC(op)                                                    \
 137    vec4_instruction *                                                   \
 138    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 139                     const src_reg &src1)                                \
 140    {                                                                    \
 141       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 142                        BRW_OPCODE_##op, dst, src0, src1);               \
 143       inst->writes_accumulator = true;                                  \
 144       return inst;                                                      \
 145    }
 146
 147 #define ALU3(op)                                                        \
 148    vec4_instruction *                                                   \
 149    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 150                     const src_reg &src1, const src_reg &src2)           \
 151    {                                                                    \
 152       assert(devinfo->gen >= 6);                                                \
 153       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 154                                            src0, src1, src2);           \
 155    }
 156
/* Instantiate the vec4_visitor instruction-builder methods.  Each generated
 * method only allocates and returns the instruction; the caller is
 * responsible for passing it to emit().  The ALU2_ACC variants also mark the
 * instruction as writing the accumulator.
 */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)
 190
 191 /** Gen4 predicated IF. */
 192 vec4_instruction *
 193 vec4_visitor::IF(enum brw_predicate predicate)
 194 {
 195    vec4_instruction *inst;
 196
 197    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 198    inst->predicate = predicate;
 199
 200    return inst;
 201 }
 202
 203 /** Gen6 IF with embedded comparison. */
 204 vec4_instruction *
 205 vec4_visitor::IF(src_reg src0, src_reg src1,
 206                  enum brw_conditional_mod condition)
 207 {
 208    assert(devinfo->gen == 6);
 209
 210    vec4_instruction *inst;
 211
 212    resolve_ud_negate(&src0);
 213    resolve_ud_negate(&src1);
 214
 215    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 216                                         src0, src1);
 217    inst->conditional_mod = condition;
 218
 219    return inst;
 220 }
 221
 222 /**
 223  * CMP: Sets the low bit of the destination channels with the result
 224  * of the comparison, while the upper bits are undefined, and updates
 225  * the flag register with the packed 16 bits of the result.
 226  */
 227 vec4_instruction *
 228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 229                   enum brw_conditional_mod condition)
 230 {
 231    vec4_instruction *inst;
 232
 233    /* Take the instruction:
 234     *
 235     * CMP null<d> src0<f> src1<f>
 236     *
 237     * Original gen4 does type conversion to the destination type before
 238     * comparison, producing garbage results for floating point comparisons.
 239     *
 240     * The destination type doesn't matter on newer generations, so we set the
 241     * type to match src0 so we can compact the instruction.
 242     */
 243    dst.type = src0.type;
 244
 245    resolve_ud_negate(&src0);
 246    resolve_ud_negate(&src1);
 247
 248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 249    inst->conditional_mod = condition;
 250
 251    return inst;
 252 }
 253
 254 vec4_instruction *
 255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 256 {
 257    vec4_instruction *inst;
 258
 259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 260                                         dst, index);
 261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 262    inst->mlen = 2;
 263
 264    return inst;
 265 }
 266
 267 vec4_instruction *
 268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 269                             const src_reg &index)
 270 {
 271    vec4_instruction *inst;
 272
 273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 274                                         dst, src, index);
 275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 276    inst->mlen = 3;
 277
 278    return inst;
 279 }
 280
 281 src_reg
 282 vec4_visitor::fix_3src_operand(const src_reg &src)
 283 {
 284    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 285     * able to use vertical stride of zero to replicate the vec4 uniform, like
 286     *
 287     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 288     *
 289     * But you can't, since vertical stride is always four in three-source
 290     * instructions. Instead, insert a MOV instruction to do the replication so
 291     * that the three-source instruction can consume it.
 292     */
 293
 294    /* The MOV is only needed if the source is a uniform or immediate. */
 295    if (src.file != UNIFORM && src.file != IMM)
 296       return src;
 297
 298    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 299       return src;
 300
 301    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 302    expanded.type = src.type;
 303    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 304    return src_reg(expanded);
 305 }
 306
 307 src_reg
 308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 309 {
 310    if (!src.abs && !src.negate)
 311       return src;
 312
 313    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 314    resolved.type = src.type;
 315    emit(MOV(resolved, src));
 316
 317    return src_reg(resolved);
 318 }
 319
 320 src_reg
 321 vec4_visitor::fix_math_operand(const src_reg &src)
 322 {
 323    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 324       return src;
 325
 326    /* The gen6 math instruction ignores the source modifiers --
 327     * swizzle, abs, negate, and at least some parts of the register
 328     * region description.
 329     *
 330     * Rather than trying to enumerate all these cases, *always* expand the
 331     * operand to a temp GRF for gen6.
 332     *
 333     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 334     * can't use.
 335     */
 336
 337    if (devinfo->gen == 7 && src.file != IMM)
 338       return src;
 339
 340    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 341    expanded.type = src.type;
 342    emit(MOV(expanded, src));
 343    return src_reg(expanded);
 344 }
 345
 346 vec4_instruction *
 347 vec4_visitor::emit_math(enum opcode opcode,
 348                         const dst_reg &dst,
 349                         const src_reg &src0, const src_reg &src1)
 350 {
 351    vec4_instruction *math =
 352       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 353
 354    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 355       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 356       math->dst = dst_reg(this, glsl_type::vec4_type);
 357       math->dst.type = dst.type;
 358       math = emit(MOV(dst, src_reg(math->dst)));
 359    } else if (devinfo->gen < 6) {
 360       math->base_mrf = 1;
 361       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 362    }
 363
 364    return math;
 365 }
 366
 367 void
 368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 369 {
 370    if (devinfo->gen < 7) {
 371       unreachable("ir_unop_pack_half_2x16 should be lowered");
 372    }
 373
 374    assert(dst.type == BRW_REGISTER_TYPE_UD);
 375    assert(src0.type == BRW_REGISTER_TYPE_F);
 376
 377    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 378     *
 379     *   Because this instruction does not have a 16-bit floating-point type,
 380     *   the destination data type must be Word (W).
 381     *
 382     *   The destination must be DWord-aligned and specify a horizontal stride
 383     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 384     *   each destination channel and the upper word is not modified.
 385     *
 386     * The above restriction implies that the f32to16 instruction must use
 387     * align1 mode, because only in align1 mode is it possible to specify
 388     * horizontal stride.  We choose here to defy the hardware docs and emit
 389     * align16 instructions.
 390     *
 391     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 392     * instructions. I was partially successful in that the code passed all
 393     * tests.  However, the code was dubiously correct and fragile, and the
 394     * tests were not harsh enough to probe that frailty. Not trusting the
 395     * code, I chose instead to remain in align16 mode in defiance of the hw
 396     * docs).
 397     *
 398     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 399     * simulator, emitting a f32to16 in align16 mode with UD as destination
 400     * data type is safe. The behavior differs from that specified in the PRM
 401     * in that the upper word of each destination channel is cleared to 0.
 402     */
 403
 404    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 405    src_reg tmp_src(tmp_dst);
 406
 407 #if 0
 408    /* Verify the undocumented behavior on which the following instructions
 409     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 410     * then the result of the bit-or instruction below will be incorrect.
 411     *
 412     * You should inspect the disasm output in order to verify that the MOV is
 413     * not optimized away.
 414     */
 415    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 416 #endif
 417
 418    /* Give tmp the form below, where "." means untouched.
 419     *
 420     *     w z          y          x w z          y          x
 421     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 422     *
 423     * That the upper word of each write-channel be 0 is required for the
 424     * following bit-shift and bit-or instructions to work. Note that this
 425     * relies on the undocumented hardware behavior mentioned above.
 426     */
 427    tmp_dst.writemask = WRITEMASK_XY;
 428    emit(F32TO16(tmp_dst, src0));
 429
 430    /* Give the write-channels of dst the form:
 431     *   0xhhhh0000
 432     */
 433    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 434    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 435
 436    /* Finally, give the write-channels of dst the form of packHalf2x16's
 437     * output:
 438     *   0xhhhhllll
 439     */
 440    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 441    emit(OR(dst, src_reg(dst), tmp_src));
 442 }
 443
 444 void
 445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 446 {
 447    if (devinfo->gen < 7) {
 448       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 449    }
 450
 451    assert(dst.type == BRW_REGISTER_TYPE_F);
 452    assert(src0.type == BRW_REGISTER_TYPE_UD);
 453
 454    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 455     *
 456     *   Because this instruction does not have a 16-bit floating-point type,
 457     *   the source data type must be Word (W). The destination type must be
 458     *   F (Float).
 459     *
 460     * To use W as the source data type, we must adjust horizontal strides,
 461     * which is only possible in align1 mode. All my [chadv] attempts at
 462     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 463     * Piglit tests, so I gave up.
 464     *
 465     * I've verified that, on gen7 hardware and the simulator, it is safe to
 466     * emit f16to32 in align16 mode with UD as source data type.
 467     */
 468
 469    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 470    src_reg tmp_src(tmp_dst);
 471
 472    tmp_dst.writemask = WRITEMASK_X;
 473    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 474
 475    tmp_dst.writemask = WRITEMASK_Y;
 476    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 477
 478    dst.writemask = WRITEMASK_XY;
 479    emit(F16TO32(dst, tmp_src));
 480 }
 481
 482 void
 483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 484 {
 485    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 486     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 487     * is not suitable to generate the shift values, but we can use the packed
 488     * vector float and a type-converting MOV.
 489     */
 490    dst_reg shift(this, glsl_type::uvec4_type);
 491    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 492
 493    dst_reg shifted(this, glsl_type::uvec4_type);
 494    src0.swizzle = BRW_SWIZZLE_XXXX;
 495    emit(SHR(shifted, src0, src_reg(shift)));
 496
 497    shifted.type = BRW_REGISTER_TYPE_UB;
 498    dst_reg f(this, glsl_type::vec4_type);
 499    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 500
 501    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 502 }
 503
 504 void
 505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 506 {
 507    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 508     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 509     * is not suitable to generate the shift values, but we can use the packed
 510     * vector float and a type-converting MOV.
 511     */
 512    dst_reg shift(this, glsl_type::uvec4_type);
 513    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 514
 515    dst_reg shifted(this, glsl_type::uvec4_type);
 516    src0.swizzle = BRW_SWIZZLE_XXXX;
 517    emit(SHR(shifted, src0, src_reg(shift)));
 518
 519    shifted.type = BRW_REGISTER_TYPE_B;
 520    dst_reg f(this, glsl_type::vec4_type);
 521    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 522
 523    dst_reg scaled(this, glsl_type::vec4_type);
 524    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 525
 526    dst_reg max(this, glsl_type::vec4_type);
 527    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 528    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 529 }
 530
 531 void
 532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 533 {
 534    dst_reg saturated(this, glsl_type::vec4_type);
 535    vec4_instruction *inst = emit(MOV(saturated, src0));
 536    inst->saturate = true;
 537
 538    dst_reg scaled(this, glsl_type::vec4_type);
 539    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 540
 541    dst_reg rounded(this, glsl_type::vec4_type);
 542    emit(RNDE(rounded, src_reg(scaled)));
 543
 544    dst_reg u(this, glsl_type::uvec4_type);
 545    emit(MOV(u, src_reg(rounded)));
 546
 547    src_reg bytes(u);
 548    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 549 }
 550
 551 void
 552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 553 {
 554    dst_reg max(this, glsl_type::vec4_type);
 555    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 556
 557    dst_reg min(this, glsl_type::vec4_type);
 558    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 559
 560    dst_reg scaled(this, glsl_type::vec4_type);
 561    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 562
 563    dst_reg rounded(this, glsl_type::vec4_type);
 564    emit(RNDE(rounded, src_reg(scaled)));
 565
 566    dst_reg i(this, glsl_type::ivec4_type);
 567    emit(MOV(i, src_reg(rounded)));
 568
 569    src_reg bytes(i);
 570    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 571 }
 572
 573 /*
 574  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 575  * false) elements needed to pack a type.
 576  */
 577 static int
 578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
 579 {
 580    unsigned int i;
 581    int size;
 582
 583    switch (type->base_type) {
 584    case GLSL_TYPE_UINT:
 585    case GLSL_TYPE_INT:
 586    case GLSL_TYPE_FLOAT:
 587    case GLSL_TYPE_FLOAT16:
 588    case GLSL_TYPE_BOOL:
 589    case GLSL_TYPE_DOUBLE:
 590    case GLSL_TYPE_UINT16:
 591    case GLSL_TYPE_INT16:
 592    case GLSL_TYPE_UINT64:
 593    case GLSL_TYPE_INT64:
 594       if (type->is_matrix()) {
 595          const glsl_type *col_type = type->column_type();
 596          unsigned col_slots =
 597             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
 598          return type->matrix_columns * col_slots;
 599       } else {
 600          /* Regardless of size of vector, it gets a vec4. This is bad
 601           * packing for things like floats, but otherwise arrays become a
 602           * mess.  Hopefully a later pass over the code can pack scalars
 603           * down if appropriate.
 604           */
 605          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
 606       }
 607    case GLSL_TYPE_ARRAY:
 608       assert(type->length > 0);
 609       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
 610    case GLSL_TYPE_STRUCT:
 611       size = 0;
 612       for (i = 0; i < type->length; i++) {
 613          size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
 614       }
 615       return size;
 616    case GLSL_TYPE_SUBROUTINE:
 617       return 1;
 618
 619    case GLSL_TYPE_SAMPLER:
 620       /* Samplers take up no register space, since they're baked in at
 621        * link time.
 622        */
 623       return 0;
 624    case GLSL_TYPE_ATOMIC_UINT:
 625       return 0;
 626    case GLSL_TYPE_IMAGE:
 627       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 628    case GLSL_TYPE_VOID:
 629    case GLSL_TYPE_ERROR:
 630    case GLSL_TYPE_INTERFACE:
 631    case GLSL_TYPE_FUNCTION:
 632       unreachable("not reached");
 633    }
 634
 635    return 0;
 636 }
 637
 638 /**
 639  * Returns the minimum number of vec4 elements needed to pack a type.
 640  *
 641  * For simple types, it will return 1 (a single vec4); for matrices, the
 642  * number of columns; for array and struct, the sum of the vec4_size of
 643  * each of its elements; and for sampler and atomic, zero.
 644  *
 645  * This method is useful to calculate how much register space is needed to
 646  * store a particular type.
 647  */
 648 extern "C" int
 649 type_size_vec4(const struct glsl_type *type)
 650 {
 651    return type_size_xvec4(type, true);
 652 }
 653
 654 /**
 655  * Returns the minimum number of dvec4 elements needed to pack a type.
 656  *
 657  * For simple types, it will return 1 (a single dvec4); for matrices, the
 658  * number of columns; for array and struct, the sum of the dvec4_size of
 659  * each of its elements; and for sampler and atomic, zero.
 660  *
 661  * This method is useful to calculate how much register space is needed to
 662  * store a particular type.
 663  *
 664  * Measuring double-precision vertex inputs as dvec4 is required because
 665  * ARB_vertex_attrib_64bit states that these uses the same number of locations
 666  * than the single-precision version. That is, two consecutives dvec4 would be
 667  * located in location "x" and location "x+1", not "x+2".
 668  *
 669  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 670  * remap_vs_attrs() will take in account both the location and also if the
 671  * type fits in one or two vec4 slots.
 672  */
 673 extern "C" int
 674 type_size_dvec4(const struct glsl_type *type)
 675 {
 676    return type_size_xvec4(type, false);
 677 }
 678
 679 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 680 {
 681    init();
 682
 683    this->file = VGRF;
 684    this->nr = v->alloc.allocate(type_size_vec4(type));
 685
 686    if (type->is_array() || type->is_record()) {
 687       this->swizzle = BRW_SWIZZLE_NOOP;
 688    } else {
 689       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 690    }
 691
 692    this->type = brw_type_for_base_type(type);
 693 }
 694
 695 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 696 {
 697    assert(size > 0);
 698
 699    init();
 700
 701    this->file = VGRF;
 702    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
 703
 704    this->swizzle = BRW_SWIZZLE_NOOP;
 705
 706    this->type = brw_type_for_base_type(type);
 707 }
 708
 709 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 710 {
 711    init();
 712
 713    this->file = VGRF;
 714    this->nr = v->alloc.allocate(type_size_vec4(type));
 715
 716    if (type->is_array() || type->is_record()) {
 717       this->writemask = WRITEMASK_XYZW;
 718    } else {
 719       this->writemask = (1 << type->vector_elements) - 1;
 720    }
 721
 722    this->type = brw_type_for_base_type(type);
 723 }
 724
 725 vec4_instruction *
 726 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 727                           src_reg src0, src_reg src1)
 728 {
 729    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 730    inst->conditional_mod = conditionalmod;
 731    return inst;
 732 }
 733
 734 vec4_instruction *
 735 vec4_visitor::emit_lrp(const dst_reg &dst,
 736                        const src_reg &x, const src_reg &y, const src_reg &a)
 737 {
 738    if (devinfo->gen >= 6) {
 739       /* Note that the instruction's argument order is reversed from GLSL
 740        * and the IR.
 741        */
 742      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 743                      fix_3src_operand(x)));
 744    } else {
 745       /* Earlier generations don't support three source operations, so we
 746        * need to emit x*(1-a) + y*a.
 747        */
 748       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 749       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 750       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 751       y_times_a.writemask           = dst.writemask;
 752       one_minus_a.writemask         = dst.writemask;
 753       x_times_one_minus_a.writemask = dst.writemask;
 754
 755       emit(MUL(y_times_a, y, a));
 756       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
 757       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 758       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 759    }
 760 }
 761
 762 /**
 763  * Emits the instructions needed to perform a pull constant load. before_block
 764  * and before_inst can be NULL in which case the instruction will be appended
 765  * to the end of the instruction list.
 766  */
 767 void
 768 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 769                                           src_reg surf_index,
 770                                           src_reg offset_reg,
 771                                           bblock_t *before_block,
 772                                           vec4_instruction *before_inst)
 773 {
 774    assert((before_inst == NULL && before_block == NULL) ||
 775           (before_inst && before_block));
 776
 777    vec4_instruction *pull;
 778
 779    if (devinfo->gen >= 9) {
 780       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 781       src_reg header(this, glsl_type::uvec4_type, 2);
 782
 783       pull = new(mem_ctx)
 784          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 785                           dst_reg(header));
 786
 787       if (before_inst)
 788          emit_before(before_block, before_inst, pull);
 789       else
 790          emit(pull);
 791
 792       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
 793                                  offset_reg.type);
 794       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 795
 796       if (before_inst)
 797          emit_before(before_block, before_inst, pull);
 798       else
 799          emit(pull);
 800
 801       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 802                                            dst,
 803                                            surf_index,
 804                                            header);
 805       pull->mlen = 2;
 806       pull->header_size = 1;
 807    } else if (devinfo->gen >= 7) {
 808       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
 809
 810       grf_offset.type = offset_reg.type;
 811
 812       pull = MOV(grf_offset, offset_reg);
 813
 814       if (before_inst)
 815          emit_before(before_block, before_inst, pull);
 816       else
 817          emit(pull);
 818
 819       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 820                                            dst,
 821                                            surf_index,
 822                                            src_reg(grf_offset));
 823       pull->mlen = 1;
 824    } else {
 825       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 826                                            dst,
 827                                            surf_index,
 828                                            offset_reg);
 829       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 830       pull->mlen = 1;
 831    }
 832
 833    if (before_inst)
 834       emit_before(before_block, before_inst, pull);
 835    else
 836       emit(pull);
 837 }
 838
 839 src_reg
 840 vec4_visitor::emit_uniformize(const src_reg &src)
 841 {
 842    const src_reg chan_index(this, glsl_type::uint_type);
 843    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 844                               src.type);
 845
 846    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 847       ->force_writemask_all = true;
 848    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 849       ->force_writemask_all = true;
 850
 851    return src_reg(dst);
 852 }
 853
 854 src_reg
 855 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 856                              src_reg coordinate, src_reg surface)
 857 {
 858    vec4_instruction *inst =
 859       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 860                                     dst_reg(this, glsl_type::uvec4_type));
 861    inst->base_mrf = 2;
 862    inst->src[1] = surface;
 863    inst->src[2] = surface;
 864
 865    int param_base;
 866
 867    if (devinfo->gen >= 9) {
 868       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 869       vec4_instruction *header_inst = new(mem_ctx)
 870          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 871                           dst_reg(MRF, inst->base_mrf));
 872
 873       emit(header_inst);
 874
 875       inst->mlen = 2;
 876       inst->header_size = 1;
 877       param_base = inst->base_mrf + 1;
 878    } else {
 879       inst->mlen = 1;
 880       param_base = inst->base_mrf;
 881    }
 882
 883    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 884    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 885    int zero_mask = 0xf & ~coord_mask;
 886
 887    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 888             coordinate));
 889
 890    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 891             brw_imm_d(0)));
 892
 893    emit(inst);
 894    return src_reg(inst->dst);
 895 }
 896
 897 bool
 898 vec4_visitor::is_high_sampler(src_reg sampler)
 899 {
 900    if (devinfo->gen < 8 && !devinfo->is_haswell)
 901       return false;
 902
 903    return sampler.file != IMM || sampler.ud >= 16;
 904 }
 905
/**
 * Emit a sampler message implementing texture operation \p op.
 *
 * The message payload is assembled with MOVs into MRFs starting at
 * base_mrf 2 (an optional one-register header first, then the parameters),
 * the texture instruction is emitted after its payload, and finally any
 * per-generation fixups are applied to the result.
 *
 * \param op                 texture operation.  ir_txb and ir_lod are
 *                           unreachable here; ir_samples_identical just
 *                           writes 0 to \p dest and returns.
 * \param dest               destination of the texture instruction.
 * \param dest_type          only consulted for ir_txd, where a 3-element
 *                           type requires a second gradient register.
 * \param coordinate         texture coordinate.
 * \param coord_components   number of valid channels in \p coordinate; the
 *                           remaining channels of that MRF are zeroed.
 * \param shadow_comparator  shadow reference value, or BAD_FILE if none.
 * \param lod                LOD for ir_tex/ir_txl/ir_txf/ir_txs/
 *                           ir_query_levels; first gradient for ir_txd.
 * \param lod2               second gradient, used by ir_txd only.
 * \param sample_index       sample index, used by ir_txf_ms only.
 * \param constant_offset    stored in inst->offset; a non-zero value forces
 *                           a message header.
 * \param offset_value       non-constant offset for ir_tg4, or BAD_FILE.
 * \param mcs                MCS data for ir_txf_ms.
 * \param surface            index into key_tex->gen6_gather_wa for the
 *                           Gen6 gather workaround.
 * \param surface_reg        stored in src[1] of the texture instruction.
 * \param sampler_reg        stored in src[2] of the texture instruction.
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   /* Map the IR texture operation onto a backend opcode. */
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just always return false.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   /* The instruction is created now so its fields can be filled in while
    * the payload is built, but it is only emitted once the payload MOVs
    * are in place.
    */
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   /* mlen starts at the header size and grows with each parameter MRF. */
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      /* Size queries take only the LOD: in .w on Gen4, in .x otherwise. */
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      /* No parameters; only the .x channel of the result is written. */
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      /* Zero whichever channels of the coordinate MRF were not written. */
      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator.  ir_txd and ir_tg4 with a non-constant
       * offset place it elsewhere, in their own branches below.
       */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            /* The LOD shares the second parameter MRF with the shadow
             * comparator when there is one.
             */
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         /* The LOD goes in .w of the coordinate MRF. */
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of `mcs`, but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            /* Interleave the x/y components of the two gradients in the
             * second parameter MRF: lod into .x/.z, lod2 into .y/.w.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            /* A third MRF carries the z components of the gradients and,
             * when present, the shadow comparator in .z.
             */
            if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            /* Each gradient gets a parameter MRF of its own. */
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         /* Gather with a non-constant offset: the shadow comparator goes in
          * .w of the coordinate MRF and the offset in .xy of the next one.
          */
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* Fix up the layer count (z) returned by a size query: clamp it to a
    * minimum of 1.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w; replicate it into every channel of dest. */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}
1110
1111 /**
1112  * Apply workarounds for Gen6 gather with UINT/SINT
1113  */
1114 void
1115 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1116 {
1117    if (!wa)
1118       return;
1119
1120    int width = (wa & WA_8BIT) ? 8 : 16;
1121    dst_reg dst_f = dst;
1122    dst_f.type = BRW_REGISTER_TYPE_F;
1123
1124    /* Convert from UNORM to UINT */
1125    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1126    emit(MOV(dst, src_reg(dst_f)));
1127
1128    if (wa & WA_SIGN) {
1129       /* Reinterpret the UINT value as a signed INT value by
1130        * shifting the sign bit into place, then shifting back
1131        * preserving sign.
1132        */
1133       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1134       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1135    }
1136 }
1137
/**
 * Geometry-shader vertex emission hook.
 *
 * This implementation must never run: reaching it trips unreachable().  The
 * stream id argument is ignored.
 * NOTE(review): presumably overridden by the geometry shader visitor --
 * confirm against the class declaration.
 */
void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}
1143
/**
 * Geometry-shader end-of-primitive hook.
 *
 * This implementation must never run: reaching it trips unreachable().
 * NOTE(review): presumably overridden by the geometry shader visitor --
 * confirm against the class declaration.
 */
void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}
1149
1150 void
1151 vec4_visitor::emit_ndc_computation()
1152 {
1153    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1154       return;
1155
1156    /* Get the position */
1157    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1158
1159    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1160    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1161    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1162    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1163
1164    current_annotation = "NDC";
1165    dst_reg ndc_w = ndc;
1166    ndc_w.writemask = WRITEMASK_W;
1167    src_reg pos_w = pos;
1168    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1169    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1170
1171    dst_reg ndc_xyz = ndc;
1172    ndc_xyz.writemask = WRITEMASK_XYZ;
1173
1174    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1175 }
1176
1177 void
1178 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1179 {
1180    if (devinfo->gen < 6 &&
1181        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1182         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1183         devinfo->has_negative_rhw_bug)) {
1184       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1185       dst_reg header1_w = header1;
1186       header1_w.writemask = WRITEMASK_W;
1187
1188       emit(MOV(header1, brw_imm_ud(0u)));
1189
1190       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1191          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1192
1193          current_annotation = "Point size";
1194          emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1195          emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1196       }
1197
1198       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1199          current_annotation = "Clipping flags";
1200          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1201          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1202
1203          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1204          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1205          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1206
1207          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1209          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1210          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1211       }
1212
1213       /* i965 clipping workaround:
1214        * 1) Test for -ve rhw
1215        * 2) If set,
1216        *      set ndc = (0,0,0,0)
1217        *      set ucp[6] = 1
1218        *
1219        * Later, clipping will detect ucp[6] and ensure the primitive is
1220        * clipped against all fixed planes.
1221        */
1222       if (devinfo->has_negative_rhw_bug &&
1223           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1224          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1225          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1226          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1227          vec4_instruction *inst;
1228          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1229          inst->predicate = BRW_PREDICATE_NORMAL;
1230          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1231          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1232          inst->predicate = BRW_PREDICATE_NORMAL;
1233       }
1234
1235       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1236    } else if (devinfo->gen < 6) {
1237       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1238    } else {
1239       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1240       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1241          dst_reg reg_w = reg;
1242          reg_w.writemask = WRITEMASK_W;
1243          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1244          reg_as_src.type = reg_w.type;
1245          reg_as_src.swizzle = brw_swizzle_for_size(1);
1246          emit(MOV(reg_w, reg_as_src));
1247       }
1248       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1249          dst_reg reg_y = reg;
1250          reg_y.writemask = WRITEMASK_Y;
1251          reg_y.type = BRW_REGISTER_TYPE_D;
1252          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1253          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1254       }
1255       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1256          dst_reg reg_z = reg;
1257          reg_z.writemask = WRITEMASK_Z;
1258          reg_z.type = BRW_REGISTER_TYPE_D;
1259          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1260          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1261       }
1262    }
1263 }
1264
1265 vec4_instruction *
1266 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1267 {
1268    assert(varying < VARYING_SLOT_MAX);
1269
1270    unsigned num_comps = output_num_components[varying][component];
1271    if (num_comps == 0)
1272       return NULL;
1273
1274    assert(output_reg[varying][component].type == reg.type);
1275    current_annotation = output_reg_annotation[varying];
1276    if (output_reg[varying][component].file != BAD_FILE) {
1277       src_reg src = src_reg(output_reg[varying][component]);
1278       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1279       reg.writemask =
1280          brw_writemask_for_component_packing(num_comps, component);
1281       return emit(MOV(reg, src));
1282    }
1283    return NULL;
1284 }
1285
1286 void
1287 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1288 {
1289    reg.type = BRW_REGISTER_TYPE_F;
1290    output_reg[varying][0].type = reg.type;
1291
1292    switch (varying) {
1293    case VARYING_SLOT_PSIZ:
1294    {
1295       /* PSIZ is always in slot 0, and is coupled with other flags. */
1296       current_annotation = "indices, point width, clip flags";
1297       emit_psiz_and_flags(reg);
1298       break;
1299    }
1300    case BRW_VARYING_SLOT_NDC:
1301       current_annotation = "NDC";
1302       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1303          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1304       break;
1305    case VARYING_SLOT_POS:
1306       current_annotation = "gl_Position";
1307       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1308          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1309       break;
1310    case VARYING_SLOT_EDGE: {
1311       /* This is present when doing unfilled polygons.  We're supposed to copy
1312        * the edge flag from the user-provided vertex array
1313        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1314        * of that attribute (starts as 1.0f).  This is then used in clipping to
1315        * determine which edges should be drawn as wireframe.
1316        */
1317       current_annotation = "edge flag";
1318       int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1319                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1320       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1321                                     glsl_type::float_type, WRITEMASK_XYZW))));
1322       break;
1323    }
1324    case BRW_VARYING_SLOT_PAD:
1325       /* No need to write to this slot */
1326       break;
1327    default:
1328       for (int i = 0; i < 4; i++) {
1329          emit_generic_urb_slot(reg, varying, i);
1330       }
1331       break;
1332    }
1333 }
1334
1335 static int
1336 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1337 {
1338    if (devinfo->gen >= 6) {
1339       /* URB data written (does not include the message header reg) must
1340        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1341        * section 5.4.3.2.2: URB_INTERLEAVED.
1342        *
1343        * URB entries are allocated on a multiple of 1024 bits, so an
1344        * extra 128 bits written here to make the end align to 256 is
1345        * no problem.
1346        */
1347       if ((mlen % 2) != 1)
1348          mlen++;
1349    }
1350
1351    return mlen;
1352 }
1353
1354
1355 /**
1356  * Generates the VUE payload plus the necessary URB write instructions to
1357  * output it.
1358  *
1359  * The VUE layout is documented in Volume 2a.
1360  */
1361 void
1362 vec4_visitor::emit_vertex()
1363 {
1364    /* MRF 0 is reserved for the debugger, so start with message header
1365     * in MRF 1.
1366     */
1367    int base_mrf = 1;
1368    int mrf = base_mrf;
1369    /* In the process of generating our URB write message contents, we
1370     * may need to unspill a register or load from an array.  Those
1371     * reads would use MRFs 14-15.
1372     */
1373    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1374
1375    /* The following assertion verifies that max_usable_mrf causes an
1376     * even-numbered amount of URB write data, which will meet gen6's
1377     * requirements for length alignment.
1378     */
1379    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1380
1381    /* First mrf is the g0-based message header containing URB handles and
1382     * such.
1383     */
1384    emit_urb_write_header(mrf++);
1385
1386    if (devinfo->gen < 6) {
1387       emit_ndc_computation();
1388    }
1389
1390    /* We may need to split this up into several URB writes, so do them in a
1391     * loop.
1392     */
1393    int slot = 0;
1394    bool complete = false;
1395    do {
1396       /* URB offset is in URB row increments, and each of our MRFs is half of
1397        * one of those, since we're doing interleaved writes.
1398        */
1399       int offset = slot / 2;
1400
1401       mrf = base_mrf + 1;
1402       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1403          emit_urb_slot(dst_reg(MRF, mrf++),
1404                        prog_data->vue_map.slot_to_varying[slot]);
1405
1406          /* If this was max_usable_mrf, we can't fit anything more into this
1407           * URB WRITE. Same thing if we reached the maximum length available.
1408           */
1409          if (mrf > max_usable_mrf ||
1410              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1411             slot++;
1412             break;
1413          }
1414       }
1415
1416       complete = slot >= prog_data->vue_map.num_slots;
1417       current_annotation = "URB write";
1418       vec4_instruction *inst = emit_urb_write_opcode(complete);
1419       inst->base_mrf = base_mrf;
1420       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1421       inst->offset += offset;
1422    } while(!complete);
1423 }
1424
1425
1426 src_reg
1427 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1428                                  src_reg *reladdr, int reg_offset)
1429 {
1430    /* Because we store the values to scratch interleaved like our
1431     * vertex data, we need to scale the vec4 index by 2.
1432     */
1433    int message_header_scale = 2;
1434
1435    /* Pre-gen6, the message header uses byte offsets instead of vec4
1436     * (16-byte) offset units.
1437     */
1438    if (devinfo->gen < 6)
1439       message_header_scale *= 16;
1440
1441    if (reladdr) {
1442       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1443        * to multiply the reladdr by 2. Notice that the reg_offset part
1444        * is in units of 16 bytes and is used to select the low/high 16-byte
1445        * chunk of a full dvec4, so we don't want to multiply that part.
1446        */
1447       src_reg index = src_reg(this, glsl_type::int_type);
1448       if (type_sz(inst->dst.type) < 8) {
1449          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1450                                       brw_imm_d(reg_offset)));
1451          emit_before(block, inst, MUL(dst_reg(index), index,
1452                                       brw_imm_d(message_header_scale)));
1453       } else {
1454          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1455                                       brw_imm_d(message_header_scale * 2)));
1456          emit_before(block, inst, ADD(dst_reg(index), index,
1457                                       brw_imm_d(reg_offset * message_header_scale)));
1458       }
1459       return index;
1460    } else {
1461       return brw_imm_d(reg_offset * message_header_scale);
1462    }
1463 }
1464
1465 /**
1466  * Emits an instruction before @inst to load the value named by @orig_src
1467  * from scratch space at @base_offset to @temp.
1468  *
1469  * @base_offset is measured in 32-byte units (the size of a register).
1470  */
1471 void
1472 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1473                                 dst_reg temp, src_reg orig_src,
1474                                 int base_offset)
1475 {
1476    assert(orig_src.offset % REG_SIZE == 0);
1477    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1478    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1479                                       reg_offset);
1480
1481    if (type_sz(orig_src.type) < 8) {
1482       emit_before(block, inst, SCRATCH_READ(temp, index));
1483    } else {
1484       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1485       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1486       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1487       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1488       vec4_instruction *last_read =
1489          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1490       emit_before(block, inst, last_read);
1491       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1492    }
1493 }
1494
1495 /**
1496  * Emits an instruction after @inst to store the value to be written
1497  * to @orig_dst to scratch space at @base_offset, from @temp.
1498  *
1499  * @base_offset is measured in 32-byte units (the size of a register).
1500  */
1501 void
1502 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1503                                  int base_offset)
1504 {
1505    assert(inst->dst.offset % REG_SIZE == 0);
1506    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1507    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1508                                       reg_offset);
1509
1510    /* Create a temporary register to store *inst's result in.
1511     *
1512     * We have to be careful in MOVing from our temporary result register in
1513     * the scratch write.  If we swizzle from channels of the temporary that
1514     * weren't initialized, it will confuse live interval analysis, which will
1515     * make spilling fail to make progress.
1516     */
1517    bool is_64bit = type_sz(inst->dst.type) == 8;
1518    const glsl_type *alloc_type =
1519       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1520    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1521                                        inst->dst.type),
1522                                 brw_swizzle_for_mask(inst->dst.writemask));
1523
1524    if (!is_64bit) {
1525       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1526                                           inst->dst.writemask));
1527       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1528       if (inst->opcode != BRW_OPCODE_SEL)
1529          write->predicate = inst->predicate;
1530       write->ir = inst->ir;
1531       write->annotation = inst->annotation;
1532       inst->insert_after(block, write);
1533    } else {
1534       dst_reg shuffled = dst_reg(this, alloc_type);
1535       vec4_instruction *last =
1536          shuffle_64bit_data(shuffled, temp, true, block, inst);
1537       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1538
1539       uint8_t mask = 0;
1540       if (inst->dst.writemask & WRITEMASK_X)
1541          mask |= WRITEMASK_XY;
1542       if (inst->dst.writemask & WRITEMASK_Y)
1543          mask |= WRITEMASK_ZW;
1544       if (mask) {
1545          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1546
1547          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1548          if (inst->opcode != BRW_OPCODE_SEL)
1549             write->predicate = inst->predicate;
1550          write->ir = inst->ir;
1551          write->annotation = inst->annotation;
1552          last->insert_after(block, write);
1553       }
1554
1555       mask = 0;
1556       if (inst->dst.writemask & WRITEMASK_Z)
1557          mask |= WRITEMASK_XY;
1558       if (inst->dst.writemask & WRITEMASK_W)
1559          mask |= WRITEMASK_ZW;
1560       if (mask) {
1561          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1562
1563          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1564                                             reg_offset + 1);
1565          vec4_instruction *write =
1566             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1567          if (inst->opcode != BRW_OPCODE_SEL)
1568             write->predicate = inst->predicate;
1569          write->ir = inst->ir;
1570          write->annotation = inst->annotation;
1571          last->insert_after(block, write);
1572       }
1573    }
1574
1575    inst->dst.file = temp.file;
1576    inst->dst.nr = temp.nr;
1577    inst->dst.offset %= REG_SIZE;
1578    inst->dst.reladdr = NULL;
1579 }
1580
1581 /**
1582  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1583  * adds the scratch read(s) before \p inst. The function also checks for
1584  * recursive reladdr scratch accesses, issuing the corresponding scratch
1585  * loads and rewriting reladdr references accordingly.
1586  *
1587  * \return \p src if it did not require a scratch load, otherwise, the
1588  * register holding the result of the scratch load that the caller should
1589  * use to rewrite src.
1590  */
1591 src_reg
1592 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1593                                    vec4_instruction *inst, src_reg src)
1594 {
1595    /* Resolve recursive reladdr scratch access by calling ourselves
1596     * with src.reladdr
1597     */
1598    if (src.reladdr)
1599       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1600                                           *src.reladdr);
1601
1602    /* Now handle scratch access on src */
1603    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1604       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1605          glsl_type::dvec4_type : glsl_type::vec4_type);
1606       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1607       src.nr = temp.nr;
1608       src.offset %= REG_SIZE;
1609       src.reladdr = NULL;
1610    }
1611
1612    return src;
1613 }
1614
1615 /**
1616  * We can't generally support array access in GRF space, because a
1617  * single instruction's destination can only span 2 contiguous
1618  * registers.  So, we send all GRF arrays that get variable index
1619  * access to scratch space.
1620  */
1621 void
1622 vec4_visitor::move_grf_array_access_to_scratch()
1623 {
1624    int scratch_loc[this->alloc.count];
1625    memset(scratch_loc, -1, sizeof(scratch_loc));
1626
1627    /* First, calculate the set of virtual GRFs that need to be punted
1628     * to scratch due to having any array access on them, and where in
1629     * scratch.
1630     */
1631    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1632       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1633          if (scratch_loc[inst->dst.nr] == -1) {
1634             scratch_loc[inst->dst.nr] = last_scratch;
1635             last_scratch += this->alloc.sizes[inst->dst.nr];
1636          }
1637
1638          for (src_reg *iter = inst->dst.reladdr;
1639               iter->reladdr;
1640               iter = iter->reladdr) {
1641             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1642                scratch_loc[iter->nr] = last_scratch;
1643                last_scratch += this->alloc.sizes[iter->nr];
1644             }
1645          }
1646       }
1647
1648       for (int i = 0 ; i < 3; i++) {
1649          for (src_reg *iter = &inst->src[i];
1650               iter->reladdr;
1651               iter = iter->reladdr) {
1652             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1653                scratch_loc[iter->nr] = last_scratch;
1654                last_scratch += this->alloc.sizes[iter->nr];
1655             }
1656          }
1657       }
1658    }
1659
1660    /* Now, for anything that will be accessed through scratch, rewrite
1661     * it to load/store.  Note that this is a _safe list walk, because
1662     * we may generate a new scratch_write instruction after the one
1663     * we're processing.
1664     */
1665    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1666       /* Set up the annotation tracking for new generated instructions. */
1667       base_ir = inst->ir;
1668       current_annotation = inst->annotation;
1669
1670       /* First handle scratch access on the dst. Notice we have to handle
1671        * the case where the dst's reladdr also points to scratch space.
1672        */
1673       if (inst->dst.reladdr)
1674          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1675                                                    *inst->dst.reladdr);
1676
1677       /* Now that we have handled any (possibly recursive) reladdr scratch
1678        * accesses for dst we can safely do the scratch write for dst itself
1679        */
1680       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1681          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1682
1683       /* Now handle scratch access on any src. In this case, since inst->src[i]
1684        * already is a src_reg, we can just call emit_resolve_reladdr with
1685        * inst->src[i] and it will take care of handling scratch loads for
1686        * both src and src.reladdr (recursively).
1687        */
1688       for (int i = 0 ; i < 3; i++) {
1689          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1690                                              inst->src[i]);
1691       }
1692    }
1693 }
1694
1695 /**
1696  * Emits an instruction before @inst to load the value named by @orig_src
1697  * from the pull constant buffer (surface) at @base_offset to @temp.
1698  */
1699 void
1700 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1701                                       dst_reg temp, src_reg orig_src,
1702                                       int base_offset, src_reg indirect)
1703 {
1704    assert(orig_src.offset % 16 == 0);
1705    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1706
1707    /* For 64bit loads we need to emit two 32-bit load messages and we also
1708     * we need to shuffle the 32-bit data result into proper 64-bit data. To do
1709     * that we emit the 32-bit loads into a temporary and we shuffle the result
1710     * into the original destination.
1711     */
1712    dst_reg orig_temp = temp;
1713    bool is_64bit = type_sz(orig_src.type) == 8;
1714    if (is_64bit) {
1715       assert(type_sz(temp.type) == 8);
1716       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1717       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1718    }
1719
1720    src_reg src = orig_src;
1721    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1722       int reg_offset = base_offset + src.offset / 16;
1723
1724       src_reg offset;
1725       if (indirect.file != BAD_FILE) {
1726          offset = src_reg(this, glsl_type::uint_type);
1727          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1728                                       brw_imm_ud(reg_offset * 16)));
1729       } else if (devinfo->gen >= 8) {
1730          /* Store the offset in a GRF so we can send-from-GRF. */
1731          offset = src_reg(this, glsl_type::uint_type);
1732          emit_before(block, inst, MOV(dst_reg(offset),
1733                                       brw_imm_ud(reg_offset * 16)));
1734       } else {
1735          offset = brw_imm_d(reg_offset * 16);
1736       }
1737
1738       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1739                                   brw_imm_ud(index),
1740                                   offset,
1741                                   block, inst);
1742
1743       src = byte_offset(src, 16);
1744    }
1745
1746    brw_mark_surface_used(&prog_data->base, index);
1747
1748    if (is_64bit) {
1749       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1750       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1751    }
1752 }
1753
1754 /**
1755  * Implements array access of uniforms by inserting a
1756  * PULL_CONSTANT_LOAD instruction.
1757  *
1758  * Unlike temporary GRF array access (where we don't support it due to
1759  * the difficulty of doing relative addressing on instruction
1760  * destinations), we could potentially do array access of uniforms
1761  * that were loaded in GRF space as push constants.  In real-world
1762  * usage we've seen, though, the arrays being used are always larger
1763  * than we could load as push constants, so just always move all
1764  * uniform array access out to a pull constant buffer.
1765  */
1766 void
1767 vec4_visitor::move_uniform_array_access_to_pull_constants()
1768 {
1769    /* The vulkan dirver doesn't support pull constants other than UBOs so
1770     * everything has to be pushed regardless.
1771     */
1772    if (!compiler->supports_pull_constants) {
1773       split_uniform_registers();
1774       return;
1775    }
1776
1777    /* Allocate the pull_params array */
1778    assert(stage_prog_data->nr_pull_params == 0);
1779    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1780                                               this->uniforms * 4);
1781
1782    int pull_constant_loc[this->uniforms];
1783    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1784
1785    /* First, walk through the instructions and determine which things need to
1786     * be pulled.  We mark something as needing to be pulled by setting
1787     * pull_constant_loc to 0.
1788     */
1789    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1790       /* We only care about MOV_INDIRECT of a uniform */
1791       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1792           inst->src[0].file != UNIFORM)
1793          continue;
1794
1795       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1796
1797       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1798          pull_constant_loc[uniform_nr + j] = 0;
1799    }
1800
1801    /* Next, we walk the list of uniforms and assign real pull constant
1802     * locations and set their corresponding entries in pull_param.
1803     */
1804    for (int j = 0; j < this->uniforms; j++) {
1805       if (pull_constant_loc[j] < 0)
1806          continue;
1807
1808       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1809
1810       for (int i = 0; i < 4; i++) {
1811          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1812             = stage_prog_data->param[j * 4 + i];
1813       }
1814    }
1815
1816    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1817     * instructions to actual uniform pulls.
1818     */
1819    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1820       /* We only care about MOV_INDIRECT of a uniform */
1821       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1822           inst->src[0].file != UNIFORM)
1823          continue;
1824
1825       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1826
1827       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1828
1829       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1830                               pull_constant_loc[uniform_nr], inst->src[1]);
1831       inst->remove(block);
1832    }
1833
1834    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1835     * no need to track them as larger-than-vec4 objects.  This will be
1836     * relied on in cutting out unused uniform vectors from push
1837     * constants.
1838     */
1839    split_uniform_registers();
1840 }
1841
1842 void
1843 vec4_visitor::resolve_ud_negate(src_reg *reg)
1844 {
1845    if (reg->type != BRW_REGISTER_TYPE_UD ||
1846        !reg->negate)
1847       return;
1848
1849    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1850    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1851    *reg = temp;
1852 }
1853
1854 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1855                            void *log_data,
1856                            const struct brw_sampler_prog_key_data *key_tex,
1857                            struct brw_vue_prog_data *prog_data,
1858                            const nir_shader *shader,
1859                            void *mem_ctx,
1860                            bool no_spills,
1861                            int shader_time_index)
1862    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1863      key_tex(key_tex),
1864      prog_data(prog_data),
1865      fail_msg(NULL),
1866      first_non_payload_grf(0),
1867      need_all_constants_in_pull_buffer(false),
1868      no_spills(no_spills),
1869      shader_time_index(shader_time_index),
1870      last_scratch(0)
1871 {
1872    this->failed = false;
1873
1874    this->base_ir = NULL;
1875    this->current_annotation = NULL;
1876    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1877
1878    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1879
1880    this->virtual_grf_start = NULL;
1881    this->virtual_grf_end = NULL;
1882    this->live_intervals = NULL;
1883
1884    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1885
1886    this->uniforms = 0;
1887 }
1888
1889
1890 void
1891 vec4_visitor::fail(const char *format, ...)
1892 {
1893    va_list va;
1894    char *msg;
1895
1896    if (failed)
1897       return;
1898
1899    failed = true;
1900
1901    va_start(va, format);
1902    msg = ralloc_vasprintf(mem_ctx, format, va);
1903    va_end(va);
1904    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1905
1906    this->fail_msg = msg;
1907
1908    if (debug_enabled) {
1909       fprintf(stderr, "%s",  msg);
1910    }
1911 }
1912
1913 } /* namespace brw */