src/intel/compiler/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_eu.h"
  27 #include "util/u_math.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->shadow_compare = false;
  50    this->eot = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->exec_size = 8;
  59    this->group = 0;
  60    this->size_written = (dst.file == BAD_FILE ?
  61                          0 : this->exec_size * type_sz(dst.type));
  62    this->annotation = NULL;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(vec4_instruction *inst)
  67 {
  68    inst->ir = this->base_ir;
  69    inst->annotation = this->current_annotation;
  70
  71    this->instructions.push_tail(inst);
  72
  73    return inst;
  74 }
  75
  76 vec4_instruction *
  77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  78                           vec4_instruction *new_inst)
  79 {
  80    new_inst->ir = inst->ir;
  81    new_inst->annotation = inst->annotation;
  82
  83    inst->insert_before(block, new_inst);
  84
  85    return inst;
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  90                    const src_reg &src1, const src_reg &src2)
  91 {
  92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  93 }
  94
  95
  96 vec4_instruction *
  97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  98                    const src_reg &src1)
  99 {
 100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
 101 }
 102
 103 vec4_instruction *
 104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 105 {
 106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 107 }
 108
 109 vec4_instruction *
 110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 111 {
 112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 113 }
 114
 115 vec4_instruction *
 116 vec4_visitor::emit(enum opcode opcode)
 117 {
 118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 119 }
 120
/* The ALUn macros below stamp out vec4_visitor builder methods named after
 * a hardware opcode.  Each generated method only allocates the instruction
 * (opcode BRW_OPCODE_<op>) out of mem_ctx and returns it; it does not add
 * it to the instruction list, so callers pass the result to emit().
 */

/* Builder for a one-source ALU instruction. */
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

/* Builder for a two-source ALU instruction. */
#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

/* Builder for a two-source ALU instruction that is additionally flagged as
 * writing the accumulator (writes_accumulator = true).
 */
#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

/* Builder for a three-source ALU instruction; asserts Gen6 or later. */
#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                                \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

/* Instantiate one builder method per opcode. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

 191
 192 /** Gen4 predicated IF. */
 193 vec4_instruction *
 194 vec4_visitor::IF(enum brw_predicate predicate)
 195 {
 196    vec4_instruction *inst;
 197
 198    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 199    inst->predicate = predicate;
 200
 201    return inst;
 202 }
 203
 204 /** Gen6 IF with embedded comparison. */
 205 vec4_instruction *
 206 vec4_visitor::IF(src_reg src0, src_reg src1,
 207                  enum brw_conditional_mod condition)
 208 {
 209    assert(devinfo->gen == 6);
 210
 211    vec4_instruction *inst;
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 217                                         src0, src1);
 218    inst->conditional_mod = condition;
 219
 220    return inst;
 221 }
 222
 223 /**
 224  * CMP: Sets the low bit of the destination channels with the result
 225  * of the comparison, while the upper bits are undefined, and updates
 226  * the flag register with the packed 16 bits of the result.
 227  */
 228 vec4_instruction *
 229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 230                   enum brw_conditional_mod condition)
 231 {
 232    vec4_instruction *inst;
 233
 234    /* Take the instruction:
 235     *
 236     * CMP null<d> src0<f> src1<f>
 237     *
 238     * Original gen4 does type conversion to the destination type before
 239     * comparison, producing garbage results for floating point comparisons.
 240     *
 241     * The destination type doesn't matter on newer generations, so we set the
 242     * type to match src0 so we can compact the instruction.
 243     */
 244    dst.type = src0.type;
 245
 246    resolve_ud_negate(&src0);
 247    resolve_ud_negate(&src1);
 248
 249    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 250    inst->conditional_mod = condition;
 251
 252    return inst;
 253 }
 254
 255 vec4_instruction *
 256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 257 {
 258    vec4_instruction *inst;
 259
 260    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 261                                         dst, index);
 262    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 263    inst->mlen = 2;
 264
 265    return inst;
 266 }
 267
 268 vec4_instruction *
 269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 270                             const src_reg &index)
 271 {
 272    vec4_instruction *inst;
 273
 274    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 275                                         dst, src, index);
 276    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 277    inst->mlen = 3;
 278
 279    return inst;
 280 }
 281
 282 src_reg
 283 vec4_visitor::fix_3src_operand(const src_reg &src)
 284 {
 285    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 286     * able to use vertical stride of zero to replicate the vec4 uniform, like
 287     *
 288     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 289     *
 290     * But you can't, since vertical stride is always four in three-source
 291     * instructions. Instead, insert a MOV instruction to do the replication so
 292     * that the three-source instruction can consume it.
 293     */
 294
 295    /* The MOV is only needed if the source is a uniform or immediate. */
 296    if (src.file != UNIFORM && src.file != IMM)
 297       return src;
 298
 299    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 300       return src;
 301
 302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 303    expanded.type = src.type;
 304    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 305    return src_reg(expanded);
 306 }
 307
 308 src_reg
 309 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 310 {
 311    if (!src.abs && !src.negate)
 312       return src;
 313
 314    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 315    resolved.type = src.type;
 316    emit(MOV(resolved, src));
 317
 318    return src_reg(resolved);
 319 }
 320
 321 src_reg
 322 vec4_visitor::fix_math_operand(const src_reg &src)
 323 {
 324    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 325       return src;
 326
 327    /* The gen6 math instruction ignores the source modifiers --
 328     * swizzle, abs, negate, and at least some parts of the register
 329     * region description.
 330     *
 331     * Rather than trying to enumerate all these cases, *always* expand the
 332     * operand to a temp GRF for gen6.
 333     *
 334     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 335     * can't use.
 336     */
 337
 338    if (devinfo->gen == 7 && src.file != IMM)
 339       return src;
 340
 341    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 342    expanded.type = src.type;
 343    emit(MOV(expanded, src));
 344    return src_reg(expanded);
 345 }
 346
 347 vec4_instruction *
 348 vec4_visitor::emit_math(enum opcode opcode,
 349                         const dst_reg &dst,
 350                         const src_reg &src0, const src_reg &src1)
 351 {
 352    vec4_instruction *math =
 353       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 354
 355    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 356       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 357       math->dst = dst_reg(this, glsl_type::vec4_type);
 358       math->dst.type = dst.type;
 359       math = emit(MOV(dst, src_reg(math->dst)));
 360    } else if (devinfo->gen < 6) {
 361       math->base_mrf = 1;
 362       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 363    }
 364
 365    return math;
 366 }
 367
 368 void
 369 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 370 {
 371    if (devinfo->gen < 7) {
 372       unreachable("ir_unop_pack_half_2x16 should be lowered");
 373    }
 374
 375    assert(dst.type == BRW_REGISTER_TYPE_UD);
 376    assert(src0.type == BRW_REGISTER_TYPE_F);
 377
 378    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 379     *
 380     *   Because this instruction does not have a 16-bit floating-point type,
 381     *   the destination data type must be Word (W).
 382     *
 383     *   The destination must be DWord-aligned and specify a horizontal stride
 384     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 385     *   each destination channel and the upper word is not modified.
 386     *
 387     * The above restriction implies that the f32to16 instruction must use
 388     * align1 mode, because only in align1 mode is it possible to specify
 389     * horizontal stride.  We choose here to defy the hardware docs and emit
 390     * align16 instructions.
 391     *
 392     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 393     * instructions. I was partially successful in that the code passed all
 394     * tests.  However, the code was dubiously correct and fragile, and the
 395     * tests were not harsh enough to probe that frailty. Not trusting the
 396     * code, I chose instead to remain in align16 mode in defiance of the hw
 397     * docs).
 398     *
 399     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 400     * simulator, emitting a f32to16 in align16 mode with UD as destination
 401     * data type is safe. The behavior differs from that specified in the PRM
 402     * in that the upper word of each destination channel is cleared to 0.
 403     */
 404
 405    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 406    src_reg tmp_src(tmp_dst);
 407
 408 #if 0
 409    /* Verify the undocumented behavior on which the following instructions
 410     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 411     * then the result of the bit-or instruction below will be incorrect.
 412     *
 413     * You should inspect the disasm output in order to verify that the MOV is
 414     * not optimized away.
 415     */
 416    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 417 #endif
 418
 419    /* Give tmp the form below, where "." means untouched.
 420     *
 421     *     w z          y          x w z          y          x
 422     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 423     *
 424     * That the upper word of each write-channel be 0 is required for the
 425     * following bit-shift and bit-or instructions to work. Note that this
 426     * relies on the undocumented hardware behavior mentioned above.
 427     */
 428    tmp_dst.writemask = WRITEMASK_XY;
 429    emit(F32TO16(tmp_dst, src0));
 430
 431    /* Give the write-channels of dst the form:
 432     *   0xhhhh0000
 433     */
 434    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 435    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 436
 437    /* Finally, give the write-channels of dst the form of packHalf2x16's
 438     * output:
 439     *   0xhhhhllll
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 442    emit(OR(dst, src_reg(dst), tmp_src));
 443 }
 444
 445 void
 446 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 447 {
 448    if (devinfo->gen < 7) {
 449       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 450    }
 451
 452    assert(dst.type == BRW_REGISTER_TYPE_F);
 453    assert(src0.type == BRW_REGISTER_TYPE_UD);
 454
 455    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 456     *
 457     *   Because this instruction does not have a 16-bit floating-point type,
 458     *   the source data type must be Word (W). The destination type must be
 459     *   F (Float).
 460     *
 461     * To use W as the source data type, we must adjust horizontal strides,
 462     * which is only possible in align1 mode. All my [chadv] attempts at
 463     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 464     * Piglit tests, so I gave up.
 465     *
 466     * I've verified that, on gen7 hardware and the simulator, it is safe to
 467     * emit f16to32 in align16 mode with UD as source data type.
 468     */
 469
 470    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 471    src_reg tmp_src(tmp_dst);
 472
 473    tmp_dst.writemask = WRITEMASK_X;
 474    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 475
 476    tmp_dst.writemask = WRITEMASK_Y;
 477    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 478
 479    dst.writemask = WRITEMASK_XY;
 480    emit(F16TO32(dst, tmp_src));
 481 }
 482
 483 void
 484 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 485 {
 486    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 487     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 488     * is not suitable to generate the shift values, but we can use the packed
 489     * vector float and a type-converting MOV.
 490     */
 491    dst_reg shift(this, glsl_type::uvec4_type);
 492    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 493
 494    dst_reg shifted(this, glsl_type::uvec4_type);
 495    src0.swizzle = BRW_SWIZZLE_XXXX;
 496    emit(SHR(shifted, src0, src_reg(shift)));
 497
 498    shifted.type = BRW_REGISTER_TYPE_UB;
 499    dst_reg f(this, glsl_type::vec4_type);
 500    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 501
 502    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 503 }
 504
 505 void
 506 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 507 {
 508    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 509     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 510     * is not suitable to generate the shift values, but we can use the packed
 511     * vector float and a type-converting MOV.
 512     */
 513    dst_reg shift(this, glsl_type::uvec4_type);
 514    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 515
 516    dst_reg shifted(this, glsl_type::uvec4_type);
 517    src0.swizzle = BRW_SWIZZLE_XXXX;
 518    emit(SHR(shifted, src0, src_reg(shift)));
 519
 520    shifted.type = BRW_REGISTER_TYPE_B;
 521    dst_reg f(this, glsl_type::vec4_type);
 522    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 523
 524    dst_reg scaled(this, glsl_type::vec4_type);
 525    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 526
 527    dst_reg max(this, glsl_type::vec4_type);
 528    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 529    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 530 }
 531
 532 void
 533 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 534 {
 535    dst_reg saturated(this, glsl_type::vec4_type);
 536    vec4_instruction *inst = emit(MOV(saturated, src0));
 537    inst->saturate = true;
 538
 539    dst_reg scaled(this, glsl_type::vec4_type);
 540    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 541
 542    dst_reg rounded(this, glsl_type::vec4_type);
 543    emit(RNDE(rounded, src_reg(scaled)));
 544
 545    dst_reg u(this, glsl_type::uvec4_type);
 546    emit(MOV(u, src_reg(rounded)));
 547
 548    src_reg bytes(u);
 549    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 550 }
 551
 552 void
 553 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 554 {
 555    dst_reg max(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 557
 558    dst_reg min(this, glsl_type::vec4_type);
 559    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 560
 561    dst_reg scaled(this, glsl_type::vec4_type);
 562    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 563
 564    dst_reg rounded(this, glsl_type::vec4_type);
 565    emit(RNDE(rounded, src_reg(scaled)));
 566
 567    dst_reg i(this, glsl_type::ivec4_type);
 568    emit(MOV(i, src_reg(rounded)));
 569
 570    src_reg bytes(i);
 571    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 572 }
 573
 574 /*
 575  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 576  * false) elements needed to pack a type.
 577  */
 578 static int
 579 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
 580 {
 581    unsigned int i;
 582    int size;
 583
 584    switch (type->base_type) {
 585    case GLSL_TYPE_UINT:
 586    case GLSL_TYPE_INT:
 587    case GLSL_TYPE_FLOAT:
 588    case GLSL_TYPE_FLOAT16:
 589    case GLSL_TYPE_BOOL:
 590    case GLSL_TYPE_DOUBLE:
 591    case GLSL_TYPE_UINT16:
 592    case GLSL_TYPE_INT16:
 593    case GLSL_TYPE_UINT8:
 594    case GLSL_TYPE_INT8:
 595    case GLSL_TYPE_UINT64:
 596    case GLSL_TYPE_INT64:
 597       if (type->is_matrix()) {
 598          const glsl_type *col_type = type->column_type();
 599          unsigned col_slots =
 600             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
 601          return type->matrix_columns * col_slots;
 602       } else {
 603          /* Regardless of size of vector, it gets a vec4. This is bad
 604           * packing for things like floats, but otherwise arrays become a
 605           * mess.  Hopefully a later pass over the code can pack scalars
 606           * down if appropriate.
 607           */
 608          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
 609       }
 610    case GLSL_TYPE_ARRAY:
 611       assert(type->length > 0);
 612       return type_size_xvec4(type->fields.array, as_vec4, bindless) *
 613              type->length;
 614    case GLSL_TYPE_STRUCT:
 615    case GLSL_TYPE_INTERFACE:
 616       size = 0;
 617       for (i = 0; i < type->length; i++) {
 618          size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
 619                                  bindless);
 620       }
 621       return size;
 622    case GLSL_TYPE_SUBROUTINE:
 623       return 1;
 624
 625    case GLSL_TYPE_SAMPLER:
 626       /* Samplers take up no register space, since they're baked in at
 627        * link time.
 628        */
 629       return bindless ? 1 : 0;
 630    case GLSL_TYPE_ATOMIC_UINT:
 631       return 0;
 632    case GLSL_TYPE_IMAGE:
 633       return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 634    case GLSL_TYPE_VOID:
 635    case GLSL_TYPE_ERROR:
 636    case GLSL_TYPE_FUNCTION:
 637       unreachable("not reached");
 638    }
 639
 640    return 0;
 641 }
 642
 643 /**
 644  * Returns the minimum number of vec4 elements needed to pack a type.
 645  *
 646  * For simple types, it will return 1 (a single vec4); for matrices, the
 647  * number of columns; for array and struct, the sum of the vec4_size of
 648  * each of its elements; and for sampler and atomic, zero.
 649  *
 650  * This method is useful to calculate how much register space is needed to
 651  * store a particular type.
 652  */
 653 extern "C" int
 654 type_size_vec4(const struct glsl_type *type, bool bindless)
 655 {
 656    return type_size_xvec4(type, true, bindless);
 657 }
 658
 659 /**
 660  * Returns the minimum number of dvec4 elements needed to pack a type.
 661  *
 662  * For simple types, it will return 1 (a single dvec4); for matrices, the
 663  * number of columns; for array and struct, the sum of the dvec4_size of
 664  * each of its elements; and for sampler and atomic, zero.
 665  *
 666  * This method is useful to calculate how much register space is needed to
 667  * store a particular type.
 668  *
 669  * Measuring double-precision vertex inputs as dvec4 is required because
 670  * ARB_vertex_attrib_64bit states that these uses the same number of locations
 671  * than the single-precision version. That is, two consecutives dvec4 would be
 672  * located in location "x" and location "x+1", not "x+2".
 673  *
 674  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 675  * remap_vs_attrs() will take in account both the location and also if the
 676  * type fits in one or two vec4 slots.
 677  */
 678 extern "C" int
 679 type_size_dvec4(const struct glsl_type *type, bool bindless)
 680 {
 681    return type_size_xvec4(type, false, bindless);
 682 }
 683
 684 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 685 {
 686    init();
 687
 688    this->file = VGRF;
 689    this->nr = v->alloc.allocate(type_size_vec4(type, false));
 690
 691    if (type->is_array() || type->is_struct()) {
 692       this->swizzle = BRW_SWIZZLE_NOOP;
 693    } else {
 694       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 695    }
 696
 697    this->type = brw_type_for_base_type(type);
 698 }
 699
 700 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 701 {
 702    assert(size > 0);
 703
 704    init();
 705
 706    this->file = VGRF;
 707    this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
 708
 709    this->swizzle = BRW_SWIZZLE_NOOP;
 710
 711    this->type = brw_type_for_base_type(type);
 712 }
 713
 714 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 715 {
 716    init();
 717
 718    this->file = VGRF;
 719    this->nr = v->alloc.allocate(type_size_vec4(type, false));
 720
 721    if (type->is_array() || type->is_struct()) {
 722       this->writemask = WRITEMASK_XYZW;
 723    } else {
 724       this->writemask = (1 << type->vector_elements) - 1;
 725    }
 726
 727    this->type = brw_type_for_base_type(type);
 728 }
 729
 730 vec4_instruction *
 731 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 732                           src_reg src0, src_reg src1)
 733 {
 734    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 735    inst->conditional_mod = conditionalmod;
 736    return inst;
 737 }
 738
 739 vec4_instruction *
 740 vec4_visitor::emit_lrp(const dst_reg &dst,
 741                        const src_reg &x, const src_reg &y, const src_reg &a)
 742 {
 743    if (devinfo->gen >= 6 && devinfo->gen <= 10) {
 744       /* Note that the instruction's argument order is reversed from GLSL
 745        * and the IR.
 746        */
 747      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 748                      fix_3src_operand(x)));
 749    } else {
 750       /* Earlier generations don't support three source operations, so we
 751        * need to emit x*(1-a) + y*a.
 752        */
 753       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 754       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 755       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 756       y_times_a.writemask           = dst.writemask;
 757       one_minus_a.writemask         = dst.writemask;
 758       x_times_one_minus_a.writemask = dst.writemask;
 759
 760       emit(MUL(y_times_a, y, a));
 761       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
 762       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 763       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 764    }
 765 }
 766
 767 /**
 768  * Emits the instructions needed to perform a pull constant load. before_block
 769  * and before_inst can be NULL in which case the instruction will be appended
 770  * to the end of the instruction list.
 771  */
 772 void
 773 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 774                                           src_reg surf_index,
 775                                           src_reg offset_reg,
 776                                           bblock_t *before_block,
 777                                           vec4_instruction *before_inst)
 778 {
 779    assert((before_inst == NULL && before_block == NULL) ||
 780           (before_inst && before_block));
 781
 782    vec4_instruction *pull;
 783
 784    if (devinfo->gen >= 9) {
 785       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 786       src_reg header(this, glsl_type::uvec4_type, 2);
 787
 788       pull = new(mem_ctx)
 789          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 790                           dst_reg(header));
 791
 792       if (before_inst)
 793          emit_before(before_block, before_inst, pull);
 794       else
 795          emit(pull);
 796
 797       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
 798                                  offset_reg.type);
 799       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 800
 801       if (before_inst)
 802          emit_before(before_block, before_inst, pull);
 803       else
 804          emit(pull);
 805
 806       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 807                                            dst,
 808                                            surf_index,
 809                                            header);
 810       pull->mlen = 2;
 811       pull->header_size = 1;
 812    } else if (devinfo->gen >= 7) {
 813       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
 814
 815       grf_offset.type = offset_reg.type;
 816
 817       pull = MOV(grf_offset, offset_reg);
 818
 819       if (before_inst)
 820          emit_before(before_block, before_inst, pull);
 821       else
 822          emit(pull);
 823
 824       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 825                                            dst,
 826                                            surf_index,
 827                                            src_reg(grf_offset));
 828       pull->mlen = 1;
 829    } else {
 830       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 831                                            dst,
 832                                            surf_index,
 833                                            offset_reg);
 834       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 835       pull->mlen = 1;
 836    }
 837
 838    if (before_inst)
 839       emit_before(before_block, before_inst, pull);
 840    else
 841       emit(pull);
 842 }
 843
 844 src_reg
 845 vec4_visitor::emit_uniformize(const src_reg &src)
 846 {
 847    const src_reg chan_index(this, glsl_type::uint_type);
 848    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 849                               src.type);
 850
 851    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 852       ->force_writemask_all = true;
 853    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 854       ->force_writemask_all = true;
 855
 856    return src_reg(dst);
 857 }
 858
 859 src_reg
 860 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 861                              src_reg coordinate, src_reg surface)
 862 {
 863    vec4_instruction *inst =
 864       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 865                                     dst_reg(this, glsl_type::uvec4_type));
 866    inst->base_mrf = 2;
 867    inst->src[1] = surface;
 868    inst->src[2] = brw_imm_ud(0); /* sampler */
 869
 870    int param_base;
 871
 872    if (devinfo->gen >= 9) {
 873       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 874       vec4_instruction *header_inst = new(mem_ctx)
 875          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 876                           dst_reg(MRF, inst->base_mrf));
 877
 878       emit(header_inst);
 879
 880       inst->mlen = 2;
 881       inst->header_size = 1;
 882       param_base = inst->base_mrf + 1;
 883    } else {
 884       inst->mlen = 1;
 885       param_base = inst->base_mrf;
 886    }
 887
 888    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 889    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 890    int zero_mask = 0xf & ~coord_mask;
 891
 892    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 893             coordinate));
 894
 895    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 896             brw_imm_d(0)));
 897
 898    emit(inst);
 899    return src_reg(inst->dst);
 900 }
 901
 902 bool
 903 vec4_visitor::is_high_sampler(src_reg sampler)
 904 {
 905    if (devinfo->gen < 8 && !devinfo->is_haswell)
 906       return false;
 907
 908    return sampler.file != IMM || sampler.ud >= 16;
 909 }
 910
 911 void
 912 vec4_visitor::emit_texture(ir_texture_opcode op,
 913                            dst_reg dest,
 914                            const glsl_type *dest_type,
 915                            src_reg coordinate,
 916                            int coord_components,
 917                            src_reg shadow_comparator,
 918                            src_reg lod, src_reg lod2,
 919                            src_reg sample_index,
 920                            uint32_t constant_offset,
 921                            src_reg offset_value,
 922                            src_reg mcs,
 923                            uint32_t surface,
 924                            src_reg surface_reg,
 925                            src_reg sampler_reg)
 926 {
 927    enum opcode opcode;
 928    switch (op) {
 929    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 930    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 931    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 932    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 933    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
 934                              SHADER_OPCODE_TXF_CMS); break;
 935    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 936    case ir_tg4: opcode = offset_value.file != BAD_FILE
 937                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 938    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 939    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 940    case ir_txb:
 941       unreachable("TXB is not valid for vertex shaders.");
 942    case ir_lod:
 943       unreachable("LOD is not valid for vertex shaders.");
 944    case ir_samples_identical: {
 945       /* There are some challenges implementing this for vec4, and it seems
 946        * unlikely to be used anyway.  For now, just return false ways.
 947        */
 948       emit(MOV(dest, brw_imm_ud(0u)));
 949       return;
 950    }
 951    default:
 952       unreachable("Unrecognized tex op");
 953    }
 954
 955    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
 956
 957    inst->offset = constant_offset;
 958
 959    /* The message header is necessary for:
 960     * - Gen4 (always)
 961     * - Gen9+ for selecting SIMD4x2
 962     * - Texel offsets
 963     * - Gather channel selection
 964     * - Sampler indices too large to fit in a 4-bit value.
 965     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 966     */
 967    inst->header_size =
 968       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 969        inst->offset != 0 || op == ir_tg4 ||
 970        op == ir_texture_samples ||
 971        is_high_sampler(sampler_reg)) ? 1 : 0;
 972    inst->base_mrf = 2;
 973    inst->mlen = inst->header_size;
 974    inst->dst.writemask = WRITEMASK_XYZW;
 975    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
 976
 977    inst->src[1] = surface_reg;
 978    inst->src[2] = sampler_reg;
 979
 980    /* MRF for the first parameter */
 981    int param_base = inst->base_mrf + inst->header_size;
 982
 983    if (op == ir_txs || op == ir_query_levels) {
 984       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 985       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 986       inst->mlen++;
 987    } else if (op == ir_texture_samples) {
 988       inst->dst.writemask = WRITEMASK_X;
 989    } else {
 990       /* Load the coordinate */
 991       /* FINISHME: gl_clamp_mask and saturate */
 992       int coord_mask = (1 << coord_components) - 1;
 993       int zero_mask = 0xf & ~coord_mask;
 994
 995       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 996                coordinate));
 997       inst->mlen++;
 998
 999       if (zero_mask != 0) {
1000          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001                   brw_imm_d(0)));
1002       }
1003       /* Load the shadow comparator */
1004       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1006                           WRITEMASK_X),
1007                   shadow_comparator));
1008          inst->mlen++;
1009       }
1010
1011       /* Load the LOD info */
1012       if (op == ir_tex || op == ir_txl) {
1013          int mrf, writemask;
1014          if (devinfo->gen >= 5) {
1015             mrf = param_base + 1;
1016             if (shadow_comparator.file != BAD_FILE) {
1017                writemask = WRITEMASK_Y;
1018                /* mlen already incremented */
1019             } else {
1020                writemask = WRITEMASK_X;
1021                inst->mlen++;
1022             }
1023          } else /* devinfo->gen == 4 */ {
1024             mrf = param_base;
1025             writemask = WRITEMASK_W;
1026          }
1027          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028       } else if (op == ir_txf) {
1029          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030       } else if (op == ir_txf_ms) {
1031          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032                   sample_index));
1033          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034             /* MCS data is stored in the first two channels of ‘mcs’, but we
1035              * need to get it into the .y and .z channels of the second vec4
1036              * of params.
1037              */
1038             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039             emit(MOV(dst_reg(MRF, param_base + 1,
1040                              glsl_type::uint_type, WRITEMASK_YZ),
1041                      mcs));
1042          } else if (devinfo->gen >= 7) {
1043             /* MCS data is in the first channel of `mcs`, but we need to get it into
1044              * the .y channel of the second vec4 of params, so replicate .x across
1045              * the whole vec4 and then mask off everything except .y
1046              */
1047             mcs.swizzle = BRW_SWIZZLE_XXXX;
1048             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049                      mcs));
1050          }
1051          inst->mlen++;
1052       } else if (op == ir_txd) {
1053          const brw_reg_type type = lod.type;
1054
1055          if (devinfo->gen >= 5) {
1056             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060             inst->mlen++;
1061
1062             if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1063                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067                inst->mlen++;
1068
1069                if (shadow_comparator.file != BAD_FILE) {
1070                   emit(MOV(dst_reg(MRF, param_base + 2,
1071                                    shadow_comparator.type, WRITEMASK_Z),
1072                            shadow_comparator));
1073                }
1074             }
1075          } else /* devinfo->gen == 4 */ {
1076             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078             inst->mlen += 2;
1079          }
1080       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081          if (shadow_comparator.file != BAD_FILE) {
1082             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1083                      shadow_comparator));
1084          }
1085
1086          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087                   offset_value));
1088          inst->mlen++;
1089       }
1090    }
1091
1092    emit(inst);
1093
1094    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095     * spec requires layers.
1096     */
1097    if (op == ir_txs && devinfo->gen < 7) {
1098       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1099       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1100                   src_reg(inst->dst), brw_imm_d(1));
1101    }
1102
1103    if (devinfo->gen == 6 && op == ir_tg4) {
1104       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1105    }
1106
1107    if (op == ir_query_levels) {
1108       /* # levels is in .w */
1109       src_reg swizzled(dest);
1110       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1111                                       SWIZZLE_W, SWIZZLE_W);
1112       emit(MOV(dest, swizzled));
1113    }
1114 }
1115
1116 /**
1117  * Apply workarounds for Gen6 gather with UINT/SINT
1118  */
1119 void
1120 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1121 {
1122    if (!wa)
1123       return;
1124
1125    int width = (wa & WA_8BIT) ? 8 : 16;
1126    dst_reg dst_f = dst;
1127    dst_f.type = BRW_REGISTER_TYPE_F;
1128
1129    /* Convert from UNORM to UINT */
1130    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1131    emit(MOV(dst, src_reg(dst_f)));
1132
1133    if (wa & WA_SIGN) {
1134       /* Reinterpret the UINT value as a signed INT value by
1135        * shifting the sign bit into place, then shifting back
1136        * preserving sign.
1137        */
1138       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1139       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1140    }
1141 }
1142
1143 void
1144 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1145 {
1146    unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::gs_end_primitive()
1151 {
1152    unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::emit_ndc_computation()
1157 {
1158    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1159       return;
1160
1161    /* Get the position */
1162    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1163
1164    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1165    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1166    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1167    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1168
1169    current_annotation = "NDC";
1170    dst_reg ndc_w = ndc;
1171    ndc_w.writemask = WRITEMASK_W;
1172    src_reg pos_w = pos;
1173    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1174    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1175
1176    dst_reg ndc_xyz = ndc;
1177    ndc_xyz.writemask = WRITEMASK_XYZ;
1178
1179    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1180 }
1181
1182 void
1183 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1184 {
1185    if (devinfo->gen < 6 &&
1186        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1187         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1188         devinfo->has_negative_rhw_bug)) {
1189       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1190       dst_reg header1_w = header1;
1191       header1_w.writemask = WRITEMASK_W;
1192
1193       emit(MOV(header1, brw_imm_ud(0u)));
1194
1195       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1196          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1197
1198          current_annotation = "Point size";
1199          emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1200          emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1201       }
1202
1203       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1204          current_annotation = "Clipping flags";
1205          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1206
1207          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1209          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1210       }
1211
1212       if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1213          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1214          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1215          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1216          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1217          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1218       }
1219
1220       /* i965 clipping workaround:
1221        * 1) Test for -ve rhw
1222        * 2) If set,
1223        *      set ndc = (0,0,0,0)
1224        *      set ucp[6] = 1
1225        *
1226        * Later, clipping will detect ucp[6] and ensure the primitive is
1227        * clipped against all fixed planes.
1228        */
1229       if (devinfo->has_negative_rhw_bug &&
1230           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1231          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1232          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1233          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234          vec4_instruction *inst;
1235          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1236          inst->predicate = BRW_PREDICATE_NORMAL;
1237          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1238          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1239          inst->predicate = BRW_PREDICATE_NORMAL;
1240       }
1241
1242       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1243    } else if (devinfo->gen < 6) {
1244       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1245    } else {
1246       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1247       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1248          dst_reg reg_w = reg;
1249          reg_w.writemask = WRITEMASK_W;
1250          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1251          reg_as_src.type = reg_w.type;
1252          reg_as_src.swizzle = brw_swizzle_for_size(1);
1253          emit(MOV(reg_w, reg_as_src));
1254       }
1255       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1256          dst_reg reg_y = reg;
1257          reg_y.writemask = WRITEMASK_Y;
1258          reg_y.type = BRW_REGISTER_TYPE_D;
1259          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1260          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1261       }
1262       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1263          dst_reg reg_z = reg;
1264          reg_z.writemask = WRITEMASK_Z;
1265          reg_z.type = BRW_REGISTER_TYPE_D;
1266          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1267          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1268       }
1269    }
1270 }
1271
1272 vec4_instruction *
1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1274 {
1275    assert(varying < VARYING_SLOT_MAX);
1276
1277    unsigned num_comps = output_num_components[varying][component];
1278    if (num_comps == 0)
1279       return NULL;
1280
1281    assert(output_reg[varying][component].type == reg.type);
1282    current_annotation = output_reg_annotation[varying];
1283    if (output_reg[varying][component].file != BAD_FILE) {
1284       src_reg src = src_reg(output_reg[varying][component]);
1285       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1286       reg.writemask =
1287          brw_writemask_for_component_packing(num_comps, component);
1288       return emit(MOV(reg, src));
1289    }
1290    return NULL;
1291 }
1292
1293 void
1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1295 {
1296    reg.type = BRW_REGISTER_TYPE_F;
1297    output_reg[varying][0].type = reg.type;
1298
1299    switch (varying) {
1300    case VARYING_SLOT_PSIZ:
1301    {
1302       /* PSIZ is always in slot 0, and is coupled with other flags. */
1303       current_annotation = "indices, point width, clip flags";
1304       emit_psiz_and_flags(reg);
1305       break;
1306    }
1307    case BRW_VARYING_SLOT_NDC:
1308       current_annotation = "NDC";
1309       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1310          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1311       break;
1312    case VARYING_SLOT_POS:
1313       current_annotation = "gl_Position";
1314       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1315          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1316       break;
1317    case VARYING_SLOT_EDGE: {
1318       /* This is present when doing unfilled polygons.  We're supposed to copy
1319        * the edge flag from the user-provided vertex array
1320        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1321        * of that attribute (starts as 1.0f).  This is then used in clipping to
1322        * determine which edges should be drawn as wireframe.
1323        */
1324       current_annotation = "edge flag";
1325       int edge_attr = util_bitcount64(nir->info.inputs_read &
1326                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1327       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1328                                     glsl_type::float_type, WRITEMASK_XYZW))));
1329       break;
1330    }
1331    case BRW_VARYING_SLOT_PAD:
1332       /* No need to write to this slot */
1333       break;
1334    default:
1335       for (int i = 0; i < 4; i++) {
1336          emit_generic_urb_slot(reg, varying, i);
1337       }
1338       break;
1339    }
1340 }
1341
1342 static unsigned
1343 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1344 {
1345    if (devinfo->gen >= 6) {
1346       /* URB data written (does not include the message header reg) must
1347        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1348        * section 5.4.3.2.2: URB_INTERLEAVED.
1349        *
1350        * URB entries are allocated on a multiple of 1024 bits, so an
1351        * extra 128 bits written here to make the end align to 256 is
1352        * no problem.
1353        */
1354       if ((mlen % 2) != 1)
1355          mlen++;
1356    }
1357
1358    return mlen;
1359 }
1360
1361
1362 /**
1363  * Generates the VUE payload plus the necessary URB write instructions to
1364  * output it.
1365  *
1366  * The VUE layout is documented in Volume 2a.
1367  */
1368 void
1369 vec4_visitor::emit_vertex()
1370 {
1371    /* MRF 0 is reserved for the debugger, so start with message header
1372     * in MRF 1.
1373     */
1374    int base_mrf = 1;
1375    int mrf = base_mrf;
1376    /* In the process of generating our URB write message contents, we
1377     * may need to unspill a register or load from an array.  Those
1378     * reads would use MRFs 14-15.
1379     */
1380    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1381
1382    /* The following assertion verifies that max_usable_mrf causes an
1383     * even-numbered amount of URB write data, which will meet gen6's
1384     * requirements for length alignment.
1385     */
1386    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1387
1388    /* First mrf is the g0-based message header containing URB handles and
1389     * such.
1390     */
1391    emit_urb_write_header(mrf++);
1392
1393    if (devinfo->gen < 6) {
1394       emit_ndc_computation();
1395    }
1396
1397    /* We may need to split this up into several URB writes, so do them in a
1398     * loop.
1399     */
1400    int slot = 0;
1401    bool complete = false;
1402    do {
1403       /* URB offset is in URB row increments, and each of our MRFs is half of
1404        * one of those, since we're doing interleaved writes.
1405        */
1406       int offset = slot / 2;
1407
1408       mrf = base_mrf + 1;
1409       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1410          emit_urb_slot(dst_reg(MRF, mrf++),
1411                        prog_data->vue_map.slot_to_varying[slot]);
1412
1413          /* If this was max_usable_mrf, we can't fit anything more into this
1414           * URB WRITE. Same thing if we reached the maximum length available.
1415           */
1416          if (mrf > max_usable_mrf ||
1417              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1418             slot++;
1419             break;
1420          }
1421       }
1422
1423       complete = slot >= prog_data->vue_map.num_slots;
1424       current_annotation = "URB write";
1425       vec4_instruction *inst = emit_urb_write_opcode(complete);
1426       inst->base_mrf = base_mrf;
1427       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1428       inst->offset += offset;
1429    } while(!complete);
1430 }
1431
1432
1433 src_reg
1434 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1435                                  src_reg *reladdr, int reg_offset)
1436 {
1437    /* Because we store the values to scratch interleaved like our
1438     * vertex data, we need to scale the vec4 index by 2.
1439     */
1440    int message_header_scale = 2;
1441
1442    /* Pre-gen6, the message header uses byte offsets instead of vec4
1443     * (16-byte) offset units.
1444     */
1445    if (devinfo->gen < 6)
1446       message_header_scale *= 16;
1447
1448    if (reladdr) {
1449       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1450        * to multiply the reladdr by 2. Notice that the reg_offset part
1451        * is in units of 16 bytes and is used to select the low/high 16-byte
1452        * chunk of a full dvec4, so we don't want to multiply that part.
1453        */
1454       src_reg index = src_reg(this, glsl_type::int_type);
1455       if (type_sz(inst->dst.type) < 8) {
1456          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1457                                       brw_imm_d(reg_offset)));
1458          emit_before(block, inst, MUL(dst_reg(index), index,
1459                                       brw_imm_d(message_header_scale)));
1460       } else {
1461          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1462                                       brw_imm_d(message_header_scale * 2)));
1463          emit_before(block, inst, ADD(dst_reg(index), index,
1464                                       brw_imm_d(reg_offset * message_header_scale)));
1465       }
1466       return index;
1467    } else {
1468       return brw_imm_d(reg_offset * message_header_scale);
1469    }
1470 }
1471
1472 /**
1473  * Emits an instruction before @inst to load the value named by @orig_src
1474  * from scratch space at @base_offset to @temp.
1475  *
1476  * @base_offset is measured in 32-byte units (the size of a register).
1477  */
1478 void
1479 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1480                                 dst_reg temp, src_reg orig_src,
1481                                 int base_offset)
1482 {
1483    assert(orig_src.offset % REG_SIZE == 0);
1484    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1485    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1486                                       reg_offset);
1487
1488    if (type_sz(orig_src.type) < 8) {
1489       emit_before(block, inst, SCRATCH_READ(temp, index));
1490    } else {
1491       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1492       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1493       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1494       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1495       vec4_instruction *last_read =
1496          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1497       emit_before(block, inst, last_read);
1498       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1499    }
1500 }
1501
1502 /**
1503  * Emits an instruction after @inst to store the value to be written
1504  * to @orig_dst to scratch space at @base_offset, from @temp.
1505  *
1506  * @base_offset is measured in 32-byte units (the size of a register).
1507  */
1508 void
1509 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1510                                  int base_offset)
1511 {
1512    assert(inst->dst.offset % REG_SIZE == 0);
1513    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1514    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1515                                       reg_offset);
1516
1517    /* Create a temporary register to store *inst's result in.
1518     *
1519     * We have to be careful in MOVing from our temporary result register in
1520     * the scratch write.  If we swizzle from channels of the temporary that
1521     * weren't initialized, it will confuse live interval analysis, which will
1522     * make spilling fail to make progress.
1523     */
1524    bool is_64bit = type_sz(inst->dst.type) == 8;
1525    const glsl_type *alloc_type =
1526       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1527    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1528                                        inst->dst.type),
1529                                 brw_swizzle_for_mask(inst->dst.writemask));
1530
1531    if (!is_64bit) {
1532       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1533                                           inst->dst.writemask));
1534       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1535       if (inst->opcode != BRW_OPCODE_SEL)
1536          write->predicate = inst->predicate;
1537       write->ir = inst->ir;
1538       write->annotation = inst->annotation;
1539       inst->insert_after(block, write);
1540    } else {
1541       dst_reg shuffled = dst_reg(this, alloc_type);
1542       vec4_instruction *last =
1543          shuffle_64bit_data(shuffled, temp, true, block, inst);
1544       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1545
1546       uint8_t mask = 0;
1547       if (inst->dst.writemask & WRITEMASK_X)
1548          mask |= WRITEMASK_XY;
1549       if (inst->dst.writemask & WRITEMASK_Y)
1550          mask |= WRITEMASK_ZW;
1551       if (mask) {
1552          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1553
1554          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1555          if (inst->opcode != BRW_OPCODE_SEL)
1556             write->predicate = inst->predicate;
1557          write->ir = inst->ir;
1558          write->annotation = inst->annotation;
1559          last->insert_after(block, write);
1560       }
1561
1562       mask = 0;
1563       if (inst->dst.writemask & WRITEMASK_Z)
1564          mask |= WRITEMASK_XY;
1565       if (inst->dst.writemask & WRITEMASK_W)
1566          mask |= WRITEMASK_ZW;
1567       if (mask) {
1568          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1569
1570          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1571                                             reg_offset + 1);
1572          vec4_instruction *write =
1573             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1574          if (inst->opcode != BRW_OPCODE_SEL)
1575             write->predicate = inst->predicate;
1576          write->ir = inst->ir;
1577          write->annotation = inst->annotation;
1578          last->insert_after(block, write);
1579       }
1580    }
1581
1582    inst->dst.file = temp.file;
1583    inst->dst.nr = temp.nr;
1584    inst->dst.offset %= REG_SIZE;
1585    inst->dst.reladdr = NULL;
1586 }
1587
1588 /**
1589  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1590  * adds the scratch read(s) before \p inst. The function also checks for
1591  * recursive reladdr scratch accesses, issuing the corresponding scratch
1592  * loads and rewriting reladdr references accordingly.
1593  *
1594  * \return \p src if it did not require a scratch load, otherwise, the
1595  * register holding the result of the scratch load that the caller should
1596  * use to rewrite src.
1597  */
1598 src_reg
1599 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1600                                    vec4_instruction *inst, src_reg src)
1601 {
1602    /* Resolve recursive reladdr scratch access by calling ourselves
1603     * with src.reladdr
1604     */
1605    if (src.reladdr)
1606       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1607                                           *src.reladdr);
1608
1609    /* Now handle scratch access on src */
1610    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1611       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1612          glsl_type::dvec4_type : glsl_type::vec4_type);
1613       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1614       src.nr = temp.nr;
1615       src.offset %= REG_SIZE;
1616       src.reladdr = NULL;
1617    }
1618
1619    return src;
1620 }
1621
1622 /**
1623  * We can't generally support array access in GRF space, because a
1624  * single instruction's destination can only span 2 contiguous
1625  * registers.  So, we send all GRF arrays that get variable index
1626  * access to scratch space.
1627  */
1628 void
1629 vec4_visitor::move_grf_array_access_to_scratch()
1630 {
1631    int scratch_loc[this->alloc.count];
1632    memset(scratch_loc, -1, sizeof(scratch_loc));
1633
1634    /* First, calculate the set of virtual GRFs that need to be punted
1635     * to scratch due to having any array access on them, and where in
1636     * scratch.
1637     */
1638    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1639       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1640          if (scratch_loc[inst->dst.nr] == -1) {
1641             scratch_loc[inst->dst.nr] = last_scratch;
1642             last_scratch += this->alloc.sizes[inst->dst.nr];
1643          }
1644
1645          for (src_reg *iter = inst->dst.reladdr;
1646               iter->reladdr;
1647               iter = iter->reladdr) {
1648             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1649                scratch_loc[iter->nr] = last_scratch;
1650                last_scratch += this->alloc.sizes[iter->nr];
1651             }
1652          }
1653       }
1654
1655       for (int i = 0 ; i < 3; i++) {
1656          for (src_reg *iter = &inst->src[i];
1657               iter->reladdr;
1658               iter = iter->reladdr) {
1659             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1660                scratch_loc[iter->nr] = last_scratch;
1661                last_scratch += this->alloc.sizes[iter->nr];
1662             }
1663          }
1664       }
1665    }
1666
1667    /* Now, for anything that will be accessed through scratch, rewrite
1668     * it to load/store.  Note that this is a _safe list walk, because
1669     * we may generate a new scratch_write instruction after the one
1670     * we're processing.
1671     */
1672    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1673       /* Set up the annotation tracking for new generated instructions. */
1674       base_ir = inst->ir;
1675       current_annotation = inst->annotation;
1676
1677       /* First handle scratch access on the dst. Notice we have to handle
1678        * the case where the dst's reladdr also points to scratch space.
1679        */
1680       if (inst->dst.reladdr)
1681          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1682                                                    *inst->dst.reladdr);
1683
1684       /* Now that we have handled any (possibly recursive) reladdr scratch
1685        * accesses for dst we can safely do the scratch write for dst itself
1686        */
1687       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1688          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1689
1690       /* Now handle scratch access on any src. In this case, since inst->src[i]
1691        * already is a src_reg, we can just call emit_resolve_reladdr with
1692        * inst->src[i] and it will take care of handling scratch loads for
1693        * both src and src.reladdr (recursively).
1694        */
1695       for (int i = 0 ; i < 3; i++) {
1696          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1697                                              inst->src[i]);
1698       }
1699    }
1700 }
1701
1702 /**
1703  * Emits an instruction before @inst to load the value named by @orig_src
1704  * from the pull constant buffer (surface) at @base_offset to @temp.
1705  */
1706 void
1707 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1708                                       dst_reg temp, src_reg orig_src,
1709                                       int base_offset, src_reg indirect)
1710 {
1711    assert(orig_src.offset % 16 == 0);
1712    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1713
1714    /* For 64bit loads we need to emit two 32-bit load messages and we also
1715     * we need to shuffle the 32-bit data result into proper 64-bit data. To do
1716     * that we emit the 32-bit loads into a temporary and we shuffle the result
1717     * into the original destination.
1718     */
1719    dst_reg orig_temp = temp;
1720    bool is_64bit = type_sz(orig_src.type) == 8;
1721    if (is_64bit) {
1722       assert(type_sz(temp.type) == 8);
1723       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1724       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1725    }
1726
1727    src_reg src = orig_src;
1728    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1729       int reg_offset = base_offset + src.offset / 16;
1730
1731       src_reg offset;
1732       if (indirect.file != BAD_FILE) {
1733          offset = src_reg(this, glsl_type::uint_type);
1734          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1735                                       brw_imm_ud(reg_offset * 16)));
1736       } else if (devinfo->gen >= 8) {
1737          /* Store the offset in a GRF so we can send-from-GRF. */
1738          offset = src_reg(this, glsl_type::uint_type);
1739          emit_before(block, inst, MOV(dst_reg(offset),
1740                                       brw_imm_ud(reg_offset * 16)));
1741       } else {
1742          offset = brw_imm_d(reg_offset * 16);
1743       }
1744
1745       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1746                                   brw_imm_ud(index),
1747                                   offset,
1748                                   block, inst);
1749
1750       src = byte_offset(src, 16);
1751    }
1752
1753    if (is_64bit) {
1754       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1755       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1756    }
1757 }
1758
1759 /**
1760  * Implements array access of uniforms by inserting a
1761  * PULL_CONSTANT_LOAD instruction.
1762  *
1763  * Unlike temporary GRF array access (where we don't support it due to
1764  * the difficulty of doing relative addressing on instruction
1765  * destinations), we could potentially do array access of uniforms
1766  * that were loaded in GRF space as push constants.  In real-world
1767  * usage we've seen, though, the arrays being used are always larger
1768  * than we could load as push constants, so just always move all
1769  * uniform array access out to a pull constant buffer.
1770  */
1771 void
1772 vec4_visitor::move_uniform_array_access_to_pull_constants()
1773 {
1774    /* The vulkan dirver doesn't support pull constants other than UBOs so
1775     * everything has to be pushed regardless.
1776     */
1777    if (!compiler->supports_pull_constants) {
1778       split_uniform_registers();
1779       return;
1780    }
1781
1782    /* Allocate the pull_params array */
1783    assert(stage_prog_data->nr_pull_params == 0);
1784    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1785                                               this->uniforms * 4);
1786
1787    int pull_constant_loc[this->uniforms];
1788    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1789
1790    /* First, walk through the instructions and determine which things need to
1791     * be pulled.  We mark something as needing to be pulled by setting
1792     * pull_constant_loc to 0.
1793     */
1794    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1795       /* We only care about MOV_INDIRECT of a uniform */
1796       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1797           inst->src[0].file != UNIFORM)
1798          continue;
1799
1800       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1801
1802       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1803          pull_constant_loc[uniform_nr + j] = 0;
1804    }
1805
1806    /* Next, we walk the list of uniforms and assign real pull constant
1807     * locations and set their corresponding entries in pull_param.
1808     */
1809    for (int j = 0; j < this->uniforms; j++) {
1810       if (pull_constant_loc[j] < 0)
1811          continue;
1812
1813       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1814
1815       for (int i = 0; i < 4; i++) {
1816          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1817             = stage_prog_data->param[j * 4 + i];
1818       }
1819    }
1820
1821    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1822     * instructions to actual uniform pulls.
1823     */
1824    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1825       /* We only care about MOV_INDIRECT of a uniform */
1826       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1827           inst->src[0].file != UNIFORM)
1828          continue;
1829
1830       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1831
1832       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1833
1834       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1835                               pull_constant_loc[uniform_nr], inst->src[1]);
1836       inst->remove(block);
1837    }
1838
1839    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1840     * no need to track them as larger-than-vec4 objects.  This will be
1841     * relied on in cutting out unused uniform vectors from push
1842     * constants.
1843     */
1844    split_uniform_registers();
1845 }
1846
1847 void
1848 vec4_visitor::resolve_ud_negate(src_reg *reg)
1849 {
1850    if (reg->type != BRW_REGISTER_TYPE_UD ||
1851        !reg->negate)
1852       return;
1853
1854    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1855    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1856    *reg = temp;
1857 }
1858
1859 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1860                            void *log_data,
1861                            const struct brw_sampler_prog_key_data *key_tex,
1862                            struct brw_vue_prog_data *prog_data,
1863                            const nir_shader *shader,
1864                            void *mem_ctx,
1865                            bool no_spills,
1866                            int shader_time_index)
1867    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1868      key_tex(key_tex),
1869      prog_data(prog_data),
1870      fail_msg(NULL),
1871      first_non_payload_grf(0),
1872      need_all_constants_in_pull_buffer(false),
1873      no_spills(no_spills),
1874      shader_time_index(shader_time_index),
1875      last_scratch(0)
1876 {
1877    this->failed = false;
1878
1879    this->base_ir = NULL;
1880    this->current_annotation = NULL;
1881    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1882
1883    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1884
1885    this->virtual_grf_start = NULL;
1886    this->virtual_grf_end = NULL;
1887    this->live_intervals = NULL;
1888
1889    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1890
1891    this->uniforms = 0;
1892 }
1893
1894
1895 void
1896 vec4_visitor::fail(const char *format, ...)
1897 {
1898    va_list va;
1899    char *msg;
1900
1901    if (failed)
1902       return;
1903
1904    failed = true;
1905
1906    va_start(va, format);
1907    msg = ralloc_vasprintf(mem_ctx, format, va);
1908    va_end(va);
1909    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1910
1911    this->fail_msg = msg;
1912
1913    if (debug_enabled) {
1914       fprintf(stderr, "%s",  msg);
1915    }
1916 }
1917
1918 } /* namespace brw */