src/intel/compiler/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_eu.h"
  27 #include "util/u_math.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->shadow_compare = false;
  50    this->eot = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->exec_size = 8;
  59    this->group = 0;
  60    this->size_written = (dst.file == BAD_FILE ?
  61                          0 : this->exec_size * type_sz(dst.type));
  62    this->annotation = NULL;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(vec4_instruction *inst)
  67 {
  68    inst->ir = this->base_ir;
  69    inst->annotation = this->current_annotation;
  70
  71    this->instructions.push_tail(inst);
  72
  73    return inst;
  74 }
  75
  76 vec4_instruction *
  77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  78                           vec4_instruction *new_inst)
  79 {
  80    new_inst->ir = inst->ir;
  81    new_inst->annotation = inst->annotation;
  82
  83    inst->insert_before(block, new_inst);
  84
  85    return inst;
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  90                    const src_reg &src1, const src_reg &src2)
  91 {
  92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  93 }
  94
  95
  96 vec4_instruction *
  97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  98                    const src_reg &src1)
  99 {
 100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
 101 }
 102
 103 vec4_instruction *
 104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 105 {
 106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 107 }
 108
 109 vec4_instruction *
 110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 111 {
 112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 113 }
 114
 115 vec4_instruction *
 116 vec4_visitor::emit(enum opcode opcode)
 117 {
 118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 119 }
 120
 121 #define ALU1(op)                                                        \
 122    vec4_instruction *                                                   \
 123    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 124    {                                                                    \
 125       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 126    }
 127
 128 #define ALU2(op)                                                        \
 129    vec4_instruction *                                                   \
 130    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 131                     const src_reg &src1)                                \
 132    {                                                                    \
 133       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 134                                            src0, src1);                 \
 135    }
 136
 137 #define ALU2_ACC(op)                                                    \
 138    vec4_instruction *                                                   \
 139    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 140                     const src_reg &src1)                                \
 141    {                                                                    \
 142       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 143                        BRW_OPCODE_##op, dst, src0, src1);               \
 144       inst->writes_accumulator = true;                                  \
 145       return inst;                                                      \
 146    }
 147
 148 #define ALU3(op)                                                        \
 149    vec4_instruction *                                                   \
 150    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 151                     const src_reg &src1, const src_reg &src2)           \
 152    {                                                                    \
 153       assert(devinfo->gen >= 6);                                                \
 154       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 155                                            src0, src1, src2);           \
 156    }
 157
 158 ALU1(NOT)
 159 ALU1(MOV)
 160 ALU1(FRC)
 161 ALU1(RNDD)
 162 ALU1(RNDE)
 163 ALU1(RNDZ)
 164 ALU1(F32TO16)
 165 ALU1(F16TO32)
 166 ALU2(ADD)
 167 ALU2(MUL)
 168 ALU2_ACC(MACH)
 169 ALU2(AND)
 170 ALU2(OR)
 171 ALU2(XOR)
 172 ALU2(DP3)
 173 ALU2(DP4)
 174 ALU2(DPH)
 175 ALU2(SHL)
 176 ALU2(SHR)
 177 ALU2(ASR)
 178 ALU3(LRP)
 179 ALU1(BFREV)
 180 ALU3(BFE)
 181 ALU2(BFI1)
 182 ALU3(BFI2)
 183 ALU1(FBH)
 184 ALU1(FBL)
 185 ALU1(CBIT)
 186 ALU3(MAD)
 187 ALU2_ACC(ADDC)
 188 ALU2_ACC(SUBB)
 189 ALU2(MAC)
 190 ALU1(DIM)
 191
 192 /** Gen4 predicated IF. */
 193 vec4_instruction *
 194 vec4_visitor::IF(enum brw_predicate predicate)
 195 {
 196    vec4_instruction *inst;
 197
 198    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 199    inst->predicate = predicate;
 200
 201    return inst;
 202 }
 203
 204 /** Gen6 IF with embedded comparison. */
 205 vec4_instruction *
 206 vec4_visitor::IF(src_reg src0, src_reg src1,
 207                  enum brw_conditional_mod condition)
 208 {
 209    assert(devinfo->gen == 6);
 210
 211    vec4_instruction *inst;
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 217                                         src0, src1);
 218    inst->conditional_mod = condition;
 219
 220    return inst;
 221 }
 222
 223 /**
 224  * CMP: Sets the low bit of the destination channels with the result
 225  * of the comparison, while the upper bits are undefined, and updates
 226  * the flag register with the packed 16 bits of the result.
 227  */
 228 vec4_instruction *
 229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 230                   enum brw_conditional_mod condition)
 231 {
 232    vec4_instruction *inst;
 233
 234    /* Take the instruction:
 235     *
 236     * CMP null<d> src0<f> src1<f>
 237     *
 238     * Original gen4 does type conversion to the destination type before
 239     * comparison, producing garbage results for floating point comparisons.
 240     *
 241     * The destination type doesn't matter on newer generations, so we set the
 242     * type to match src0 so we can compact the instruction.
 243     */
 244    dst.type = src0.type;
 245
 246    resolve_ud_negate(&src0);
 247    resolve_ud_negate(&src1);
 248
 249    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 250    inst->conditional_mod = condition;
 251
 252    return inst;
 253 }
 254
 255 vec4_instruction *
 256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 257 {
 258    vec4_instruction *inst;
 259
 260    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 261                                         dst, index);
 262    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 263    inst->mlen = 2;
 264
 265    return inst;
 266 }
 267
 268 vec4_instruction *
 269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 270                             const src_reg &index)
 271 {
 272    vec4_instruction *inst;
 273
 274    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 275                                         dst, src, index);
 276    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 277    inst->mlen = 3;
 278
 279    return inst;
 280 }
 281
 282 src_reg
 283 vec4_visitor::fix_3src_operand(const src_reg &src)
 284 {
 285    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 286     * able to use vertical stride of zero to replicate the vec4 uniform, like
 287     *
 288     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 289     *
 290     * But you can't, since vertical stride is always four in three-source
 291     * instructions. Instead, insert a MOV instruction to do the replication so
 292     * that the three-source instruction can consume it.
 293     */
 294
 295    /* The MOV is only needed if the source is a uniform or immediate. */
 296    if (src.file != UNIFORM && src.file != IMM)
 297       return src;
 298
 299    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 300       return src;
 301
 302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 303    expanded.type = src.type;
 304    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 305    return src_reg(expanded);
 306 }
 307
 308 src_reg
 309 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 310 {
 311    if (!src.abs && !src.negate)
 312       return src;
 313
 314    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 315    resolved.type = src.type;
 316    emit(MOV(resolved, src));
 317
 318    return src_reg(resolved);
 319 }
 320
 321 src_reg
 322 vec4_visitor::fix_math_operand(const src_reg &src)
 323 {
 324    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 325       return src;
 326
 327    /* The gen6 math instruction ignores the source modifiers --
 328     * swizzle, abs, negate, and at least some parts of the register
 329     * region description.
 330     *
 331     * Rather than trying to enumerate all these cases, *always* expand the
 332     * operand to a temp GRF for gen6.
 333     *
 334     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 335     * can't use.
 336     */
 337
 338    if (devinfo->gen == 7 && src.file != IMM)
 339       return src;
 340
 341    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 342    expanded.type = src.type;
 343    emit(MOV(expanded, src));
 344    return src_reg(expanded);
 345 }
 346
 347 vec4_instruction *
 348 vec4_visitor::emit_math(enum opcode opcode,
 349                         const dst_reg &dst,
 350                         const src_reg &src0, const src_reg &src1)
 351 {
 352    vec4_instruction *math =
 353       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 354
 355    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 356       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 357       math->dst = dst_reg(this, glsl_type::vec4_type);
 358       math->dst.type = dst.type;
 359       math = emit(MOV(dst, src_reg(math->dst)));
 360    } else if (devinfo->gen < 6) {
 361       math->base_mrf = 1;
 362       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 363    }
 364
 365    return math;
 366 }
 367
 368 void
 369 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 370 {
 371    if (devinfo->gen < 7) {
 372       unreachable("ir_unop_pack_half_2x16 should be lowered");
 373    }
 374
 375    assert(dst.type == BRW_REGISTER_TYPE_UD);
 376    assert(src0.type == BRW_REGISTER_TYPE_F);
 377
 378    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 379     *
 380     *   Because this instruction does not have a 16-bit floating-point type,
 381     *   the destination data type must be Word (W).
 382     *
 383     *   The destination must be DWord-aligned and specify a horizontal stride
 384     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 385     *   each destination channel and the upper word is not modified.
 386     *
 387     * The above restriction implies that the f32to16 instruction must use
 388     * align1 mode, because only in align1 mode is it possible to specify
 389     * horizontal stride.  We choose here to defy the hardware docs and emit
 390     * align16 instructions.
 391     *
 392     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 393     * instructions. I was partially successful in that the code passed all
 394     * tests.  However, the code was dubiously correct and fragile, and the
 395     * tests were not harsh enough to probe that frailty. Not trusting the
 396     * code, I chose instead to remain in align16 mode in defiance of the hw
 397     * docs).
 398     *
 399     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 400     * simulator, emitting a f32to16 in align16 mode with UD as destination
 401     * data type is safe. The behavior differs from that specified in the PRM
 402     * in that the upper word of each destination channel is cleared to 0.
 403     */
 404
 405    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 406    src_reg tmp_src(tmp_dst);
 407
 408 #if 0
 409    /* Verify the undocumented behavior on which the following instructions
 410     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 411     * then the result of the bit-or instruction below will be incorrect.
 412     *
 413     * You should inspect the disasm output in order to verify that the MOV is
 414     * not optimized away.
 415     */
 416    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 417 #endif
 418
 419    /* Give tmp the form below, where "." means untouched.
 420     *
 421     *     w z          y          x w z          y          x
 422     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 423     *
 424     * That the upper word of each write-channel be 0 is required for the
 425     * following bit-shift and bit-or instructions to work. Note that this
 426     * relies on the undocumented hardware behavior mentioned above.
 427     */
 428    tmp_dst.writemask = WRITEMASK_XY;
 429    emit(F32TO16(tmp_dst, src0));
 430
 431    /* Give the write-channels of dst the form:
 432     *   0xhhhh0000
 433     */
 434    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 435    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 436
 437    /* Finally, give the write-channels of dst the form of packHalf2x16's
 438     * output:
 439     *   0xhhhhllll
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 442    emit(OR(dst, src_reg(dst), tmp_src));
 443 }
 444
 445 void
 446 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 447 {
 448    if (devinfo->gen < 7) {
 449       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 450    }
 451
 452    assert(dst.type == BRW_REGISTER_TYPE_F);
 453    assert(src0.type == BRW_REGISTER_TYPE_UD);
 454
 455    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 456     *
 457     *   Because this instruction does not have a 16-bit floating-point type,
 458     *   the source data type must be Word (W). The destination type must be
 459     *   F (Float).
 460     *
 461     * To use W as the source data type, we must adjust horizontal strides,
 462     * which is only possible in align1 mode. All my [chadv] attempts at
 463     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 464     * Piglit tests, so I gave up.
 465     *
 466     * I've verified that, on gen7 hardware and the simulator, it is safe to
 467     * emit f16to32 in align16 mode with UD as source data type.
 468     */
 469
 470    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 471    src_reg tmp_src(tmp_dst);
 472
 473    tmp_dst.writemask = WRITEMASK_X;
 474    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 475
 476    tmp_dst.writemask = WRITEMASK_Y;
 477    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 478
 479    dst.writemask = WRITEMASK_XY;
 480    emit(F16TO32(dst, tmp_src));
 481 }
 482
 483 void
 484 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 485 {
 486    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 487     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 488     * is not suitable to generate the shift values, but we can use the packed
 489     * vector float and a type-converting MOV.
 490     */
 491    dst_reg shift(this, glsl_type::uvec4_type);
 492    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 493
 494    dst_reg shifted(this, glsl_type::uvec4_type);
 495    src0.swizzle = BRW_SWIZZLE_XXXX;
 496    emit(SHR(shifted, src0, src_reg(shift)));
 497
 498    shifted.type = BRW_REGISTER_TYPE_UB;
 499    dst_reg f(this, glsl_type::vec4_type);
 500    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 501
 502    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 503 }
 504
 505 void
 506 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 507 {
 508    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 509     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 510     * is not suitable to generate the shift values, but we can use the packed
 511     * vector float and a type-converting MOV.
 512     */
 513    dst_reg shift(this, glsl_type::uvec4_type);
 514    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 515
 516    dst_reg shifted(this, glsl_type::uvec4_type);
 517    src0.swizzle = BRW_SWIZZLE_XXXX;
 518    emit(SHR(shifted, src0, src_reg(shift)));
 519
 520    shifted.type = BRW_REGISTER_TYPE_B;
 521    dst_reg f(this, glsl_type::vec4_type);
 522    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 523
 524    dst_reg scaled(this, glsl_type::vec4_type);
 525    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 526
 527    dst_reg max(this, glsl_type::vec4_type);
 528    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 529    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 530 }
 531
 532 void
 533 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 534 {
 535    dst_reg saturated(this, glsl_type::vec4_type);
 536    vec4_instruction *inst = emit(MOV(saturated, src0));
 537    inst->saturate = true;
 538
 539    dst_reg scaled(this, glsl_type::vec4_type);
 540    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 541
 542    dst_reg rounded(this, glsl_type::vec4_type);
 543    emit(RNDE(rounded, src_reg(scaled)));
 544
 545    dst_reg u(this, glsl_type::uvec4_type);
 546    emit(MOV(u, src_reg(rounded)));
 547
 548    src_reg bytes(u);
 549    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 550 }
 551
 552 void
 553 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 554 {
 555    dst_reg max(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 557
 558    dst_reg min(this, glsl_type::vec4_type);
 559    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 560
 561    dst_reg scaled(this, glsl_type::vec4_type);
 562    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 563
 564    dst_reg rounded(this, glsl_type::vec4_type);
 565    emit(RNDE(rounded, src_reg(scaled)));
 566
 567    dst_reg i(this, glsl_type::ivec4_type);
 568    emit(MOV(i, src_reg(rounded)));
 569
 570    src_reg bytes(i);
 571    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 572 }
 573
 574 /*
 575  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 576  * false) elements needed to pack a type.
 577  */
 578 static int
 579 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
 580 {
 581    unsigned int i;
 582    int size;
 583
 584    switch (type->base_type) {
 585    case GLSL_TYPE_UINT:
 586    case GLSL_TYPE_INT:
 587    case GLSL_TYPE_FLOAT:
 588    case GLSL_TYPE_FLOAT16:
 589    case GLSL_TYPE_BOOL:
 590    case GLSL_TYPE_DOUBLE:
 591    case GLSL_TYPE_UINT16:
 592    case GLSL_TYPE_INT16:
 593    case GLSL_TYPE_UINT8:
 594    case GLSL_TYPE_INT8:
 595    case GLSL_TYPE_UINT64:
 596    case GLSL_TYPE_INT64:
 597       if (type->is_matrix()) {
 598          const glsl_type *col_type = type->column_type();
 599          unsigned col_slots =
 600             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
 601          return type->matrix_columns * col_slots;
 602       } else {
 603          /* Regardless of size of vector, it gets a vec4. This is bad
 604           * packing for things like floats, but otherwise arrays become a
 605           * mess.  Hopefully a later pass over the code can pack scalars
 606           * down if appropriate.
 607           */
 608          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
 609       }
 610    case GLSL_TYPE_ARRAY:
 611       assert(type->length > 0);
 612       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
 613    case GLSL_TYPE_STRUCT:
 614       size = 0;
 615       for (i = 0; i < type->length; i++) {
 616          size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
 617       }
 618       return size;
 619    case GLSL_TYPE_SUBROUTINE:
 620       return 1;
 621
 622    case GLSL_TYPE_SAMPLER:
 623       /* Samplers take up no register space, since they're baked in at
 624        * link time.
 625        */
 626       return 0;
 627    case GLSL_TYPE_ATOMIC_UINT:
 628       return 0;
 629    case GLSL_TYPE_IMAGE:
 630       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 631    case GLSL_TYPE_VOID:
 632    case GLSL_TYPE_ERROR:
 633    case GLSL_TYPE_INTERFACE:
 634    case GLSL_TYPE_FUNCTION:
 635       unreachable("not reached");
 636    }
 637
 638    return 0;
 639 }
 640
 641 /**
 642  * Returns the minimum number of vec4 elements needed to pack a type.
 643  *
 644  * For simple types, it will return 1 (a single vec4); for matrices, the
 645  * number of columns; for array and struct, the sum of the vec4_size of
 646  * each of its elements; and for sampler and atomic, zero.
 647  *
 648  * This method is useful to calculate how much register space is needed to
 649  * store a particular type.
 650  */
 651 extern "C" int
 652 type_size_vec4(const struct glsl_type *type)
 653 {
 654    return type_size_xvec4(type, true);
 655 }
 656
 657 /**
 658  * Returns the minimum number of dvec4 elements needed to pack a type.
 659  *
 660  * For simple types, it will return 1 (a single dvec4); for matrices, the
 661  * number of columns; for array and struct, the sum of the dvec4_size of
 662  * each of its elements; and for sampler and atomic, zero.
 663  *
 664  * This method is useful to calculate how much register space is needed to
 665  * store a particular type.
 666  *
 667  * Measuring double-precision vertex inputs as dvec4 is required because
 668  * ARB_vertex_attrib_64bit states that these uses the same number of locations
 669  * than the single-precision version. That is, two consecutives dvec4 would be
 670  * located in location "x" and location "x+1", not "x+2".
 671  *
 672  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 673  * remap_vs_attrs() will take in account both the location and also if the
 674  * type fits in one or two vec4 slots.
 675  */
 676 extern "C" int
 677 type_size_dvec4(const struct glsl_type *type)
 678 {
 679    return type_size_xvec4(type, false);
 680 }
 681
 682 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 683 {
 684    init();
 685
 686    this->file = VGRF;
 687    this->nr = v->alloc.allocate(type_size_vec4(type));
 688
 689    if (type->is_array() || type->is_record()) {
 690       this->swizzle = BRW_SWIZZLE_NOOP;
 691    } else {
 692       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 693    }
 694
 695    this->type = brw_type_for_base_type(type);
 696 }
 697
 698 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 699 {
 700    assert(size > 0);
 701
 702    init();
 703
 704    this->file = VGRF;
 705    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
 706
 707    this->swizzle = BRW_SWIZZLE_NOOP;
 708
 709    this->type = brw_type_for_base_type(type);
 710 }
 711
 712 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 713 {
 714    init();
 715
 716    this->file = VGRF;
 717    this->nr = v->alloc.allocate(type_size_vec4(type));
 718
 719    if (type->is_array() || type->is_record()) {
 720       this->writemask = WRITEMASK_XYZW;
 721    } else {
 722       this->writemask = (1 << type->vector_elements) - 1;
 723    }
 724
 725    this->type = brw_type_for_base_type(type);
 726 }
 727
 728 vec4_instruction *
 729 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 730                           src_reg src0, src_reg src1)
 731 {
 732    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 733    inst->conditional_mod = conditionalmod;
 734    return inst;
 735 }
 736
 737 vec4_instruction *
 738 vec4_visitor::emit_lrp(const dst_reg &dst,
 739                        const src_reg &x, const src_reg &y, const src_reg &a)
 740 {
 741    if (devinfo->gen >= 6 && devinfo->gen <= 10) {
 742       /* Note that the instruction's argument order is reversed from GLSL
 743        * and the IR.
 744        */
 745      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 746                      fix_3src_operand(x)));
 747    } else {
 748       /* Earlier generations don't support three source operations, so we
 749        * need to emit x*(1-a) + y*a.
 750        */
 751       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 752       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 753       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 754       y_times_a.writemask           = dst.writemask;
 755       one_minus_a.writemask         = dst.writemask;
 756       x_times_one_minus_a.writemask = dst.writemask;
 757
 758       emit(MUL(y_times_a, y, a));
 759       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
 760       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 761       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 762    }
 763 }
 764
 765 /**
 766  * Emits the instructions needed to perform a pull constant load. before_block
 767  * and before_inst can be NULL in which case the instruction will be appended
 768  * to the end of the instruction list.
 769  */
 770 void
 771 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 772                                           src_reg surf_index,
 773                                           src_reg offset_reg,
 774                                           bblock_t *before_block,
 775                                           vec4_instruction *before_inst)
 776 {
 777    assert((before_inst == NULL && before_block == NULL) ||
 778           (before_inst && before_block));
 779
 780    vec4_instruction *pull;
 781
 782    if (devinfo->gen >= 9) {
 783       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 784       src_reg header(this, glsl_type::uvec4_type, 2);
 785
 786       pull = new(mem_ctx)
 787          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 788                           dst_reg(header));
 789
 790       if (before_inst)
 791          emit_before(before_block, before_inst, pull);
 792       else
 793          emit(pull);
 794
 795       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
 796                                  offset_reg.type);
 797       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 798
 799       if (before_inst)
 800          emit_before(before_block, before_inst, pull);
 801       else
 802          emit(pull);
 803
 804       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 805                                            dst,
 806                                            surf_index,
 807                                            header);
 808       pull->mlen = 2;
 809       pull->header_size = 1;
 810    } else if (devinfo->gen >= 7) {
 811       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
 812
 813       grf_offset.type = offset_reg.type;
 814
 815       pull = MOV(grf_offset, offset_reg);
 816
 817       if (before_inst)
 818          emit_before(before_block, before_inst, pull);
 819       else
 820          emit(pull);
 821
 822       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 823                                            dst,
 824                                            surf_index,
 825                                            src_reg(grf_offset));
 826       pull->mlen = 1;
 827    } else {
 828       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 829                                            dst,
 830                                            surf_index,
 831                                            offset_reg);
 832       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 833       pull->mlen = 1;
 834    }
 835
 836    if (before_inst)
 837       emit_before(before_block, before_inst, pull);
 838    else
 839       emit(pull);
 840 }
 841
 842 src_reg
 843 vec4_visitor::emit_uniformize(const src_reg &src)
 844 {
 845    const src_reg chan_index(this, glsl_type::uint_type);
 846    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 847                               src.type);
 848
 849    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 850       ->force_writemask_all = true;
 851    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 852       ->force_writemask_all = true;
 853
 854    return src_reg(dst);
 855 }
 856
 857 src_reg
 858 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 859                              src_reg coordinate, src_reg surface)
 860 {
 861    vec4_instruction *inst =
 862       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 863                                     dst_reg(this, glsl_type::uvec4_type));
 864    inst->base_mrf = 2;
 865    inst->src[1] = surface;
 866    inst->src[2] = surface;
 867
 868    int param_base;
 869
 870    if (devinfo->gen >= 9) {
 871       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 872       vec4_instruction *header_inst = new(mem_ctx)
 873          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 874                           dst_reg(MRF, inst->base_mrf));
 875
 876       emit(header_inst);
 877
 878       inst->mlen = 2;
 879       inst->header_size = 1;
 880       param_base = inst->base_mrf + 1;
 881    } else {
 882       inst->mlen = 1;
 883       param_base = inst->base_mrf;
 884    }
 885
 886    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 887    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 888    int zero_mask = 0xf & ~coord_mask;
 889
 890    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 891             coordinate));
 892
 893    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 894             brw_imm_d(0)));
 895
 896    emit(inst);
 897    return src_reg(inst->dst);
 898 }
 899
 900 bool
 901 vec4_visitor::is_high_sampler(src_reg sampler)
 902 {
 903    if (devinfo->gen < 8 && !devinfo->is_haswell)
 904       return false;
 905
 906    return sampler.file != IMM || sampler.ud >= 16;
 907 }
 908
 909 void
 910 vec4_visitor::emit_texture(ir_texture_opcode op,
 911                            dst_reg dest,
 912                            const glsl_type *dest_type,
 913                            src_reg coordinate,
 914                            int coord_components,
 915                            src_reg shadow_comparator,
 916                            src_reg lod, src_reg lod2,
 917                            src_reg sample_index,
 918                            uint32_t constant_offset,
 919                            src_reg offset_value,
 920                            src_reg mcs,
 921                            uint32_t surface,
 922                            src_reg surface_reg,
 923                            src_reg sampler_reg)
 924 {
 925    enum opcode opcode;
 926    switch (op) {
 927    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 928    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 929    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 930    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 931    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
 932                              SHADER_OPCODE_TXF_CMS); break;
 933    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 934    case ir_tg4: opcode = offset_value.file != BAD_FILE
 935                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 936    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 937    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 938    case ir_txb:
 939       unreachable("TXB is not valid for vertex shaders.");
 940    case ir_lod:
 941       unreachable("LOD is not valid for vertex shaders.");
 942    case ir_samples_identical: {
 943       /* There are some challenges implementing this for vec4, and it seems
 944        * unlikely to be used anyway.  For now, just return false ways.
 945        */
 946       emit(MOV(dest, brw_imm_ud(0u)));
 947       return;
 948    }
 949    default:
 950       unreachable("Unrecognized tex op");
 951    }
 952
 953    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
 954
 955    inst->offset = constant_offset;
 956
 957    /* The message header is necessary for:
 958     * - Gen4 (always)
 959     * - Gen9+ for selecting SIMD4x2
 960     * - Texel offsets
 961     * - Gather channel selection
 962     * - Sampler indices too large to fit in a 4-bit value.
 963     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 964     */
 965    inst->header_size =
 966       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 967        inst->offset != 0 || op == ir_tg4 ||
 968        op == ir_texture_samples ||
 969        is_high_sampler(sampler_reg)) ? 1 : 0;
 970    inst->base_mrf = 2;
 971    inst->mlen = inst->header_size;
 972    inst->dst.writemask = WRITEMASK_XYZW;
 973    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
 974
 975    inst->src[1] = surface_reg;
 976    inst->src[2] = sampler_reg;
 977
 978    /* MRF for the first parameter */
 979    int param_base = inst->base_mrf + inst->header_size;
 980
 981    if (op == ir_txs || op == ir_query_levels) {
 982       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 983       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 984       inst->mlen++;
 985    } else if (op == ir_texture_samples) {
 986       inst->dst.writemask = WRITEMASK_X;
 987    } else {
 988       /* Load the coordinate */
 989       /* FINISHME: gl_clamp_mask and saturate */
 990       int coord_mask = (1 << coord_components) - 1;
 991       int zero_mask = 0xf & ~coord_mask;
 992
 993       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 994                coordinate));
 995       inst->mlen++;
 996
 997       if (zero_mask != 0) {
 998          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
 999                   brw_imm_d(0)));
1000       }
1001       /* Load the shadow comparator */
1002       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1003          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1004                           WRITEMASK_X),
1005                   shadow_comparator));
1006          inst->mlen++;
1007       }
1008
1009       /* Load the LOD info */
1010       if (op == ir_tex || op == ir_txl) {
1011          int mrf, writemask;
1012          if (devinfo->gen >= 5) {
1013             mrf = param_base + 1;
1014             if (shadow_comparator.file != BAD_FILE) {
1015                writemask = WRITEMASK_Y;
1016                /* mlen already incremented */
1017             } else {
1018                writemask = WRITEMASK_X;
1019                inst->mlen++;
1020             }
1021          } else /* devinfo->gen == 4 */ {
1022             mrf = param_base;
1023             writemask = WRITEMASK_W;
1024          }
1025          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1026       } else if (op == ir_txf) {
1027          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1028       } else if (op == ir_txf_ms) {
1029          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1030                   sample_index));
1031          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1032             /* MCS data is stored in the first two channels of ‘mcs’, but we
1033              * need to get it into the .y and .z channels of the second vec4
1034              * of params.
1035              */
1036             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1037             emit(MOV(dst_reg(MRF, param_base + 1,
1038                              glsl_type::uint_type, WRITEMASK_YZ),
1039                      mcs));
1040          } else if (devinfo->gen >= 7) {
1041             /* MCS data is in the first channel of `mcs`, but we need to get it into
1042              * the .y channel of the second vec4 of params, so replicate .x across
1043              * the whole vec4 and then mask off everything except .y
1044              */
1045             mcs.swizzle = BRW_SWIZZLE_XXXX;
1046             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1047                      mcs));
1048          }
1049          inst->mlen++;
1050       } else if (op == ir_txd) {
1051          const brw_reg_type type = lod.type;
1052
1053          if (devinfo->gen >= 5) {
1054             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1055             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1056             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1057             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1058             inst->mlen++;
1059
1060             if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1061                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1062                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1063                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1064                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1065                inst->mlen++;
1066
1067                if (shadow_comparator.file != BAD_FILE) {
1068                   emit(MOV(dst_reg(MRF, param_base + 2,
1069                                    shadow_comparator.type, WRITEMASK_Z),
1070                            shadow_comparator));
1071                }
1072             }
1073          } else /* devinfo->gen == 4 */ {
1074             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1075             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1076             inst->mlen += 2;
1077          }
1078       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1079          if (shadow_comparator.file != BAD_FILE) {
1080             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1081                      shadow_comparator));
1082          }
1083
1084          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1085                   offset_value));
1086          inst->mlen++;
1087       }
1088    }
1089
1090    emit(inst);
1091
1092    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1093     * spec requires layers.
1094     */
1095    if (op == ir_txs && devinfo->gen < 7) {
1096       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1097       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1098                   src_reg(inst->dst), brw_imm_d(1));
1099    }
1100
1101    if (devinfo->gen == 6 && op == ir_tg4) {
1102       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1103    }
1104
1105    if (op == ir_query_levels) {
1106       /* # levels is in .w */
1107       src_reg swizzled(dest);
1108       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1109                                       SWIZZLE_W, SWIZZLE_W);
1110       emit(MOV(dest, swizzled));
1111    }
1112 }
1113
1114 /**
1115  * Apply workarounds for Gen6 gather with UINT/SINT
1116  */
1117 void
1118 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1119 {
1120    if (!wa)
1121       return;
1122
1123    int width = (wa & WA_8BIT) ? 8 : 16;
1124    dst_reg dst_f = dst;
1125    dst_f.type = BRW_REGISTER_TYPE_F;
1126
1127    /* Convert from UNORM to UINT */
1128    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1129    emit(MOV(dst, src_reg(dst_f)));
1130
1131    if (wa & WA_SIGN) {
1132       /* Reinterpret the UINT value as a signed INT value by
1133        * shifting the sign bit into place, then shifting back
1134        * preserving sign.
1135        */
1136       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1137       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1138    }
1139 }
1140
1141 void
1142 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1143 {
1144    unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::gs_end_primitive()
1149 {
1150    unreachable("not reached");
1151 }
1152
1153 void
1154 vec4_visitor::emit_ndc_computation()
1155 {
1156    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1157       return;
1158
1159    /* Get the position */
1160    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1161
1162    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1163    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1164    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1165    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1166
1167    current_annotation = "NDC";
1168    dst_reg ndc_w = ndc;
1169    ndc_w.writemask = WRITEMASK_W;
1170    src_reg pos_w = pos;
1171    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1172    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1173
1174    dst_reg ndc_xyz = ndc;
1175    ndc_xyz.writemask = WRITEMASK_XYZ;
1176
1177    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1178 }
1179
1180 void
1181 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1182 {
1183    if (devinfo->gen < 6 &&
1184        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1185         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1186         devinfo->has_negative_rhw_bug)) {
1187       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1188       dst_reg header1_w = header1;
1189       header1_w.writemask = WRITEMASK_W;
1190
1191       emit(MOV(header1, brw_imm_ud(0u)));
1192
1193       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1194          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1195
1196          current_annotation = "Point size";
1197          emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1198          emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1199       }
1200
1201       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1202          current_annotation = "Clipping flags";
1203          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1204
1205          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1206          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1207          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1208       }
1209
1210       if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1211          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1212          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1213          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1214          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1215          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1216       }
1217
1218       /* i965 clipping workaround:
1219        * 1) Test for -ve rhw
1220        * 2) If set,
1221        *      set ndc = (0,0,0,0)
1222        *      set ucp[6] = 1
1223        *
1224        * Later, clipping will detect ucp[6] and ensure the primitive is
1225        * clipped against all fixed planes.
1226        */
1227       if (devinfo->has_negative_rhw_bug &&
1228           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1229          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1230          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1231          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1232          vec4_instruction *inst;
1233          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1234          inst->predicate = BRW_PREDICATE_NORMAL;
1235          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1236          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1237          inst->predicate = BRW_PREDICATE_NORMAL;
1238       }
1239
1240       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1241    } else if (devinfo->gen < 6) {
1242       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1243    } else {
1244       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1245       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1246          dst_reg reg_w = reg;
1247          reg_w.writemask = WRITEMASK_W;
1248          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1249          reg_as_src.type = reg_w.type;
1250          reg_as_src.swizzle = brw_swizzle_for_size(1);
1251          emit(MOV(reg_w, reg_as_src));
1252       }
1253       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1254          dst_reg reg_y = reg;
1255          reg_y.writemask = WRITEMASK_Y;
1256          reg_y.type = BRW_REGISTER_TYPE_D;
1257          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1258          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1259       }
1260       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1261          dst_reg reg_z = reg;
1262          reg_z.writemask = WRITEMASK_Z;
1263          reg_z.type = BRW_REGISTER_TYPE_D;
1264          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1265          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1266       }
1267    }
1268 }
1269
1270 vec4_instruction *
1271 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1272 {
1273    assert(varying < VARYING_SLOT_MAX);
1274
1275    unsigned num_comps = output_num_components[varying][component];
1276    if (num_comps == 0)
1277       return NULL;
1278
1279    assert(output_reg[varying][component].type == reg.type);
1280    current_annotation = output_reg_annotation[varying];
1281    if (output_reg[varying][component].file != BAD_FILE) {
1282       src_reg src = src_reg(output_reg[varying][component]);
1283       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1284       reg.writemask =
1285          brw_writemask_for_component_packing(num_comps, component);
1286       return emit(MOV(reg, src));
1287    }
1288    return NULL;
1289 }
1290
1291 void
1292 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1293 {
1294    reg.type = BRW_REGISTER_TYPE_F;
1295    output_reg[varying][0].type = reg.type;
1296
1297    switch (varying) {
1298    case VARYING_SLOT_PSIZ:
1299    {
1300       /* PSIZ is always in slot 0, and is coupled with other flags. */
1301       current_annotation = "indices, point width, clip flags";
1302       emit_psiz_and_flags(reg);
1303       break;
1304    }
1305    case BRW_VARYING_SLOT_NDC:
1306       current_annotation = "NDC";
1307       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1308          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1309       break;
1310    case VARYING_SLOT_POS:
1311       current_annotation = "gl_Position";
1312       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1313          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1314       break;
1315    case VARYING_SLOT_EDGE: {
1316       /* This is present when doing unfilled polygons.  We're supposed to copy
1317        * the edge flag from the user-provided vertex array
1318        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1319        * of that attribute (starts as 1.0f).  This is then used in clipping to
1320        * determine which edges should be drawn as wireframe.
1321        */
1322       current_annotation = "edge flag";
1323       int edge_attr = util_bitcount64(nir->info.inputs_read &
1324                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1325       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1326                                     glsl_type::float_type, WRITEMASK_XYZW))));
1327       break;
1328    }
1329    case BRW_VARYING_SLOT_PAD:
1330       /* No need to write to this slot */
1331       break;
1332    default:
1333       for (int i = 0; i < 4; i++) {
1334          emit_generic_urb_slot(reg, varying, i);
1335       }
1336       break;
1337    }
1338 }
1339
1340 static unsigned
1341 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1342 {
1343    if (devinfo->gen >= 6) {
1344       /* URB data written (does not include the message header reg) must
1345        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1346        * section 5.4.3.2.2: URB_INTERLEAVED.
1347        *
1348        * URB entries are allocated on a multiple of 1024 bits, so an
1349        * extra 128 bits written here to make the end align to 256 is
1350        * no problem.
1351        */
1352       if ((mlen % 2) != 1)
1353          mlen++;
1354    }
1355
1356    return mlen;
1357 }
1358
1359
1360 /**
1361  * Generates the VUE payload plus the necessary URB write instructions to
1362  * output it.
1363  *
1364  * The VUE layout is documented in Volume 2a.
1365  */
1366 void
1367 vec4_visitor::emit_vertex()
1368 {
1369    /* MRF 0 is reserved for the debugger, so start with message header
1370     * in MRF 1.
1371     */
1372    int base_mrf = 1;
1373    int mrf = base_mrf;
1374    /* In the process of generating our URB write message contents, we
1375     * may need to unspill a register or load from an array.  Those
1376     * reads would use MRFs 14-15.
1377     */
1378    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1379
1380    /* The following assertion verifies that max_usable_mrf causes an
1381     * even-numbered amount of URB write data, which will meet gen6's
1382     * requirements for length alignment.
1383     */
1384    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1385
1386    /* First mrf is the g0-based message header containing URB handles and
1387     * such.
1388     */
1389    emit_urb_write_header(mrf++);
1390
1391    if (devinfo->gen < 6) {
1392       emit_ndc_computation();
1393    }
1394
1395    /* We may need to split this up into several URB writes, so do them in a
1396     * loop.
1397     */
1398    int slot = 0;
1399    bool complete = false;
1400    do {
1401       /* URB offset is in URB row increments, and each of our MRFs is half of
1402        * one of those, since we're doing interleaved writes.
1403        */
1404       int offset = slot / 2;
1405
1406       mrf = base_mrf + 1;
1407       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1408          emit_urb_slot(dst_reg(MRF, mrf++),
1409                        prog_data->vue_map.slot_to_varying[slot]);
1410
1411          /* If this was max_usable_mrf, we can't fit anything more into this
1412           * URB WRITE. Same thing if we reached the maximum length available.
1413           */
1414          if (mrf > max_usable_mrf ||
1415              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1416             slot++;
1417             break;
1418          }
1419       }
1420
1421       complete = slot >= prog_data->vue_map.num_slots;
1422       current_annotation = "URB write";
1423       vec4_instruction *inst = emit_urb_write_opcode(complete);
1424       inst->base_mrf = base_mrf;
1425       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1426       inst->offset += offset;
1427    } while(!complete);
1428 }
1429
1430
1431 src_reg
1432 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1433                                  src_reg *reladdr, int reg_offset)
1434 {
1435    /* Because we store the values to scratch interleaved like our
1436     * vertex data, we need to scale the vec4 index by 2.
1437     */
1438    int message_header_scale = 2;
1439
1440    /* Pre-gen6, the message header uses byte offsets instead of vec4
1441     * (16-byte) offset units.
1442     */
1443    if (devinfo->gen < 6)
1444       message_header_scale *= 16;
1445
1446    if (reladdr) {
1447       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1448        * to multiply the reladdr by 2. Notice that the reg_offset part
1449        * is in units of 16 bytes and is used to select the low/high 16-byte
1450        * chunk of a full dvec4, so we don't want to multiply that part.
1451        */
1452       src_reg index = src_reg(this, glsl_type::int_type);
1453       if (type_sz(inst->dst.type) < 8) {
1454          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1455                                       brw_imm_d(reg_offset)));
1456          emit_before(block, inst, MUL(dst_reg(index), index,
1457                                       brw_imm_d(message_header_scale)));
1458       } else {
1459          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1460                                       brw_imm_d(message_header_scale * 2)));
1461          emit_before(block, inst, ADD(dst_reg(index), index,
1462                                       brw_imm_d(reg_offset * message_header_scale)));
1463       }
1464       return index;
1465    } else {
1466       return brw_imm_d(reg_offset * message_header_scale);
1467    }
1468 }
1469
1470 /**
1471  * Emits an instruction before @inst to load the value named by @orig_src
1472  * from scratch space at @base_offset to @temp.
1473  *
1474  * @base_offset is measured in 32-byte units (the size of a register).
1475  */
1476 void
1477 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1478                                 dst_reg temp, src_reg orig_src,
1479                                 int base_offset)
1480 {
1481    assert(orig_src.offset % REG_SIZE == 0);
1482    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1483    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1484                                       reg_offset);
1485
1486    if (type_sz(orig_src.type) < 8) {
1487       emit_before(block, inst, SCRATCH_READ(temp, index));
1488    } else {
1489       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1490       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1491       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1492       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1493       vec4_instruction *last_read =
1494          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1495       emit_before(block, inst, last_read);
1496       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1497    }
1498 }
1499
1500 /**
1501  * Emits an instruction after @inst to store the value to be written
1502  * to @orig_dst to scratch space at @base_offset, from @temp.
1503  *
1504  * @base_offset is measured in 32-byte units (the size of a register).
1505  */
1506 void
1507 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1508                                  int base_offset)
1509 {
1510    assert(inst->dst.offset % REG_SIZE == 0);
1511    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1512    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1513                                       reg_offset);
1514
1515    /* Create a temporary register to store *inst's result in.
1516     *
1517     * We have to be careful in MOVing from our temporary result register in
1518     * the scratch write.  If we swizzle from channels of the temporary that
1519     * weren't initialized, it will confuse live interval analysis, which will
1520     * make spilling fail to make progress.
1521     */
1522    bool is_64bit = type_sz(inst->dst.type) == 8;
1523    const glsl_type *alloc_type =
1524       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1525    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1526                                        inst->dst.type),
1527                                 brw_swizzle_for_mask(inst->dst.writemask));
1528
1529    if (!is_64bit) {
1530       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1531                                           inst->dst.writemask));
1532       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1533       if (inst->opcode != BRW_OPCODE_SEL)
1534          write->predicate = inst->predicate;
1535       write->ir = inst->ir;
1536       write->annotation = inst->annotation;
1537       inst->insert_after(block, write);
1538    } else {
1539       dst_reg shuffled = dst_reg(this, alloc_type);
1540       vec4_instruction *last =
1541          shuffle_64bit_data(shuffled, temp, true, block, inst);
1542       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1543
1544       uint8_t mask = 0;
1545       if (inst->dst.writemask & WRITEMASK_X)
1546          mask |= WRITEMASK_XY;
1547       if (inst->dst.writemask & WRITEMASK_Y)
1548          mask |= WRITEMASK_ZW;
1549       if (mask) {
1550          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1551
1552          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1553          if (inst->opcode != BRW_OPCODE_SEL)
1554             write->predicate = inst->predicate;
1555          write->ir = inst->ir;
1556          write->annotation = inst->annotation;
1557          last->insert_after(block, write);
1558       }
1559
1560       mask = 0;
1561       if (inst->dst.writemask & WRITEMASK_Z)
1562          mask |= WRITEMASK_XY;
1563       if (inst->dst.writemask & WRITEMASK_W)
1564          mask |= WRITEMASK_ZW;
1565       if (mask) {
1566          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1567
1568          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1569                                             reg_offset + 1);
1570          vec4_instruction *write =
1571             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1572          if (inst->opcode != BRW_OPCODE_SEL)
1573             write->predicate = inst->predicate;
1574          write->ir = inst->ir;
1575          write->annotation = inst->annotation;
1576          last->insert_after(block, write);
1577       }
1578    }
1579
1580    inst->dst.file = temp.file;
1581    inst->dst.nr = temp.nr;
1582    inst->dst.offset %= REG_SIZE;
1583    inst->dst.reladdr = NULL;
1584 }
1585
1586 /**
1587  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1588  * adds the scratch read(s) before \p inst. The function also checks for
1589  * recursive reladdr scratch accesses, issuing the corresponding scratch
1590  * loads and rewriting reladdr references accordingly.
1591  *
1592  * \return \p src if it did not require a scratch load, otherwise, the
1593  * register holding the result of the scratch load that the caller should
1594  * use to rewrite src.
1595  */
1596 src_reg
1597 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1598                                    vec4_instruction *inst, src_reg src)
1599 {
1600    /* Resolve recursive reladdr scratch access by calling ourselves
1601     * with src.reladdr
1602     */
1603    if (src.reladdr)
1604       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1605                                           *src.reladdr);
1606
1607    /* Now handle scratch access on src */
1608    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1609       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1610          glsl_type::dvec4_type : glsl_type::vec4_type);
1611       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1612       src.nr = temp.nr;
1613       src.offset %= REG_SIZE;
1614       src.reladdr = NULL;
1615    }
1616
1617    return src;
1618 }
1619
1620 /**
1621  * We can't generally support array access in GRF space, because a
1622  * single instruction's destination can only span 2 contiguous
1623  * registers.  So, we send all GRF arrays that get variable index
1624  * access to scratch space.
1625  */
1626 void
1627 vec4_visitor::move_grf_array_access_to_scratch()
1628 {
1629    int scratch_loc[this->alloc.count];
1630    memset(scratch_loc, -1, sizeof(scratch_loc));
1631
1632    /* First, calculate the set of virtual GRFs that need to be punted
1633     * to scratch due to having any array access on them, and where in
1634     * scratch.
1635     */
1636    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1637       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1638          if (scratch_loc[inst->dst.nr] == -1) {
1639             scratch_loc[inst->dst.nr] = last_scratch;
1640             last_scratch += this->alloc.sizes[inst->dst.nr];
1641          }
1642
1643          for (src_reg *iter = inst->dst.reladdr;
1644               iter->reladdr;
1645               iter = iter->reladdr) {
1646             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1647                scratch_loc[iter->nr] = last_scratch;
1648                last_scratch += this->alloc.sizes[iter->nr];
1649             }
1650          }
1651       }
1652
1653       for (int i = 0 ; i < 3; i++) {
1654          for (src_reg *iter = &inst->src[i];
1655               iter->reladdr;
1656               iter = iter->reladdr) {
1657             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1658                scratch_loc[iter->nr] = last_scratch;
1659                last_scratch += this->alloc.sizes[iter->nr];
1660             }
1661          }
1662       }
1663    }
1664
1665    /* Now, for anything that will be accessed through scratch, rewrite
1666     * it to load/store.  Note that this is a _safe list walk, because
1667     * we may generate a new scratch_write instruction after the one
1668     * we're processing.
1669     */
1670    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1671       /* Set up the annotation tracking for new generated instructions. */
1672       base_ir = inst->ir;
1673       current_annotation = inst->annotation;
1674
1675       /* First handle scratch access on the dst. Notice we have to handle
1676        * the case where the dst's reladdr also points to scratch space.
1677        */
1678       if (inst->dst.reladdr)
1679          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1680                                                    *inst->dst.reladdr);
1681
1682       /* Now that we have handled any (possibly recursive) reladdr scratch
1683        * accesses for dst we can safely do the scratch write for dst itself
1684        */
1685       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1686          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1687
1688       /* Now handle scratch access on any src. In this case, since inst->src[i]
1689        * already is a src_reg, we can just call emit_resolve_reladdr with
1690        * inst->src[i] and it will take care of handling scratch loads for
1691        * both src and src.reladdr (recursively).
1692        */
1693       for (int i = 0 ; i < 3; i++) {
1694          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1695                                              inst->src[i]);
1696       }
1697    }
1698 }
1699
1700 /**
1701  * Emits an instruction before @inst to load the value named by @orig_src
1702  * from the pull constant buffer (surface) at @base_offset to @temp.
1703  */
1704 void
1705 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1706                                       dst_reg temp, src_reg orig_src,
1707                                       int base_offset, src_reg indirect)
1708 {
1709    assert(orig_src.offset % 16 == 0);
1710    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1711
1712    /* For 64bit loads we need to emit two 32-bit load messages and we also
1713     * we need to shuffle the 32-bit data result into proper 64-bit data. To do
1714     * that we emit the 32-bit loads into a temporary and we shuffle the result
1715     * into the original destination.
1716     */
1717    dst_reg orig_temp = temp;
1718    bool is_64bit = type_sz(orig_src.type) == 8;
1719    if (is_64bit) {
1720       assert(type_sz(temp.type) == 8);
1721       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1722       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1723    }
1724
1725    src_reg src = orig_src;
1726    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1727       int reg_offset = base_offset + src.offset / 16;
1728
1729       src_reg offset;
1730       if (indirect.file != BAD_FILE) {
1731          offset = src_reg(this, glsl_type::uint_type);
1732          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1733                                       brw_imm_ud(reg_offset * 16)));
1734       } else if (devinfo->gen >= 8) {
1735          /* Store the offset in a GRF so we can send-from-GRF. */
1736          offset = src_reg(this, glsl_type::uint_type);
1737          emit_before(block, inst, MOV(dst_reg(offset),
1738                                       brw_imm_ud(reg_offset * 16)));
1739       } else {
1740          offset = brw_imm_d(reg_offset * 16);
1741       }
1742
1743       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1744                                   brw_imm_ud(index),
1745                                   offset,
1746                                   block, inst);
1747
1748       src = byte_offset(src, 16);
1749    }
1750
1751    if (is_64bit) {
1752       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1753       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1754    }
1755 }
1756
1757 /**
1758  * Implements array access of uniforms by inserting a
1759  * PULL_CONSTANT_LOAD instruction.
1760  *
1761  * Unlike temporary GRF array access (where we don't support it due to
1762  * the difficulty of doing relative addressing on instruction
1763  * destinations), we could potentially do array access of uniforms
1764  * that were loaded in GRF space as push constants.  In real-world
1765  * usage we've seen, though, the arrays being used are always larger
1766  * than we could load as push constants, so just always move all
1767  * uniform array access out to a pull constant buffer.
1768  */
1769 void
1770 vec4_visitor::move_uniform_array_access_to_pull_constants()
1771 {
1772    /* The vulkan dirver doesn't support pull constants other than UBOs so
1773     * everything has to be pushed regardless.
1774     */
1775    if (!compiler->supports_pull_constants) {
1776       split_uniform_registers();
1777       return;
1778    }
1779
1780    /* Allocate the pull_params array */
1781    assert(stage_prog_data->nr_pull_params == 0);
1782    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1783                                               this->uniforms * 4);
1784
1785    int pull_constant_loc[this->uniforms];
1786    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1787
1788    /* First, walk through the instructions and determine which things need to
1789     * be pulled.  We mark something as needing to be pulled by setting
1790     * pull_constant_loc to 0.
1791     */
1792    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1793       /* We only care about MOV_INDIRECT of a uniform */
1794       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1795           inst->src[0].file != UNIFORM)
1796          continue;
1797
1798       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1799
1800       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1801          pull_constant_loc[uniform_nr + j] = 0;
1802    }
1803
1804    /* Next, we walk the list of uniforms and assign real pull constant
1805     * locations and set their corresponding entries in pull_param.
1806     */
1807    for (int j = 0; j < this->uniforms; j++) {
1808       if (pull_constant_loc[j] < 0)
1809          continue;
1810
1811       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1812
1813       for (int i = 0; i < 4; i++) {
1814          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1815             = stage_prog_data->param[j * 4 + i];
1816       }
1817    }
1818
1819    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1820     * instructions to actual uniform pulls.
1821     */
1822    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1823       /* We only care about MOV_INDIRECT of a uniform */
1824       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1825           inst->src[0].file != UNIFORM)
1826          continue;
1827
1828       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1829
1830       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1831
1832       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1833                               pull_constant_loc[uniform_nr], inst->src[1]);
1834       inst->remove(block);
1835    }
1836
1837    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1838     * no need to track them as larger-than-vec4 objects.  This will be
1839     * relied on in cutting out unused uniform vectors from push
1840     * constants.
1841     */
1842    split_uniform_registers();
1843 }
1844
1845 void
1846 vec4_visitor::resolve_ud_negate(src_reg *reg)
1847 {
1848    if (reg->type != BRW_REGISTER_TYPE_UD ||
1849        !reg->negate)
1850       return;
1851
1852    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1853    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1854    *reg = temp;
1855 }
1856
1857 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1858                            void *log_data,
1859                            const struct brw_sampler_prog_key_data *key_tex,
1860                            struct brw_vue_prog_data *prog_data,
1861                            const nir_shader *shader,
1862                            void *mem_ctx,
1863                            bool no_spills,
1864                            int shader_time_index)
1865    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1866      key_tex(key_tex),
1867      prog_data(prog_data),
1868      fail_msg(NULL),
1869      first_non_payload_grf(0),
1870      need_all_constants_in_pull_buffer(false),
1871      no_spills(no_spills),
1872      shader_time_index(shader_time_index),
1873      last_scratch(0)
1874 {
1875    this->failed = false;
1876
1877    this->base_ir = NULL;
1878    this->current_annotation = NULL;
1879    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1880
1881    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1882
1883    this->virtual_grf_start = NULL;
1884    this->virtual_grf_end = NULL;
1885    this->live_intervals = NULL;
1886
1887    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1888
1889    this->uniforms = 0;
1890 }
1891
1892
1893 void
1894 vec4_visitor::fail(const char *format, ...)
1895 {
1896    va_list va;
1897    char *msg;
1898
1899    if (failed)
1900       return;
1901
1902    failed = true;
1903
1904    va_start(va, format);
1905    msg = ralloc_vasprintf(mem_ctx, format, va);
1906    va_end(va);
1907    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1908
1909    this->fail_msg = msg;
1910
1911    if (debug_enabled) {
1912       fprintf(stderr, "%s",  msg);
1913    }
1914 }
1915
1916 } /* namespace brw */