src/intel/compiler/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_eu.h"
  27 #include "util/u_math.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->shadow_compare = false;
  50    this->eot = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->exec_size = 8;
  59    this->group = 0;
  60    this->size_written = (dst.file == BAD_FILE ?
  61                          0 : this->exec_size * type_sz(dst.type));
  62    this->annotation = NULL;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(vec4_instruction *inst)
  67 {
  68    inst->ir = this->base_ir;
  69    inst->annotation = this->current_annotation;
  70
  71    this->instructions.push_tail(inst);
  72
  73    return inst;
  74 }
  75
  76 vec4_instruction *
  77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  78                           vec4_instruction *new_inst)
  79 {
  80    new_inst->ir = inst->ir;
  81    new_inst->annotation = inst->annotation;
  82
  83    inst->insert_before(block, new_inst);
  84
  85    return inst;
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  90                    const src_reg &src1, const src_reg &src2)
  91 {
  92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  93 }
  94
  95
  96 vec4_instruction *
  97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  98                    const src_reg &src1)
  99 {
 100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
 101 }
 102
 103 vec4_instruction *
 104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 105 {
 106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 107 }
 108
 109 vec4_instruction *
 110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 111 {
 112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 113 }
 114
 115 vec4_instruction *
 116 vec4_visitor::emit(enum opcode opcode)
 117 {
 118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 119 }
 120
 121 #define ALU1(op)                                                        \
 122    vec4_instruction *                                                   \
 123    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 124    {                                                                    \
 125       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 126    }
 127
 128 #define ALU2(op)                                                        \
 129    vec4_instruction *                                                   \
 130    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 131                     const src_reg &src1)                                \
 132    {                                                                    \
 133       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 134                                            src0, src1);                 \
 135    }
 136
 137 #define ALU2_ACC(op)                                                    \
 138    vec4_instruction *                                                   \
 139    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 140                     const src_reg &src1)                                \
 141    {                                                                    \
 142       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 143                        BRW_OPCODE_##op, dst, src0, src1);               \
 144       inst->writes_accumulator = true;                                  \
 145       return inst;                                                      \
 146    }
 147
 148 #define ALU3(op)                                                        \
 149    vec4_instruction *                                                   \
 150    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 151                     const src_reg &src1, const src_reg &src2)           \
 152    {                                                                    \
 153       assert(devinfo->gen >= 6);                                                \
 154       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 155                                            src0, src1, src2);           \
 156    }
 157
 158 ALU1(NOT)
 159 ALU1(MOV)
 160 ALU1(FRC)
 161 ALU1(RNDD)
 162 ALU1(RNDE)
 163 ALU1(RNDZ)
 164 ALU1(F32TO16)
 165 ALU1(F16TO32)
 166 ALU2(ADD)
 167 ALU2(MUL)
 168 ALU2_ACC(MACH)
 169 ALU2(AND)
 170 ALU2(OR)
 171 ALU2(XOR)
 172 ALU2(DP3)
 173 ALU2(DP4)
 174 ALU2(DPH)
 175 ALU2(SHL)
 176 ALU2(SHR)
 177 ALU2(ASR)
 178 ALU3(LRP)
 179 ALU1(BFREV)
 180 ALU3(BFE)
 181 ALU2(BFI1)
 182 ALU3(BFI2)
 183 ALU1(FBH)
 184 ALU1(FBL)
 185 ALU1(CBIT)
 186 ALU3(MAD)
 187 ALU2_ACC(ADDC)
 188 ALU2_ACC(SUBB)
 189 ALU2(MAC)
 190 ALU1(DIM)
 191
 192 /** Gen4 predicated IF. */
 193 vec4_instruction *
 194 vec4_visitor::IF(enum brw_predicate predicate)
 195 {
 196    vec4_instruction *inst;
 197
 198    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 199    inst->predicate = predicate;
 200
 201    return inst;
 202 }
 203
 204 /** Gen6 IF with embedded comparison. */
 205 vec4_instruction *
 206 vec4_visitor::IF(src_reg src0, src_reg src1,
 207                  enum brw_conditional_mod condition)
 208 {
 209    assert(devinfo->gen == 6);
 210
 211    vec4_instruction *inst;
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 217                                         src0, src1);
 218    inst->conditional_mod = condition;
 219
 220    return inst;
 221 }
 222
 223 /**
 224  * CMP: Sets the low bit of the destination channels with the result
 225  * of the comparison, while the upper bits are undefined, and updates
 226  * the flag register with the packed 16 bits of the result.
 227  */
 228 vec4_instruction *
 229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 230                   enum brw_conditional_mod condition)
 231 {
 232    vec4_instruction *inst;
 233
 234    /* Take the instruction:
 235     *
 236     * CMP null<d> src0<f> src1<f>
 237     *
 238     * Original gen4 does type conversion to the destination type before
 239     * comparison, producing garbage results for floating point comparisons.
 240     *
 241     * The destination type doesn't matter on newer generations, so we set the
 242     * type to match src0 so we can compact the instruction.
 243     */
 244    dst.type = src0.type;
 245
 246    resolve_ud_negate(&src0);
 247    resolve_ud_negate(&src1);
 248
 249    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 250    inst->conditional_mod = condition;
 251
 252    return inst;
 253 }
 254
 255 vec4_instruction *
 256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 257 {
 258    vec4_instruction *inst;
 259
 260    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 261                                         dst, index);
 262    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 263    inst->mlen = 2;
 264
 265    return inst;
 266 }
 267
 268 vec4_instruction *
 269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 270                             const src_reg &index)
 271 {
 272    vec4_instruction *inst;
 273
 274    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 275                                         dst, src, index);
 276    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 277    inst->mlen = 3;
 278
 279    return inst;
 280 }
 281
 282 src_reg
 283 vec4_visitor::fix_3src_operand(const src_reg &src)
 284 {
 285    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 286     * able to use vertical stride of zero to replicate the vec4 uniform, like
 287     *
 288     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 289     *
 290     * But you can't, since vertical stride is always four in three-source
 291     * instructions. Instead, insert a MOV instruction to do the replication so
 292     * that the three-source instruction can consume it.
 293     */
 294
 295    /* The MOV is only needed if the source is a uniform or immediate. */
 296    if (src.file != UNIFORM && src.file != IMM)
 297       return src;
 298
 299    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 300       return src;
 301
 302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 303    expanded.type = src.type;
 304    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 305    return src_reg(expanded);
 306 }
 307
 308 src_reg
 309 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 310 {
 311    if (!src.abs && !src.negate)
 312       return src;
 313
 314    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 315    resolved.type = src.type;
 316    emit(MOV(resolved, src));
 317
 318    return src_reg(resolved);
 319 }
 320
 321 src_reg
 322 vec4_visitor::fix_math_operand(const src_reg &src)
 323 {
 324    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 325       return src;
 326
 327    /* The gen6 math instruction ignores the source modifiers --
 328     * swizzle, abs, negate, and at least some parts of the register
 329     * region description.
 330     *
 331     * Rather than trying to enumerate all these cases, *always* expand the
 332     * operand to a temp GRF for gen6.
 333     *
 334     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 335     * can't use.
 336     */
 337
 338    if (devinfo->gen == 7 && src.file != IMM)
 339       return src;
 340
 341    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 342    expanded.type = src.type;
 343    emit(MOV(expanded, src));
 344    return src_reg(expanded);
 345 }
 346
 347 vec4_instruction *
 348 vec4_visitor::emit_math(enum opcode opcode,
 349                         const dst_reg &dst,
 350                         const src_reg &src0, const src_reg &src1)
 351 {
 352    vec4_instruction *math =
 353       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 354
 355    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 356       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 357       math->dst = dst_reg(this, glsl_type::vec4_type);
 358       math->dst.type = dst.type;
 359       math = emit(MOV(dst, src_reg(math->dst)));
 360    } else if (devinfo->gen < 6) {
 361       math->base_mrf = 1;
 362       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 363    }
 364
 365    return math;
 366 }
 367
 368 void
 369 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 370 {
 371    if (devinfo->gen < 7) {
 372       unreachable("ir_unop_pack_half_2x16 should be lowered");
 373    }
 374
 375    assert(dst.type == BRW_REGISTER_TYPE_UD);
 376    assert(src0.type == BRW_REGISTER_TYPE_F);
 377
 378    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 379     *
 380     *   Because this instruction does not have a 16-bit floating-point type,
 381     *   the destination data type must be Word (W).
 382     *
 383     *   The destination must be DWord-aligned and specify a horizontal stride
 384     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 385     *   each destination channel and the upper word is not modified.
 386     *
 387     * The above restriction implies that the f32to16 instruction must use
 388     * align1 mode, because only in align1 mode is it possible to specify
 389     * horizontal stride.  We choose here to defy the hardware docs and emit
 390     * align16 instructions.
 391     *
 392     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 393     * instructions. I was partially successful in that the code passed all
 394     * tests.  However, the code was dubiously correct and fragile, and the
 395     * tests were not harsh enough to probe that frailty. Not trusting the
 396     * code, I chose instead to remain in align16 mode in defiance of the hw
 397     * docs).
 398     *
 399     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 400     * simulator, emitting a f32to16 in align16 mode with UD as destination
 401     * data type is safe. The behavior differs from that specified in the PRM
 402     * in that the upper word of each destination channel is cleared to 0.
 403     */
 404
 405    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 406    src_reg tmp_src(tmp_dst);
 407
 408 #if 0
 409    /* Verify the undocumented behavior on which the following instructions
 410     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 411     * then the result of the bit-or instruction below will be incorrect.
 412     *
 413     * You should inspect the disasm output in order to verify that the MOV is
 414     * not optimized away.
 415     */
 416    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 417 #endif
 418
 419    /* Give tmp the form below, where "." means untouched.
 420     *
 421     *     w z          y          x w z          y          x
 422     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 423     *
 424     * That the upper word of each write-channel be 0 is required for the
 425     * following bit-shift and bit-or instructions to work. Note that this
 426     * relies on the undocumented hardware behavior mentioned above.
 427     */
 428    tmp_dst.writemask = WRITEMASK_XY;
 429    emit(F32TO16(tmp_dst, src0));
 430
 431    /* Give the write-channels of dst the form:
 432     *   0xhhhh0000
 433     */
 434    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 435    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 436
 437    /* Finally, give the write-channels of dst the form of packHalf2x16's
 438     * output:
 439     *   0xhhhhllll
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 442    emit(OR(dst, src_reg(dst), tmp_src));
 443 }
 444
 445 void
 446 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 447 {
 448    if (devinfo->gen < 7) {
 449       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 450    }
 451
 452    assert(dst.type == BRW_REGISTER_TYPE_F);
 453    assert(src0.type == BRW_REGISTER_TYPE_UD);
 454
 455    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 456     *
 457     *   Because this instruction does not have a 16-bit floating-point type,
 458     *   the source data type must be Word (W). The destination type must be
 459     *   F (Float).
 460     *
 461     * To use W as the source data type, we must adjust horizontal strides,
 462     * which is only possible in align1 mode. All my [chadv] attempts at
 463     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 464     * Piglit tests, so I gave up.
 465     *
 466     * I've verified that, on gen7 hardware and the simulator, it is safe to
 467     * emit f16to32 in align16 mode with UD as source data type.
 468     */
 469
 470    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 471    src_reg tmp_src(tmp_dst);
 472
 473    tmp_dst.writemask = WRITEMASK_X;
 474    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 475
 476    tmp_dst.writemask = WRITEMASK_Y;
 477    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 478
 479    dst.writemask = WRITEMASK_XY;
 480    emit(F16TO32(dst, tmp_src));
 481 }
 482
 483 void
 484 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 485 {
 486    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 487     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 488     * is not suitable to generate the shift values, but we can use the packed
 489     * vector float and a type-converting MOV.
 490     */
 491    dst_reg shift(this, glsl_type::uvec4_type);
 492    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 493
 494    dst_reg shifted(this, glsl_type::uvec4_type);
 495    src0.swizzle = BRW_SWIZZLE_XXXX;
 496    emit(SHR(shifted, src0, src_reg(shift)));
 497
 498    shifted.type = BRW_REGISTER_TYPE_UB;
 499    dst_reg f(this, glsl_type::vec4_type);
 500    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 501
 502    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 503 }
 504
 505 void
 506 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 507 {
 508    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 509     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 510     * is not suitable to generate the shift values, but we can use the packed
 511     * vector float and a type-converting MOV.
 512     */
 513    dst_reg shift(this, glsl_type::uvec4_type);
 514    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 515
 516    dst_reg shifted(this, glsl_type::uvec4_type);
 517    src0.swizzle = BRW_SWIZZLE_XXXX;
 518    emit(SHR(shifted, src0, src_reg(shift)));
 519
 520    shifted.type = BRW_REGISTER_TYPE_B;
 521    dst_reg f(this, glsl_type::vec4_type);
 522    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 523
 524    dst_reg scaled(this, glsl_type::vec4_type);
 525    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 526
 527    dst_reg max(this, glsl_type::vec4_type);
 528    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 529    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 530 }
 531
 532 void
 533 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 534 {
 535    dst_reg saturated(this, glsl_type::vec4_type);
 536    vec4_instruction *inst = emit(MOV(saturated, src0));
 537    inst->saturate = true;
 538
 539    dst_reg scaled(this, glsl_type::vec4_type);
 540    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 541
 542    dst_reg rounded(this, glsl_type::vec4_type);
 543    emit(RNDE(rounded, src_reg(scaled)));
 544
 545    dst_reg u(this, glsl_type::uvec4_type);
 546    emit(MOV(u, src_reg(rounded)));
 547
 548    src_reg bytes(u);
 549    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 550 }
 551
 552 void
 553 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 554 {
 555    dst_reg max(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 557
 558    dst_reg min(this, glsl_type::vec4_type);
 559    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 560
 561    dst_reg scaled(this, glsl_type::vec4_type);
 562    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 563
 564    dst_reg rounded(this, glsl_type::vec4_type);
 565    emit(RNDE(rounded, src_reg(scaled)));
 566
 567    dst_reg i(this, glsl_type::ivec4_type);
 568    emit(MOV(i, src_reg(rounded)));
 569
 570    src_reg bytes(i);
 571    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 572 }
 573
 574 /*
 575  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 576  * false) elements needed to pack a type.
 577  */
 578 static int
 579 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
 580 {
 581    unsigned int i;
 582    int size;
 583
 584    switch (type->base_type) {
 585    case GLSL_TYPE_UINT:
 586    case GLSL_TYPE_INT:
 587    case GLSL_TYPE_FLOAT:
 588    case GLSL_TYPE_FLOAT16:
 589    case GLSL_TYPE_BOOL:
 590    case GLSL_TYPE_DOUBLE:
 591    case GLSL_TYPE_UINT16:
 592    case GLSL_TYPE_INT16:
 593    case GLSL_TYPE_UINT8:
 594    case GLSL_TYPE_INT8:
 595    case GLSL_TYPE_UINT64:
 596    case GLSL_TYPE_INT64:
 597       if (type->is_matrix()) {
 598          const glsl_type *col_type = type->column_type();
 599          unsigned col_slots =
 600             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
 601          return type->matrix_columns * col_slots;
 602       } else {
 603          /* Regardless of size of vector, it gets a vec4. This is bad
 604           * packing for things like floats, but otherwise arrays become a
 605           * mess.  Hopefully a later pass over the code can pack scalars
 606           * down if appropriate.
 607           */
 608          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
 609       }
 610    case GLSL_TYPE_ARRAY:
 611       assert(type->length > 0);
 612       return type_size_xvec4(type->fields.array, as_vec4, bindless) *
 613              type->length;
 614    case GLSL_TYPE_STRUCT:
 615    case GLSL_TYPE_INTERFACE:
 616       size = 0;
 617       for (i = 0; i < type->length; i++) {
 618          size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
 619                                  bindless);
 620       }
 621       return size;
 622    case GLSL_TYPE_SUBROUTINE:
 623       return 1;
 624
 625    case GLSL_TYPE_SAMPLER:
 626       /* Samplers take up no register space, since they're baked in at
 627        * link time.
 628        */
 629       return bindless ? 1 : 0;
 630    case GLSL_TYPE_ATOMIC_UINT:
 631       return 0;
 632    case GLSL_TYPE_IMAGE:
 633       return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 634    case GLSL_TYPE_VOID:
 635    case GLSL_TYPE_ERROR:
 636    case GLSL_TYPE_FUNCTION:
 637       unreachable("not reached");
 638    }
 639
 640    return 0;
 641 }
 642
 643 /**
 644  * Returns the minimum number of vec4 elements needed to pack a type.
 645  *
 646  * For simple types, it will return 1 (a single vec4); for matrices, the
 647  * number of columns; for array and struct, the sum of the vec4_size of
 648  * each of its elements; and for sampler and atomic, zero.
 649  *
 650  * This method is useful to calculate how much register space is needed to
 651  * store a particular type.
 652  */
 653 extern "C" int
 654 type_size_vec4(const struct glsl_type *type, bool bindless)
 655 {
 656    return type_size_xvec4(type, true, bindless);
 657 }
 658
 659 /**
 660  * Returns the minimum number of dvec4 elements needed to pack a type.
 661  *
 662  * For simple types, it will return 1 (a single dvec4); for matrices, the
 663  * number of columns; for array and struct, the sum of the dvec4_size of
 664  * each of its elements; and for sampler and atomic, zero.
 665  *
 666  * This method is useful to calculate how much register space is needed to
 667  * store a particular type.
 668  *
 669  * Measuring double-precision vertex inputs as dvec4 is required because
 670  * ARB_vertex_attrib_64bit states that these uses the same number of locations
 671  * than the single-precision version. That is, two consecutives dvec4 would be
 672  * located in location "x" and location "x+1", not "x+2".
 673  *
 674  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 675  * remap_vs_attrs() will take in account both the location and also if the
 676  * type fits in one or two vec4 slots.
 677  */
 678 extern "C" int
 679 type_size_dvec4(const struct glsl_type *type, bool bindless)
 680 {
 681    return type_size_xvec4(type, false, bindless);
 682 }
 683
 684 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 685 {
 686    init();
 687
 688    this->file = VGRF;
 689    this->nr = v->alloc.allocate(type_size_vec4(type, false));
 690
 691    if (type->is_array() || type->is_struct()) {
 692       this->swizzle = BRW_SWIZZLE_NOOP;
 693    } else {
 694       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 695    }
 696
 697    this->type = brw_type_for_base_type(type);
 698 }
 699
 700 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 701 {
 702    assert(size > 0);
 703
 704    init();
 705
 706    this->file = VGRF;
 707    this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
 708
 709    this->swizzle = BRW_SWIZZLE_NOOP;
 710
 711    this->type = brw_type_for_base_type(type);
 712 }
 713
 714 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 715 {
 716    init();
 717
 718    this->file = VGRF;
 719    this->nr = v->alloc.allocate(type_size_vec4(type, false));
 720
 721    if (type->is_array() || type->is_struct()) {
 722       this->writemask = WRITEMASK_XYZW;
 723    } else {
 724       this->writemask = (1 << type->vector_elements) - 1;
 725    }
 726
 727    this->type = brw_type_for_base_type(type);
 728 }
 729
 730 vec4_instruction *
 731 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 732                           src_reg src0, src_reg src1)
 733 {
 734    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 735    inst->conditional_mod = conditionalmod;
 736    return inst;
 737 }
 738
 739 /**
 740  * Emits the instructions needed to perform a pull constant load. before_block
 741  * and before_inst can be NULL in which case the instruction will be appended
 742  * to the end of the instruction list.
 743  */
 744 void
 745 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 746                                           src_reg surf_index,
 747                                           src_reg offset_reg,
 748                                           bblock_t *before_block,
 749                                           vec4_instruction *before_inst)
 750 {
 751    assert((before_inst == NULL && before_block == NULL) ||
 752           (before_inst && before_block));
 753
 754    vec4_instruction *pull;
 755
 756    if (devinfo->gen >= 9) {
 757       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 758       src_reg header(this, glsl_type::uvec4_type, 2);
 759
 760       pull = new(mem_ctx)
 761          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 762                           dst_reg(header));
 763
 764       if (before_inst)
 765          emit_before(before_block, before_inst, pull);
 766       else
 767          emit(pull);
 768
 769       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
 770                                  offset_reg.type);
 771       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 772
 773       if (before_inst)
 774          emit_before(before_block, before_inst, pull);
 775       else
 776          emit(pull);
 777
 778       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 779                                            dst,
 780                                            surf_index,
 781                                            header);
 782       pull->mlen = 2;
 783       pull->header_size = 1;
 784    } else if (devinfo->gen >= 7) {
 785       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
 786
 787       grf_offset.type = offset_reg.type;
 788
 789       pull = MOV(grf_offset, offset_reg);
 790
 791       if (before_inst)
 792          emit_before(before_block, before_inst, pull);
 793       else
 794          emit(pull);
 795
 796       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 797                                            dst,
 798                                            surf_index,
 799                                            src_reg(grf_offset));
 800       pull->mlen = 1;
 801    } else {
 802       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 803                                            dst,
 804                                            surf_index,
 805                                            offset_reg);
 806       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 807       pull->mlen = 1;
 808    }
 809
 810    if (before_inst)
 811       emit_before(before_block, before_inst, pull);
 812    else
 813       emit(pull);
 814 }
 815
 816 src_reg
 817 vec4_visitor::emit_uniformize(const src_reg &src)
 818 {
 819    const src_reg chan_index(this, glsl_type::uint_type);
 820    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 821                               src.type);
 822
 823    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 824       ->force_writemask_all = true;
 825    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 826       ->force_writemask_all = true;
 827
 828    return src_reg(dst);
 829 }
 830
 831 src_reg
 832 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 833                              src_reg coordinate, src_reg surface)
 834 {
 835    vec4_instruction *inst =
 836       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 837                                     dst_reg(this, glsl_type::uvec4_type));
 838    inst->base_mrf = 2;
 839    inst->src[1] = surface;
 840    inst->src[2] = brw_imm_ud(0); /* sampler */
 841
 842    int param_base;
 843
 844    if (devinfo->gen >= 9) {
 845       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 846       vec4_instruction *header_inst = new(mem_ctx)
 847          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 848                           dst_reg(MRF, inst->base_mrf));
 849
 850       emit(header_inst);
 851
 852       inst->mlen = 2;
 853       inst->header_size = 1;
 854       param_base = inst->base_mrf + 1;
 855    } else {
 856       inst->mlen = 1;
 857       param_base = inst->base_mrf;
 858    }
 859
 860    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 861    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 862    int zero_mask = 0xf & ~coord_mask;
 863
 864    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 865             coordinate));
 866
 867    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 868             brw_imm_d(0)));
 869
 870    emit(inst);
 871    return src_reg(inst->dst);
 872 }
 873
 874 bool
 875 vec4_visitor::is_high_sampler(src_reg sampler)
 876 {
 877    if (devinfo->gen < 8 && !devinfo->is_haswell)
 878       return false;
 879
 880    return sampler.file != IMM || sampler.ud >= 16;
 881 }
 882
/**
 * Emits the sampler message for texture operation \p op, writing the result
 * to \p dest.
 *
 * The payload is built in MRFs starting at MRF 2: an optional one-register
 * header followed by whatever parameters the selected opcode needs
 * (coordinate, shadow comparator, LOD, gradients, sample index / MCS data,
 * or gather offsets).  inst->mlen starts at the header size and is bumped
 * for every additional payload register that gets used.
 *
 * \param op                 GLSL IR texture opcode being lowered.
 * \param dest               Destination of the sampler result.
 * \param dest_type          Only consulted for ir_txd on Gen5+, where a
 *                           3-component type selects the extra gradient
 *                           register.
 * \param coordinate         Texture coordinate.
 * \param coord_components   Number of leading channels of \p coordinate that
 *                           are copied; the remaining channels of that
 *                           payload register are zero-filled.
 * \param shadow_comparator  Shadow reference value, or BAD_FILE if none.
 * \param lod                LOD operand for ir_tex/ir_txl/ir_txf/ir_txs/
 *                           ir_query_levels; first gradient for ir_txd.
 * \param lod2               Second gradient for ir_txd; unused otherwise.
 * \param sample_index       Sample index for ir_txf_ms.
 * \param constant_offset    Stored in inst->offset; a non-zero value forces
 *                           a message header.
 * \param offset_value       Non-constant ir_tg4 offset, or BAD_FILE if none.
 * \param mcs                MCS data for ir_txf_ms.
 * \param surface            Index into key_tex->gen6_gather_wa, used only
 *                           for ir_tg4 on Gen6.
 * \param surface_reg        Becomes src[1] of the sampler instruction.
 * \param sampler_reg        Becomes src[2] of the sampler instruction.
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   /* Map the IR texture opcode onto the backend sampler opcode. */
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just return false always.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      /* Size queries only take the LOD: in .w on Gen4, in .x otherwise. */
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      /* No parameters at all; only the .x channel of the result is written. */
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      /* Zero-fill the channels the coordinate does not cover. */
      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator.  ir_txd and ir_tg4 with a non-constant
       * offset place it elsewhere and are handled further down.
       */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            /* Gen5+: the LOD goes in the second parameter register, next
             * to the shadow comparator when there is one.
             */
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         /* The LOD shares the coordinate register, in .w. */
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of ‘mcs’, but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            /* Interleave the two gradients in the second parameter register:
             * lod.x, lod2.x, lod.y, lod2.y.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            /* A third parameter register carries lod.z / lod2.z and, when
             * present, the shadow comparator in .z.
             */
            if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            /* Gen4: each gradient gets a payload register of its own. */
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         /* Gather with a non-constant offset: the comparator (if any) goes
          * in .w of the coordinate register and the offset takes .xy of
          * the next register.
          */
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   /* The payload is complete; emit the sampler message itself. */
   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}
1087
1088 /**
1089  * Apply workarounds for Gen6 gather with UINT/SINT
1090  */
1091 void
1092 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1093 {
1094    if (!wa)
1095       return;
1096
1097    int width = (wa & WA_8BIT) ? 8 : 16;
1098    dst_reg dst_f = dst;
1099    dst_f.type = BRW_REGISTER_TYPE_F;
1100
1101    /* Convert from UNORM to UINT */
1102    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1103    emit(MOV(dst, src_reg(dst_f)));
1104
1105    if (wa & WA_SIGN) {
1106       /* Reinterpret the UINT value as a signed INT value by
1107        * shifting the sign bit into place, then shifting back
1108        * preserving sign.
1109        */
1110       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1111       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1112    }
1113 }
1114
/**
 * Geometry-shader vertex emission hook.  This implementation is never
 * expected to run: it unconditionally hits unreachable().
 *
 * NOTE(review): presumably overridden by a geometry-shader specific
 * visitor -- confirm against the class hierarchy.
 */
void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}
1120
/**
 * Geometry-shader end-of-primitive hook.  This implementation is never
 * expected to run: it unconditionally hits unreachable().
 *
 * NOTE(review): presumably overridden by a geometry-shader specific
 * visitor -- confirm against the class hierarchy.
 */
void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}
1126
1127 void
1128 vec4_visitor::emit_ndc_computation()
1129 {
1130    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1131       return;
1132
1133    /* Get the position */
1134    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1135
1136    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1137    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1138    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1139    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1140
1141    current_annotation = "NDC";
1142    dst_reg ndc_w = ndc;
1143    ndc_w.writemask = WRITEMASK_W;
1144    src_reg pos_w = pos;
1145    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1146    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1147
1148    dst_reg ndc_xyz = ndc;
1149    ndc_xyz.writemask = WRITEMASK_XYZ;
1150
1151    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1152 }
1153
/**
 * Fills \p reg, the VUE slot that carries the point size together with
 * other per-vertex flags.
 *
 * Pre-Gen6: when the point size is part of the VUE map, CLIP_DIST0 is
 * written, or the hardware has the negative-rhw bug, a temporary header is
 * assembled in its .w channel (point size, clip flags, and the rhw
 * workaround bit) and then copied to \p reg.  Otherwise \p reg is zeroed.
 *
 * Gen6+: \p reg is zeroed, then .w receives the point size, .y the layer
 * and .z the viewport index, each only when the corresponding output
 * register exists.
 */
void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      /* All the bits below are accumulated in the .w channel of header1. */
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         /* Scale the point size by 2^11 and keep only the 11 bits that
          * land in bits 8..18 of the header word.
          */
         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);

         /* Compare the clip distances against zero, unpack the resulting
          * flag register into flags0 and merge it into the header.
          */
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         /* Same as above for the second clip distance vector; its flags
          * are shifted left by 4 before being merged.
          */
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         /* Both instructions below are predicated on the CMP above, so
          * they only take effect where ndc.w < 0.
          */
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->gen < 6) {
      /* Pre-Gen6 with nothing to report: the slot is just zeroed. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      /* Gen6+: start from zero and fill in the channels that are used. */
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         /* Point size goes in .w, read from the first channel of the
          * PSIZ output using the destination's type.
          */
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         /* Layer goes in .y as a signed integer. */
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         /* Viewport index goes in .z as a signed integer. */
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}
1243
1244 vec4_instruction *
1245 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1246 {
1247    assert(varying < VARYING_SLOT_MAX);
1248
1249    unsigned num_comps = output_num_components[varying][component];
1250    if (num_comps == 0)
1251       return NULL;
1252
1253    assert(output_reg[varying][component].type == reg.type);
1254    current_annotation = output_reg_annotation[varying];
1255    if (output_reg[varying][component].file != BAD_FILE) {
1256       src_reg src = src_reg(output_reg[varying][component]);
1257       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1258       reg.writemask =
1259          brw_writemask_for_component_packing(num_comps, component);
1260       return emit(MOV(reg, src));
1261    }
1262    return NULL;
1263 }
1264
1265 void
1266 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1267 {
1268    reg.type = BRW_REGISTER_TYPE_F;
1269    output_reg[varying][0].type = reg.type;
1270
1271    switch (varying) {
1272    case VARYING_SLOT_PSIZ:
1273    {
1274       /* PSIZ is always in slot 0, and is coupled with other flags. */
1275       current_annotation = "indices, point width, clip flags";
1276       emit_psiz_and_flags(reg);
1277       break;
1278    }
1279    case BRW_VARYING_SLOT_NDC:
1280       current_annotation = "NDC";
1281       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1282          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1283       break;
1284    case VARYING_SLOT_POS:
1285       current_annotation = "gl_Position";
1286       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1287          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1288       break;
1289    case VARYING_SLOT_EDGE: {
1290       /* This is present when doing unfilled polygons.  We're supposed to copy
1291        * the edge flag from the user-provided vertex array
1292        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1293        * of that attribute (starts as 1.0f).  This is then used in clipping to
1294        * determine which edges should be drawn as wireframe.
1295        */
1296       current_annotation = "edge flag";
1297       int edge_attr = util_bitcount64(nir->info.inputs_read &
1298                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1299       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1300                                     glsl_type::float_type, WRITEMASK_XYZW))));
1301       break;
1302    }
1303    case BRW_VARYING_SLOT_PAD:
1304       /* No need to write to this slot */
1305       break;
1306    default:
1307       for (int i = 0; i < 4; i++) {
1308          emit_generic_urb_slot(reg, varying, i);
1309       }
1310       break;
1311    }
1312 }
1313
1314 static unsigned
1315 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1316 {
1317    if (devinfo->gen >= 6) {
1318       /* URB data written (does not include the message header reg) must
1319        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1320        * section 5.4.3.2.2: URB_INTERLEAVED.
1321        *
1322        * URB entries are allocated on a multiple of 1024 bits, so an
1323        * extra 128 bits written here to make the end align to 256 is
1324        * no problem.
1325        */
1326       if ((mlen % 2) != 1)
1327          mlen++;
1328    }
1329
1330    return mlen;
1331 }
1332
1333
1334 /**
1335  * Generates the VUE payload plus the necessary URB write instructions to
1336  * output it.
1337  *
1338  * The VUE layout is documented in Volume 2a.
1339  */
1340 void
1341 vec4_visitor::emit_vertex()
1342 {
1343    /* MRF 0 is reserved for the debugger, so start with message header
1344     * in MRF 1.
1345     */
1346    int base_mrf = 1;
1347    int mrf = base_mrf;
1348    /* In the process of generating our URB write message contents, we
1349     * may need to unspill a register or load from an array.  Those
1350     * reads would use MRFs 14-15.
1351     */
1352    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1353
1354    /* The following assertion verifies that max_usable_mrf causes an
1355     * even-numbered amount of URB write data, which will meet gen6's
1356     * requirements for length alignment.
1357     */
1358    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1359
1360    /* First mrf is the g0-based message header containing URB handles and
1361     * such.
1362     */
1363    emit_urb_write_header(mrf++);
1364
1365    if (devinfo->gen < 6) {
1366       emit_ndc_computation();
1367    }
1368
1369    /* We may need to split this up into several URB writes, so do them in a
1370     * loop.
1371     */
1372    int slot = 0;
1373    bool complete = false;
1374    do {
1375       /* URB offset is in URB row increments, and each of our MRFs is half of
1376        * one of those, since we're doing interleaved writes.
1377        */
1378       int offset = slot / 2;
1379
1380       mrf = base_mrf + 1;
1381       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1382          emit_urb_slot(dst_reg(MRF, mrf++),
1383                        prog_data->vue_map.slot_to_varying[slot]);
1384
1385          /* If this was max_usable_mrf, we can't fit anything more into this
1386           * URB WRITE. Same thing if we reached the maximum length available.
1387           */
1388          if (mrf > max_usable_mrf ||
1389              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1390             slot++;
1391             break;
1392          }
1393       }
1394
1395       complete = slot >= prog_data->vue_map.num_slots;
1396       current_annotation = "URB write";
1397       vec4_instruction *inst = emit_urb_write_opcode(complete);
1398       inst->base_mrf = base_mrf;
1399       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1400       inst->offset += offset;
1401    } while(!complete);
1402 }
1403
1404
1405 src_reg
1406 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1407                                  src_reg *reladdr, int reg_offset)
1408 {
1409    /* Because we store the values to scratch interleaved like our
1410     * vertex data, we need to scale the vec4 index by 2.
1411     */
1412    int message_header_scale = 2;
1413
1414    /* Pre-gen6, the message header uses byte offsets instead of vec4
1415     * (16-byte) offset units.
1416     */
1417    if (devinfo->gen < 6)
1418       message_header_scale *= 16;
1419
1420    if (reladdr) {
1421       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1422        * to multiply the reladdr by 2. Notice that the reg_offset part
1423        * is in units of 16 bytes and is used to select the low/high 16-byte
1424        * chunk of a full dvec4, so we don't want to multiply that part.
1425        */
1426       src_reg index = src_reg(this, glsl_type::int_type);
1427       if (type_sz(inst->dst.type) < 8) {
1428          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1429                                       brw_imm_d(reg_offset)));
1430          emit_before(block, inst, MUL(dst_reg(index), index,
1431                                       brw_imm_d(message_header_scale)));
1432       } else {
1433          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1434                                       brw_imm_d(message_header_scale * 2)));
1435          emit_before(block, inst, ADD(dst_reg(index), index,
1436                                       brw_imm_d(reg_offset * message_header_scale)));
1437       }
1438       return index;
1439    } else {
1440       return brw_imm_d(reg_offset * message_header_scale);
1441    }
1442 }
1443
1444 /**
1445  * Emits an instruction before @inst to load the value named by @orig_src
1446  * from scratch space at @base_offset to @temp.
1447  *
1448  * @base_offset is measured in 32-byte units (the size of a register).
1449  */
1450 void
1451 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1452                                 dst_reg temp, src_reg orig_src,
1453                                 int base_offset)
1454 {
1455    assert(orig_src.offset % REG_SIZE == 0);
1456    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1457    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1458                                       reg_offset);
1459
1460    if (type_sz(orig_src.type) < 8) {
1461       emit_before(block, inst, SCRATCH_READ(temp, index));
1462    } else {
1463       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1464       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1465       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1466       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1467       vec4_instruction *last_read =
1468          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1469       emit_before(block, inst, last_read);
1470       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1471    }
1472 }
1473
/**
 * Redirects the destination of @inst to a fresh temporary register and
 * emits the scratch write(s) that store that temporary to scratch space at
 * @base_offset.
 *
 * For types narrower than 64 bits a single write is inserted right after
 * @inst.  For 64-bit types the temporary is first shuffled and up to two
 * writes (one per register of the shuffled data) are inserted after the
 * last shuffle instruction.
 *
 * The writes inherit @inst's predicate, except when @inst is a SEL.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      /* Single write carrying the same writemask as the instruction. */
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      /* Shuffle the 64-bit result; `last` is the instruction returned by
       * shuffle_64bit_data(), after which the writes are inserted.
       */
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      /* First register: 64-bit channels X and Y map to the 32-bit channel
       * pairs XY and ZW respectively.
       */
      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      /* Second register: same mapping for 64-bit channels Z and W. */
      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         /* Note: this `index` shadows the outer one and addresses the
          * next scratch register.
          */
         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   /* Finally make the instruction write the temporary instead of the
    * original (scratch-backed) register.
    */
   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}
1559
1560 /**
1561  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1562  * adds the scratch read(s) before \p inst. The function also checks for
1563  * recursive reladdr scratch accesses, issuing the corresponding scratch
1564  * loads and rewriting reladdr references accordingly.
1565  *
1566  * \return \p src if it did not require a scratch load, otherwise, the
1567  * register holding the result of the scratch load that the caller should
1568  * use to rewrite src.
1569  */
1570 src_reg
1571 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1572                                    vec4_instruction *inst, src_reg src)
1573 {
1574    /* Resolve recursive reladdr scratch access by calling ourselves
1575     * with src.reladdr
1576     */
1577    if (src.reladdr)
1578       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1579                                           *src.reladdr);
1580
1581    /* Now handle scratch access on src */
1582    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1583       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1584          glsl_type::dvec4_type : glsl_type::vec4_type);
1585       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1586       src.nr = temp.nr;
1587       src.offset %= REG_SIZE;
1588       src.reladdr = NULL;
1589    }
1590
1591    return src;
1592 }
1593
/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 *
 * This runs in two passes over the program: the first assigns a scratch
 * location to every virtual GRF involved in a reladdr access, the second
 * rewrites the affected instructions to use scratch reads and writes.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   /* Scratch location per virtual GRF; -1 (every byte set to 0xff by the
    * memset) means "not in scratch".
    */
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         /* Walk the destination's reladdr chain; every register in it that
          * is itself accessed through a reladdr gets a scratch slot too.
          */
         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      /* Same for each source and its reladdr chain. */
      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}
1673
1674 /**
1675  * Emits an instruction before @inst to load the value named by @orig_src
1676  * from the pull constant buffer (surface) at @base_offset to @temp.
1677  */
1678 void
1679 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1680                                       dst_reg temp, src_reg orig_src,
1681                                       int base_offset, src_reg indirect)
1682 {
1683    assert(orig_src.offset % 16 == 0);
1684    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1685
1686    /* For 64bit loads we need to emit two 32-bit load messages and we also
1687     * we need to shuffle the 32-bit data result into proper 64-bit data. To do
1688     * that we emit the 32-bit loads into a temporary and we shuffle the result
1689     * into the original destination.
1690     */
1691    dst_reg orig_temp = temp;
1692    bool is_64bit = type_sz(orig_src.type) == 8;
1693    if (is_64bit) {
1694       assert(type_sz(temp.type) == 8);
1695       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1696       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1697    }
1698
1699    src_reg src = orig_src;
1700    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1701       int reg_offset = base_offset + src.offset / 16;
1702
1703       src_reg offset;
1704       if (indirect.file != BAD_FILE) {
1705          offset = src_reg(this, glsl_type::uint_type);
1706          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1707                                       brw_imm_ud(reg_offset * 16)));
1708       } else if (devinfo->gen >= 8) {
1709          /* Store the offset in a GRF so we can send-from-GRF. */
1710          offset = src_reg(this, glsl_type::uint_type);
1711          emit_before(block, inst, MOV(dst_reg(offset),
1712                                       brw_imm_ud(reg_offset * 16)));
1713       } else {
1714          offset = brw_imm_d(reg_offset * 16);
1715       }
1716
1717       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1718                                   brw_imm_ud(index),
1719                                   offset,
1720                                   block, inst);
1721
1722       src = byte_offset(src, 16);
1723    }
1724
1725    if (is_64bit) {
1726       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1727       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1728    }
1729 }
1730
1731 /**
1732  * Implements array access of uniforms by inserting a
1733  * PULL_CONSTANT_LOAD instruction.
1734  *
1735  * Unlike temporary GRF array access (where we don't support it due to
1736  * the difficulty of doing relative addressing on instruction
1737  * destinations), we could potentially do array access of uniforms
1738  * that were loaded in GRF space as push constants.  In real-world
1739  * usage we've seen, though, the arrays being used are always larger
1740  * than we could load as push constants, so just always move all
1741  * uniform array access out to a pull constant buffer.
1742  */
1743 void
1744 vec4_visitor::move_uniform_array_access_to_pull_constants()
1745 {
1746    /* The vulkan dirver doesn't support pull constants other than UBOs so
1747     * everything has to be pushed regardless.
1748     */
1749    if (!compiler->supports_pull_constants) {
1750       split_uniform_registers();
1751       return;
1752    }
1753
1754    /* Allocate the pull_params array */
1755    assert(stage_prog_data->nr_pull_params == 0);
1756    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1757                                               this->uniforms * 4);
1758
1759    int pull_constant_loc[this->uniforms];
1760    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1761
1762    /* First, walk through the instructions and determine which things need to
1763     * be pulled.  We mark something as needing to be pulled by setting
1764     * pull_constant_loc to 0.
1765     */
1766    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1767       /* We only care about MOV_INDIRECT of a uniform */
1768       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1769           inst->src[0].file != UNIFORM)
1770          continue;
1771
1772       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1773
1774       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1775          pull_constant_loc[uniform_nr + j] = 0;
1776    }
1777
1778    /* Next, we walk the list of uniforms and assign real pull constant
1779     * locations and set their corresponding entries in pull_param.
1780     */
1781    for (int j = 0; j < this->uniforms; j++) {
1782       if (pull_constant_loc[j] < 0)
1783          continue;
1784
1785       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1786
1787       for (int i = 0; i < 4; i++) {
1788          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1789             = stage_prog_data->param[j * 4 + i];
1790       }
1791    }
1792
1793    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1794     * instructions to actual uniform pulls.
1795     */
1796    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1797       /* We only care about MOV_INDIRECT of a uniform */
1798       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1799           inst->src[0].file != UNIFORM)
1800          continue;
1801
1802       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1803
1804       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1805
1806       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1807                               pull_constant_loc[uniform_nr], inst->src[1]);
1808       inst->remove(block);
1809    }
1810
1811    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1812     * no need to track them as larger-than-vec4 objects.  This will be
1813     * relied on in cutting out unused uniform vectors from push
1814     * constants.
1815     */
1816    split_uniform_registers();
1817 }
1818
1819 void
1820 vec4_visitor::resolve_ud_negate(src_reg *reg)
1821 {
1822    if (reg->type != BRW_REGISTER_TYPE_UD ||
1823        !reg->negate)
1824       return;
1825
1826    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1827    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1828    *reg = temp;
1829 }
1830
1831 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1832                            void *log_data,
1833                            const struct brw_sampler_prog_key_data *key_tex,
1834                            struct brw_vue_prog_data *prog_data,
1835                            const nir_shader *shader,
1836                            void *mem_ctx,
1837                            bool no_spills,
1838                            int shader_time_index)
1839    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1840      key_tex(key_tex),
1841      prog_data(prog_data),
1842      fail_msg(NULL),
1843      first_non_payload_grf(0),
1844      live_analysis(this), performance_analysis(this),
1845      need_all_constants_in_pull_buffer(false),
1846      no_spills(no_spills),
1847      shader_time_index(shader_time_index),
1848      last_scratch(0)
1849 {
1850    this->failed = false;
1851
1852    this->base_ir = NULL;
1853    this->current_annotation = NULL;
1854    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1855
1856    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1857
1858    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1859
1860    this->uniforms = 0;
1861
1862    this->nir_locals = NULL;
1863    this->nir_ssa_values = NULL;
1864 }
1865
1866
1867 void
1868 vec4_visitor::fail(const char *format, ...)
1869 {
1870    va_list va;
1871    char *msg;
1872
1873    if (failed)
1874       return;
1875
1876    failed = true;
1877
1878    va_start(va, format);
1879    msg = ralloc_vasprintf(mem_ctx, format, va);
1880    va_end(va);
1881    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1882
1883    this->fail_msg = msg;
1884
1885    if (debug_enabled) {
1886       fprintf(stderr, "%s",  msg);
1887    }
1888 }
1889
1890 } /* namespace brw */