i965: Handle unwritten PSIZ/VIEWPORT/LAYER outputs in vec4 shaders.
[mesa.git] / src / intel / compiler / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 namespace brw {
29
30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31 const src_reg &src0, const src_reg &src1,
32 const src_reg &src2)
33 {
34 this->opcode = opcode;
35 this->dst = dst;
36 this->src[0] = src0;
37 this->src[1] = src1;
38 this->src[2] = src2;
39 this->saturate = false;
40 this->force_writemask_all = false;
41 this->no_dd_clear = false;
42 this->no_dd_check = false;
43 this->writes_accumulator = false;
44 this->conditional_mod = BRW_CONDITIONAL_NONE;
45 this->predicate = BRW_PREDICATE_NONE;
46 this->predicate_inverse = false;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->ir = NULL;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_size = 0;
52 this->flag_subreg = 0;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->exec_size = 8;
57 this->group = 0;
58 this->size_written = (dst.file == BAD_FILE ?
59 0 : this->exec_size * type_sz(dst.type));
60 this->annotation = NULL;
61 }
62
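/**
 * Append \p inst to the end of the instruction list, tagging it with the
 * current base IR node and annotation string so later debug dumps can
 * attribute the instruction back to the source it came from.
 */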
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
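/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out thin builder methods
 * (NOT, MOV, ADD, MAD, ...) that only allocate a vec4_instruction out of
 * mem_ctx; the caller still has to pass the result to emit() to append it
 * to the instruction stream.  ALU2_ACC additionally marks the instruction
 * as writing the accumulator, and ALU3 asserts Gen6+ since three-source
 * instructions don't exist on earlier parts.
 */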
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188 ALU1(DIM)
189
190 /** Gen4 predicated IF. */
191 vec4_instruction *
192 vec4_visitor::IF(enum brw_predicate predicate)
193 {
194 vec4_instruction *inst;
195
196 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
197 inst->predicate = predicate;
198
199 return inst;
200 }
201
202 /** Gen6 IF with embedded comparison. */
203 vec4_instruction *
204 vec4_visitor::IF(src_reg src0, src_reg src1,
205 enum brw_conditional_mod condition)
206 {
207 assert(devinfo->gen == 6);
208
209 vec4_instruction *inst;
210
211 resolve_ud_negate(&src0);
212 resolve_ud_negate(&src1);
213
214 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
215 src0, src1);
216 inst->conditional_mod = condition;
217
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 vec4_instruction *
227 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
228 enum brw_conditional_mod condition)
229 {
230 vec4_instruction *inst;
231
232 /* Take the instruction:
233 *
234 * CMP null<d> src0<f> src1<f>
235 *
236 * Original gen4 does type conversion to the destination type before
237 * comparison, producing garbage results for floating point comparisons.
238 *
239 * The destination type doesn't matter on newer generations, so we set the
240 * type to match src0 so we can compact the instruction.
241 */
242 dst.type = src0.type;
243
244 resolve_ud_negate(&src0);
245 resolve_ud_negate(&src1);
246
247 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
248 inst->conditional_mod = condition;
249
250 return inst;
251 }
252
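/* Builders for the Gen4-style scratch-space messages used for spills.  The
 * payload is assembled in the MRF range reserved for spilling, so these set
 * base_mrf and mlen accordingly: two registers for a read and three for a
 * write (which also carries the data to be written).
 */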
253 vec4_instruction *
254 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
255 {
256 vec4_instruction *inst;
257
258 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
259 dst, index);
260 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
261 inst->mlen = 2;
262
263 return inst;
264 }
265
266 vec4_instruction *
267 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
268 const src_reg &index)
269 {
270 vec4_instruction *inst;
271
272 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
273 dst, src, index);
274 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
275 inst->mlen = 3;
276
277 return inst;
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(const src_reg &src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
303 return src_reg(expanded);
304 }
305
306 src_reg
307 vec4_visitor::resolve_source_modifiers(const src_reg &src)
308 {
309 if (!src.abs && !src.negate)
310 return src;
311
312 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
313 resolved.type = src.type;
314 emit(MOV(resolved, src));
315
316 return src_reg(resolved);
317 }
318
319 src_reg
320 vec4_visitor::fix_math_operand(const src_reg &src)
321 {
322 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
323 return src;
324
325 /* The gen6 math instruction ignores the source modifiers --
326 * swizzle, abs, negate, and at least some parts of the register
327 * region description.
328 *
329 * Rather than trying to enumerate all these cases, *always* expand the
330 * operand to a temp GRF for gen6.
331 *
332 * For gen7, keep the operand as-is, except if immediate, which gen7 still
333 * can't use.
334 */
335
336 if (devinfo->gen == 7 && src.file != IMM)
337 return src;
338
339 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
340 expanded.type = src.type;
341 emit(MOV(expanded, src));
342 return src_reg(expanded);
343 }
344
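/**
 * Emit a math instruction, working around per-generation restrictions:
 * Gen6 MATH must run in align1 mode and so can't honor a partial writemask
 * (we compute into a temporary and MOV the result out), while pre-Gen6
 * math is a send-like operation that needs base_mrf/mlen set up.
 */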
345 vec4_instruction *
346 vec4_visitor::emit_math(enum opcode opcode,
347 const dst_reg &dst,
348 const src_reg &src0, const src_reg &src1)
349 {
350 vec4_instruction *math =
351 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
352
353 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
354 /* MATH on Gen6 must be align1, so we can't do writemasks. */
355 math->dst = dst_reg(this, glsl_type::vec4_type);
356 math->dst.type = dst.type;
357 math = emit(MOV(dst, src_reg(math->dst)));
358 } else if (devinfo->gen < 6) {
359 math->base_mrf = 1;
360 math->mlen = src1.file == BAD_FILE ? 1 : 2;
361 }
362
363 return math;
364 }
365
366 void
367 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
368 {
369 if (devinfo->gen < 7) {
370 unreachable("ir_unop_pack_half_2x16 should be lowered");
371 }
372
373 assert(dst.type == BRW_REGISTER_TYPE_UD);
374 assert(src0.type == BRW_REGISTER_TYPE_F);
375
376 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
377 *
378 * Because this instruction does not have a 16-bit floating-point type,
379 * the destination data type must be Word (W).
380 *
381 * The destination must be DWord-aligned and specify a horizontal stride
382 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
383 * each destination channel and the upper word is not modified.
384 *
385 * The above restriction implies that the f32to16 instruction must use
386 * align1 mode, because only in align1 mode is it possible to specify
387 * horizontal stride. We choose here to defy the hardware docs and emit
388 * align16 instructions.
389 *
390 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
391 * instructions. I was partially successful in that the code passed all
392 * tests. However, the code was dubiously correct and fragile, and the
393 * tests were not harsh enough to probe that frailty. Not trusting the
394 * code, I chose instead to remain in align16 mode in defiance of the hw
395 * docs).
396 *
397 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
398 * simulator, emitting a f32to16 in align16 mode with UD as destination
399 * data type is safe. The behavior differs from that specified in the PRM
400 * in that the upper word of each destination channel is cleared to 0.
401 */
402
403 dst_reg tmp_dst(this, glsl_type::uvec2_type);
404 src_reg tmp_src(tmp_dst);
405
406 #if 0
407 /* Verify the undocumented behavior on which the following instructions
408 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
409 * then the result of the bit-or instruction below will be incorrect.
410 *
411 * You should inspect the disasm output in order to verify that the MOV is
412 * not optimized away.
413 */
414 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
415 #endif
416
417 /* Give tmp the form below, where "." means untouched.
418 *
419 * w z y x w z y x
420 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
421 *
422 * That the upper word of each write-channel be 0 is required for the
423 * following bit-shift and bit-or instructions to work. Note that this
424 * relies on the undocumented hardware behavior mentioned above.
425 */
426 tmp_dst.writemask = WRITEMASK_XY;
427 emit(F32TO16(tmp_dst, src0));
428
429 /* Give the write-channels of dst the form:
430 * 0xhhhh0000
431 */
432 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
433 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
434
435 /* Finally, give the write-channels of dst the form of packHalf2x16's
436 * output:
437 * 0xhhhhllll
438 */
439 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
440 emit(OR(dst, src_reg(dst), tmp_src));
441 }
442
443 void
444 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
445 {
446 if (devinfo->gen < 7) {
447 unreachable("ir_unop_unpack_half_2x16 should be lowered");
448 }
449
450 assert(dst.type == BRW_REGISTER_TYPE_F);
451 assert(src0.type == BRW_REGISTER_TYPE_UD);
452
453 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
454 *
455 * Because this instruction does not have a 16-bit floating-point type,
456 * the source data type must be Word (W). The destination type must be
457 * F (Float).
458 *
459 * To use W as the source data type, we must adjust horizontal strides,
460 * which is only possible in align1 mode. All my [chadv] attempts at
461 * emitting align1 instructions for unpackHalf2x16 failed to pass the
462 * Piglit tests, so I gave up.
463 *
464 * I've verified that, on gen7 hardware and the simulator, it is safe to
465 * emit f16to32 in align16 mode with UD as source data type.
466 */
467
468 dst_reg tmp_dst(this, glsl_type::uvec2_type);
469 src_reg tmp_src(tmp_dst);
470
471 tmp_dst.writemask = WRITEMASK_X;
472 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
473
474 tmp_dst.writemask = WRITEMASK_Y;
475 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
476
477 dst.writemask = WRITEMASK_XY;
478 emit(F16TO32(dst, tmp_src));
479 }
480
481 void
482 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
483 {
484 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
485 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
486 * is not suitable to generate the shift values, but we can use the packed
487 * vector float and a type-converting MOV.
488 */
489 dst_reg shift(this, glsl_type::uvec4_type);
490 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
491
492 dst_reg shifted(this, glsl_type::uvec4_type);
493 src0.swizzle = BRW_SWIZZLE_XXXX;
494 emit(SHR(shifted, src0, src_reg(shift)));
495
496 shifted.type = BRW_REGISTER_TYPE_UB;
497 dst_reg f(this, glsl_type::vec4_type);
498 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
499
500 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
501 }
502
503 void
504 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
505 {
506 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
507 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
508 * is not suitable to generate the shift values, but we can use the packed
509 * vector float and a type-converting MOV.
510 */
511 dst_reg shift(this, glsl_type::uvec4_type);
512 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
513
514 dst_reg shifted(this, glsl_type::uvec4_type);
515 src0.swizzle = BRW_SWIZZLE_XXXX;
516 emit(SHR(shifted, src0, src_reg(shift)));
517
518 shifted.type = BRW_REGISTER_TYPE_B;
519 dst_reg f(this, glsl_type::vec4_type);
520 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
521
522 dst_reg scaled(this, glsl_type::vec4_type);
523 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
524
525 dst_reg max(this, glsl_type::vec4_type);
526 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
527 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
528 }
529
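/**
 * packUnorm4x8(): saturate each component to [0, 1], scale by 255, round
 * to the nearest even integer, convert to unsigned, and pack the four
 * bytes into a single dword.
 */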
530 void
531 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
532 {
533 dst_reg saturated(this, glsl_type::vec4_type);
534 vec4_instruction *inst = emit(MOV(saturated, src0));
535 inst->saturate = true;
536
537 dst_reg scaled(this, glsl_type::vec4_type);
538 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
539
540 dst_reg rounded(this, glsl_type::vec4_type);
541 emit(RNDE(rounded, src_reg(scaled)));
542
543 dst_reg u(this, glsl_type::uvec4_type);
544 emit(MOV(u, src_reg(rounded)));
545
546 src_reg bytes(u);
547 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
548 }
549
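/**
 * packSnorm4x8(): clamp each component to [-1, 1] with a min/max pair,
 * scale by 127, round to the nearest even integer, convert to signed, and
 * pack the four bytes into a single dword.
 */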
550 void
551 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
552 {
553 dst_reg max(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
555
556 dst_reg min(this, glsl_type::vec4_type);
557 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
558
559 dst_reg scaled(this, glsl_type::vec4_type);
560 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
561
562 dst_reg rounded(this, glsl_type::vec4_type);
563 emit(RNDE(rounded, src_reg(scaled)));
564
565 dst_reg i(this, glsl_type::ivec4_type);
566 emit(MOV(i, src_reg(rounded)));
567
568 src_reg bytes(i);
569 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
570 }
571
572 /*
573 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
574 * false) elements needed to pack a type.
575 */
576 static int
577 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 case GLSL_TYPE_DOUBLE:
588 case GLSL_TYPE_UINT64:
589 case GLSL_TYPE_INT64:
590 if (type->is_matrix()) {
591 const glsl_type *col_type = type->column_type();
592 unsigned col_slots =
593 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
594 return type->matrix_columns * col_slots;
595 } else {
596 /* Regardless of the size of the vector, it gets a vec4. This is bad
597 * packing for things like floats, but otherwise arrays become a
598 * mess. Hopefully a later pass over the code can pack scalars
599 * down if appropriate.
600 */
601 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
602 }
603 case GLSL_TYPE_ARRAY:
604 assert(type->length > 0);
605 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
606 case GLSL_TYPE_STRUCT:
607 size = 0;
608 for (i = 0; i < type->length; i++) {
609 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
610 }
611 return size;
612 case GLSL_TYPE_SUBROUTINE:
613 return 1;
614
615 case GLSL_TYPE_SAMPLER:
616 /* Samplers take up no register space, since they're baked in at
617 * link time.
618 */
619 return 0;
620 case GLSL_TYPE_ATOMIC_UINT:
621 return 0;
622 case GLSL_TYPE_IMAGE:
623 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
624 case GLSL_TYPE_VOID:
625 case GLSL_TYPE_ERROR:
626 case GLSL_TYPE_INTERFACE:
627 case GLSL_TYPE_FUNCTION:
628 unreachable("not reached");
629 }
630
631 return 0;
632 }
633
634 /**
635 * Returns the minimum number of vec4 elements needed to pack a type.
636 *
637 * For simple types, it will return 1 (a single vec4); for matrices, the
638 * number of columns; for array and struct, the sum of the vec4_size of
639 * each of its elements; and for sampler and atomic, zero.
640 *
641 * This method is useful to calculate how much register space is needed to
642 * store a particular type.
643 */
644 extern "C" int
645 type_size_vec4(const struct glsl_type *type)
646 {
647 return type_size_xvec4(type, true);
648 }
649
650 /**
651 * Returns the minimum number of dvec4 elements needed to pack a type.
652 *
653 * For simple types, it will return 1 (a single dvec4); for matrices, the
654 * number of columns; for array and struct, the sum of the dvec4_size of
655 * each of its elements; and for sampler and atomic, zero.
656 *
657 * This method is useful to calculate how much register space is needed to
658 * store a particular type.
659 *
660 * Measuring double-precision vertex inputs as dvec4 is required because
661 * ARB_vertex_attrib_64bit states that these use the same number of locations
662 * as the single-precision version. That is, two consecutive dvec4s would be
663 * located in location "x" and location "x+1", not "x+2".
664 *
665 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
666 * remap_vs_attrs() will take into account both the location and whether the
667 * type fits in one or two vec4 slots.
668 */
669 extern "C" int
670 type_size_dvec4(const struct glsl_type *type)
671 {
672 return type_size_xvec4(type, false);
673 }
674
675 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
676 {
677 init();
678
679 this->file = VGRF;
680 this->nr = v->alloc.allocate(type_size_vec4(type));
681
682 if (type->is_array() || type->is_record()) {
683 this->swizzle = BRW_SWIZZLE_NOOP;
684 } else {
685 this->swizzle = brw_swizzle_for_size(type->vector_elements);
686 }
687
688 this->type = brw_type_for_base_type(type);
689 }
690
691 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
692 {
693 assert(size > 0);
694
695 init();
696
697 this->file = VGRF;
698 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
699
700 this->swizzle = BRW_SWIZZLE_NOOP;
701
702 this->type = brw_type_for_base_type(type);
703 }
704
705 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
706 {
707 init();
708
709 this->file = VGRF;
710 this->nr = v->alloc.allocate(type_size_vec4(type));
711
712 if (type->is_array() || type->is_record()) {
713 this->writemask = WRITEMASK_XYZW;
714 } else {
715 this->writemask = (1 << type->vector_elements) - 1;
716 }
717
718 this->type = brw_type_for_base_type(type);
719 }
720
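/**
 * Emit a SEL with the given conditional mod, which is how min/max are
 * expressed on this hardware: BRW_CONDITIONAL_L gives min(src0, src1) and
 * BRW_CONDITIONAL_GE gives max(src0, src1).
 */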
721 vec4_instruction *
722 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
723 src_reg src0, src_reg src1)
724 {
725 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
726 inst->conditional_mod = conditionalmod;
727 return inst;
728 }
729
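/**
 * Emit a linear interpolation, mix(x, y, a) = x * (1 - a) + y * a.  Gen6+
 * can use the three-source LRP instruction directly (with the operands
 * reordered as the hardware expects); earlier parts open-code the multiply
 * and add sequence.
 */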
730 vec4_instruction *
731 vec4_visitor::emit_lrp(const dst_reg &dst,
732 const src_reg &x, const src_reg &y, const src_reg &a)
733 {
734 if (devinfo->gen >= 6) {
735 /* Note that the instruction's argument order is reversed from GLSL
736 * and the IR.
737 */
738 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
739 fix_3src_operand(x)));
740 } else {
741 /* Earlier generations don't support three source operations, so we
742 * need to emit x*(1-a) + y*a.
743 */
744 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
745 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
746 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
747 y_times_a.writemask = dst.writemask;
748 one_minus_a.writemask = dst.writemask;
749 x_times_one_minus_a.writemask = dst.writemask;
750
751 emit(MUL(y_times_a, y, a));
752 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
753 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
754 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
755 }
756 }
757
758 /**
759 * Emits the instructions needed to perform a pull constant load. before_block
760 * and before_inst can be NULL, in which case the instruction will be appended
761 * to the end of the instruction list.
762 */
763 void
764 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
765 src_reg surf_index,
766 src_reg offset_reg,
767 bblock_t *before_block,
768 vec4_instruction *before_inst)
769 {
770 assert((before_inst == NULL && before_block == NULL) ||
771 (before_inst && before_block));
772
773 vec4_instruction *pull;
774
775 if (devinfo->gen >= 9) {
776 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
777 src_reg header(this, glsl_type::uvec4_type, 2);
778
779 pull = new(mem_ctx)
780 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
781 dst_reg(header));
782
783 if (before_inst)
784 emit_before(before_block, before_inst, pull);
785 else
786 emit(pull);
787
788 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
789 offset_reg.type);
790 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
791
792 if (before_inst)
793 emit_before(before_block, before_inst, pull);
794 else
795 emit(pull);
796
797 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
798 dst,
799 surf_index,
800 header);
801 pull->mlen = 2;
802 pull->header_size = 1;
803 } else if (devinfo->gen >= 7) {
804 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
805
806 grf_offset.type = offset_reg.type;
807
808 pull = MOV(grf_offset, offset_reg);
809
810 if (before_inst)
811 emit_before(before_block, before_inst, pull);
812 else
813 emit(pull);
814
815 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
816 dst,
817 surf_index,
818 src_reg(grf_offset));
819 pull->mlen = 1;
820 } else {
821 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
822 dst,
823 surf_index,
824 offset_reg);
825 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
826 pull->mlen = 1;
827 }
828
829 if (before_inst)
830 emit_before(before_block, before_inst, pull);
831 else
832 emit(pull);
833 }
834
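/**
 * Produce a value that is guaranteed to be uniform across channels by
 * finding the first live channel and broadcasting that channel's value of
 * \p src to every channel of the result.
 */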
835 src_reg
836 vec4_visitor::emit_uniformize(const src_reg &src)
837 {
838 const src_reg chan_index(this, glsl_type::uint_type);
839 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
840 src.type);
841
842 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
843 ->force_writemask_all = true;
844 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
845 ->force_writemask_all = true;
846
847 return src_reg(dst);
848 }
849
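/**
 * Fetch the MCS (multisample control surface) data for a texel with a
 * TXF_MCS message; the result is later passed as the 'mcs' argument of a
 * compressed multisample fetch in emit_texture().
 */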
850 src_reg
851 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
852 src_reg coordinate, src_reg surface)
853 {
854 vec4_instruction *inst =
855 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
856 dst_reg(this, glsl_type::uvec4_type));
857 inst->base_mrf = 2;
858 inst->src[1] = surface;
859 inst->src[2] = surface;
860
861 int param_base;
862
863 if (devinfo->gen >= 9) {
864 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
865 vec4_instruction *header_inst = new(mem_ctx)
866 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
867 dst_reg(MRF, inst->base_mrf));
868
869 emit(header_inst);
870
871 inst->mlen = 2;
872 inst->header_size = 1;
873 param_base = inst->base_mrf + 1;
874 } else {
875 inst->mlen = 1;
876 param_base = inst->base_mrf;
877 }
878
879 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
880 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
881 int zero_mask = 0xf & ~coord_mask;
882
883 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
884 coordinate));
885
886 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
887 brw_imm_d(0)));
888
889 emit(inst);
890 return src_reg(inst->dst);
891 }
892
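/* Sampler indices above 15 (or indices that aren't compile-time constants)
 * can't be encoded directly in the sampler message descriptor and need the
 * message-header path instead.  This only comes into play on Haswell and
 * later; older parts always return false here.
 */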
893 bool
894 vec4_visitor::is_high_sampler(src_reg sampler)
895 {
896 if (devinfo->gen < 8 && !devinfo->is_haswell)
897 return false;
898
899 return sampler.file != IMM || sampler.ud >= 16;
900 }
901
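/**
 * Emit a SIMD4x2 sampler message: pick the hardware opcode for the IR
 * texture operation, decide whether a message header is needed, load the
 * coordinate, shadow comparator, LOD/derivatives, sample index/MCS and
 * gather offset into consecutive MRFs, and apply the post-fixups some
 * operations need (Gen4-6 TXS layer clamping, Gen6 gather workarounds,
 * moving the level count for textureQueryLevels()).
 */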
902 void
903 vec4_visitor::emit_texture(ir_texture_opcode op,
904 dst_reg dest,
905 const glsl_type *dest_type,
906 src_reg coordinate,
907 int coord_components,
908 src_reg shadow_comparator,
909 src_reg lod, src_reg lod2,
910 src_reg sample_index,
911 uint32_t constant_offset,
912 src_reg offset_value,
913 src_reg mcs,
914 uint32_t surface,
915 src_reg surface_reg,
916 src_reg sampler_reg)
917 {
918 /* The sampler can only meaningfully compute LOD for fragment shader
919 * messages. For all other stages, we change the opcode to TXL and hardcode
920 * the LOD to 0.
921 *
922 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
923 * valid LOD argument.
924 */
925 if (op == ir_tex || op == ir_query_levels) {
926 assert(lod.file == BAD_FILE);
927 lod = brw_imm_f(0.0f);
928 }
929
930 enum opcode opcode;
931 switch (op) {
932 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
933 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
934 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
935 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
936 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
937 SHADER_OPCODE_TXF_CMS); break;
938 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
939 case ir_tg4: opcode = offset_value.file != BAD_FILE
940 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
941 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
942 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
943 case ir_txb:
944 unreachable("TXB is not valid for vertex shaders.");
945 case ir_lod:
946 unreachable("LOD is not valid for vertex shaders.");
947 case ir_samples_identical: {
948 /* There are some challenges implementing this for vec4, and it seems
949 * unlikely to be used anyway. For now, just always return false.
950 */
951 emit(MOV(dest, brw_imm_ud(0u)));
952 return;
953 }
954 default:
955 unreachable("Unrecognized tex op");
956 }
957
958 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
959
960 inst->offset = constant_offset;
961
962 /* The message header is necessary for:
963 * - Gen4 (always)
964 * - Gen9+ for selecting SIMD4x2
965 * - Texel offsets
966 * - Gather channel selection
967 * - Sampler indices too large to fit in a 4-bit value.
968 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
969 */
970 inst->header_size =
971 (devinfo->gen < 5 || devinfo->gen >= 9 ||
972 inst->offset != 0 || op == ir_tg4 ||
973 op == ir_texture_samples ||
974 is_high_sampler(sampler_reg)) ? 1 : 0;
975 inst->base_mrf = 2;
976 inst->mlen = inst->header_size;
977 inst->dst.writemask = WRITEMASK_XYZW;
978 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
979
980 inst->src[1] = surface_reg;
981 inst->src[2] = sampler_reg;
982
983 /* MRF for the first parameter */
984 int param_base = inst->base_mrf + inst->header_size;
985
986 if (op == ir_txs || op == ir_query_levels) {
987 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
988 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
989 inst->mlen++;
990 } else if (op == ir_texture_samples) {
991 inst->dst.writemask = WRITEMASK_X;
992 } else {
993 /* Load the coordinate */
994 /* FINISHME: gl_clamp_mask and saturate */
995 int coord_mask = (1 << coord_components) - 1;
996 int zero_mask = 0xf & ~coord_mask;
997
998 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
999 coordinate));
1000 inst->mlen++;
1001
1002 if (zero_mask != 0) {
1003 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1004 brw_imm_d(0)));
1005 }
1006 /* Load the shadow comparator */
1007 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1008 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1009 WRITEMASK_X),
1010 shadow_comparator));
1011 inst->mlen++;
1012 }
1013
1014 /* Load the LOD info */
1015 if (op == ir_tex || op == ir_txl) {
1016 int mrf, writemask;
1017 if (devinfo->gen >= 5) {
1018 mrf = param_base + 1;
1019 if (shadow_comparator.file != BAD_FILE) {
1020 writemask = WRITEMASK_Y;
1021 /* mlen already incremented */
1022 } else {
1023 writemask = WRITEMASK_X;
1024 inst->mlen++;
1025 }
1026 } else /* devinfo->gen == 4 */ {
1027 mrf = param_base;
1028 writemask = WRITEMASK_W;
1029 }
1030 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1031 } else if (op == ir_txf) {
1032 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1033 } else if (op == ir_txf_ms) {
1034 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1035 sample_index));
1036 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1037 /* MCS data is stored in the first two channels of ‘mcs’, but we
1038 * need to get it into the .y and .z channels of the second vec4
1039 * of params.
1040 */
1041 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1042 emit(MOV(dst_reg(MRF, param_base + 1,
1043 glsl_type::uint_type, WRITEMASK_YZ),
1044 mcs));
1045 } else if (devinfo->gen >= 7) {
1046 /* MCS data is in the first channel of `mcs`, but we need to get it into
1047 * the .y channel of the second vec4 of params, so replicate .x across
1048 * the whole vec4 and then mask off everything except .y
1049 */
1050 mcs.swizzle = BRW_SWIZZLE_XXXX;
1051 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1052 mcs));
1053 }
1054 inst->mlen++;
1055 } else if (op == ir_txd) {
1056 const brw_reg_type type = lod.type;
1057
1058 if (devinfo->gen >= 5) {
1059 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1060 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1061 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1062 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1063 inst->mlen++;
1064
1065 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1066 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1067 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1068 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1069 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1070 inst->mlen++;
1071
1072 if (shadow_comparator.file != BAD_FILE) {
1073 emit(MOV(dst_reg(MRF, param_base + 2,
1074 shadow_comparator.type, WRITEMASK_Z),
1075 shadow_comparator));
1076 }
1077 }
1078 } else /* devinfo->gen == 4 */ {
1079 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1080 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1081 inst->mlen += 2;
1082 }
1083 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1084 if (shadow_comparator.file != BAD_FILE) {
1085 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1086 shadow_comparator));
1087 }
1088
1089 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1090 offset_value));
1091 inst->mlen++;
1092 }
1093 }
1094
1095 emit(inst);
1096
1097 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1098 * spec requires layers.
1099 */
1100 if (op == ir_txs && devinfo->gen < 7) {
1101 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1102 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1103 src_reg(inst->dst), brw_imm_d(1));
1104 }
1105
1106 if (devinfo->gen == 6 && op == ir_tg4) {
1107 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1108 }
1109
1110 if (op == ir_query_levels) {
1111 /* # levels is in .w */
1112 src_reg swizzled(dest);
1113 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1114 SWIZZLE_W, SWIZZLE_W);
1115 emit(MOV(dest, swizzled));
1116 }
1117 }
1118
1119 /**
1120 * Apply workarounds for Gen6 gather with UINT/SINT
1121 */
1122 void
1123 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1124 {
1125 if (!wa)
1126 return;
1127
1128 int width = (wa & WA_8BIT) ? 8 : 16;
1129 dst_reg dst_f = dst;
1130 dst_f.type = BRW_REGISTER_TYPE_F;
1131
1132 /* Convert from UNORM to UINT */
1133 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1134 emit(MOV(dst, src_reg(dst_f)));
1135
1136 if (wa & WA_SIGN) {
1137 /* Reinterpret the UINT value as a signed INT value by
1138 * shifting the sign bit into place, then shifting back
1139 * preserving sign.
1140 */
1141 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1142 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1143 }
1144 }
1145
1146 void
1147 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1148 {
1149 unreachable("not reached");
1150 }
1151
1152 void
1153 vec4_visitor::gs_end_primitive()
1154 {
1155 unreachable("not reached");
1156 }
1157
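/**
 * Compute the NDC slot used by the pre-Gen6 fixed function: 1/W via RCP
 * into the .w channel, then (x, y, z) * 1/W into .xyz.  Skipped entirely
 * if the shader never wrote gl_Position.
 */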
1158 void
1159 vec4_visitor::emit_ndc_computation()
1160 {
1161 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1162 return;
1163
1164 /* Get the position */
1165 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1166
1167 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1168 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1169 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1170 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1171
1172 current_annotation = "NDC";
1173 dst_reg ndc_w = ndc;
1174 ndc_w.writemask = WRITEMASK_W;
1175 src_reg pos_w = pos;
1176 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1177 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1178
1179 dst_reg ndc_xyz = ndc;
1180 ndc_xyz.writemask = WRITEMASK_XYZ;
1181
1182 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1183 }
1184
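/**
 * Fill the VUE header dword holding point size and friends.  On Gen4-5
 * this packs point size and the user clip flags (plus the negative-RHW
 * clipping workaround) into a single header word; on Gen6+ it writes point
 * size, layer and viewport index into the .w/.y/.z channels of the slot.
 * Outputs the shader never wrote are left at the zero the slot is
 * initialized with.
 */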
1185 void
1186 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1187 {
1188 if (devinfo->gen < 6 &&
1189 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1190 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1191 devinfo->has_negative_rhw_bug)) {
1192 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1193 dst_reg header1_w = header1;
1194 header1_w.writemask = WRITEMASK_W;
1195
1196 emit(MOV(header1, brw_imm_ud(0u)));
1197
1198 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1199 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1200
1201 current_annotation = "Point size";
1202 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1203 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1204 }
1205
1206 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1207 current_annotation = "Clipping flags";
1208 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1209 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1210
1211 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1212 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1213 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1214
1215 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1216 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1217 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1218 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1219 }
1220
1221 /* i965 clipping workaround:
1222 * 1) Test for -ve rhw
1223 * 2) If set,
1224 * set ndc = (0,0,0,0)
1225 * set ucp[6] = 1
1226 *
1227 * Later, clipping will detect ucp[6] and ensure the primitive is
1228 * clipped against all fixed planes.
1229 */
1230 if (devinfo->has_negative_rhw_bug &&
1231 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1232 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1233 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1234 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1235 vec4_instruction *inst;
1236 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1237 inst->predicate = BRW_PREDICATE_NORMAL;
1238 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1239 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1240 inst->predicate = BRW_PREDICATE_NORMAL;
1241 }
1242
1243 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1244 } else if (devinfo->gen < 6) {
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1246 } else {
1247 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1248 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1249 dst_reg reg_w = reg;
1250 reg_w.writemask = WRITEMASK_W;
1251 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1252 reg_as_src.type = reg_w.type;
1253 reg_as_src.swizzle = brw_swizzle_for_size(1);
1254 emit(MOV(reg_w, reg_as_src));
1255 }
1256 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1257 dst_reg reg_y = reg;
1258 reg_y.writemask = WRITEMASK_Y;
1259 reg_y.type = BRW_REGISTER_TYPE_D;
1260 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1261 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1262 }
1263 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1264 dst_reg reg_z = reg;
1265 reg_z.writemask = WRITEMASK_Z;
1266 reg_z.type = BRW_REGISTER_TYPE_D;
1267 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1268 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1269 }
1270 }
1271 }
1272
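/**
 * Copy one packed component group of a generic varying into the URB write
 * payload register \p reg, using the writemask that matches how many
 * components were packed at this offset.  Returns NULL if nothing was ever
 * written to that part of the output.
 */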
1273 vec4_instruction *
1274 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1275 {
1276 assert(varying < VARYING_SLOT_MAX);
1277
1278 unsigned num_comps = output_num_components[varying][component];
1279 if (num_comps == 0)
1280 return NULL;
1281
1282 assert(output_reg[varying][component].type == reg.type);
1283 current_annotation = output_reg_annotation[varying];
1284 if (output_reg[varying][component].file != BAD_FILE) {
1285 src_reg src = src_reg(output_reg[varying][component]);
1286 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1287 reg.writemask =
1288 brw_writemask_for_component_packing(num_comps, component);
1289 return emit(MOV(reg, src));
1290 }
1291 return NULL;
1292 }
1293
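/**
 * Fill one slot of the VUE being built in the MRFs: PSIZ, NDC, POS and the
 * edge flag get dedicated handling, padding slots are skipped, and every
 * other varying goes through emit_generic_urb_slot() per component group.
 */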
1294 void
1295 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1296 {
1297 reg.type = BRW_REGISTER_TYPE_F;
1298 output_reg[varying][0].type = reg.type;
1299
1300 switch (varying) {
1301 case VARYING_SLOT_PSIZ:
1302 {
1303 /* PSIZ is always in slot 0, and is coupled with other flags. */
1304 current_annotation = "indices, point width, clip flags";
1305 emit_psiz_and_flags(reg);
1306 break;
1307 }
1308 case BRW_VARYING_SLOT_NDC:
1309 current_annotation = "NDC";
1310 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1311 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1312 break;
1313 case VARYING_SLOT_POS:
1314 current_annotation = "gl_Position";
1315 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1316 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1317 break;
1318 case VARYING_SLOT_EDGE: {
1319 /* This is present when doing unfilled polygons. We're supposed to copy
1320 * the edge flag from the user-provided vertex array
1321 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1322 * of that attribute (starts as 1.0f). This is then used in clipping to
1323 * determine which edges should be drawn as wireframe.
1324 */
1325 current_annotation = "edge flag";
1326 int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1327 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1328 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1329 glsl_type::float_type, WRITEMASK_XYZW))));
1330 break;
1331 }
1332 case BRW_VARYING_SLOT_PAD:
1333 /* No need to write to this slot */
1334 break;
1335 default:
1336 for (int i = 0; i < 4; i++) {
1337 emit_generic_urb_slot(reg, varying, i);
1338 }
1339 break;
1340 }
1341 }
1342
1343 static int
1344 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1345 {
1346 if (devinfo->gen >= 6) {
1347 /* URB data written (does not include the message header reg) must
1348 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1349 * section 5.4.3.2.2: URB_INTERLEAVED.
1350 *
1351 * URB entries are allocated on a multiple of 1024 bits, so an
1352 * extra 128 bits written here to make the end align to 256 is
1353 * no problem.
1354 */
1355 if ((mlen % 2) != 1)
1356 mlen++;
1357 }
1358
1359 return mlen;
1360 }
1361
1362
1363 /**
1364 * Generates the VUE payload plus the necessary URB write instructions to
1365 * output it.
1366 *
1367 * The VUE layout is documented in Volume 2a.
1368 */
1369 void
1370 vec4_visitor::emit_vertex()
1371 {
1372 /* MRF 0 is reserved for the debugger, so start with message header
1373 * in MRF 1.
1374 */
1375 int base_mrf = 1;
1376 int mrf = base_mrf;
1377 /* In the process of generating our URB write message contents, we
1378 * may need to unspill a register or load from an array. Those
1379 * reads would use MRFs 14-15.
1380 */
1381 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1382
1383 /* The following assertion verifies that max_usable_mrf causes an
1384 * even-numbered amount of URB write data, which will meet gen6's
1385 * requirements for length alignment.
1386 */
1387 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1388
1389 /* First mrf is the g0-based message header containing URB handles and
1390 * such.
1391 */
1392 emit_urb_write_header(mrf++);
1393
1394 if (devinfo->gen < 6) {
1395 emit_ndc_computation();
1396 }
1397
1398 /* We may need to split this up into several URB writes, so do them in a
1399 * loop.
1400 */
1401 int slot = 0;
1402 bool complete = false;
1403 do {
1404 /* URB offset is in URB row increments, and each of our MRFs is half of
1405 * one of those, since we're doing interleaved writes.
1406 */
1407 int offset = slot / 2;
1408
1409 mrf = base_mrf + 1;
1410 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1411 emit_urb_slot(dst_reg(MRF, mrf++),
1412 prog_data->vue_map.slot_to_varying[slot]);
1413
1414 /* If this was max_usable_mrf, we can't fit anything more into this
1415 * URB WRITE. Same thing if we reached the maximum length available.
1416 */
1417 if (mrf > max_usable_mrf ||
1418 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1419 slot++;
1420 break;
1421 }
1422 }
1423
1424 complete = slot >= prog_data->vue_map.num_slots;
1425 current_annotation = "URB write";
1426 vec4_instruction *inst = emit_urb_write_opcode(complete);
1427 inst->base_mrf = base_mrf;
1428 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1429 inst->offset += offset;
1430 } while(!complete);
1431 }
1432
1433
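/**
 * Compute the per-message scratch offset for a spill access: scale the
 * vec4 offset by 2 because scratch data is stored interleaved like vertex
 * data (and by 16 more on Gen4-5, where the message header wants byte
 * offsets), folding in the indirect reladdr term when there is one.
 */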
1434 src_reg
1435 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1436 src_reg *reladdr, int reg_offset)
1437 {
1438 /* Because we store the values to scratch interleaved like our
1439 * vertex data, we need to scale the vec4 index by 2.
1440 */
1441 int message_header_scale = 2;
1442
1443 /* Pre-gen6, the message header uses byte offsets instead of vec4
1444 * (16-byte) offset units.
1445 */
1446 if (devinfo->gen < 6)
1447 message_header_scale *= 16;
1448
1449 if (reladdr) {
1450 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1451 * to multiply the reladdr by 2. Notice that the reg_offset part
1452 * is in units of 16 bytes and is used to select the low/high 16-byte
1453 * chunk of a full dvec4, so we don't want to multiply that part.
1454 */
1455 src_reg index = src_reg(this, glsl_type::int_type);
1456 if (type_sz(inst->dst.type) < 8) {
1457 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1458 brw_imm_d(reg_offset)));
1459 emit_before(block, inst, MUL(dst_reg(index), index,
1460 brw_imm_d(message_header_scale)));
1461 } else {
1462 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1463 brw_imm_d(message_header_scale * 2)));
1464 emit_before(block, inst, ADD(dst_reg(index), index,
1465 brw_imm_d(reg_offset * message_header_scale)));
1466 }
1467 return index;
1468 } else {
1469 return brw_imm_d(reg_offset * message_header_scale);
1470 }
1471 }
1472
1473 /**
1474 * Emits an instruction before @inst to load the value named by @orig_src
1475 * from scratch space at @base_offset to @temp.
1476 *
1477 * @base_offset is measured in 32-byte units (the size of a register).
1478 */
1479 void
1480 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1481 dst_reg temp, src_reg orig_src,
1482 int base_offset)
1483 {
1484 assert(orig_src.offset % REG_SIZE == 0);
1485 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1486 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1487 reg_offset);
1488
1489 if (type_sz(orig_src.type) < 8) {
1490 emit_before(block, inst, SCRATCH_READ(temp, index));
1491 } else {
1492 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1493 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1494 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1495 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1496 vec4_instruction *last_read =
1497 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1498 emit_before(block, inst, last_read);
1499 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1500 }
1501 }
1502
1503 /**
1504 * Emits an instruction after @inst to store the value to be written
1505 * to @orig_dst to scratch space at @base_offset, from @temp.
1506 *
1507 * @base_offset is measured in 32-byte units (the size of a register).
1508 */
1509 void
1510 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1511 int base_offset)
1512 {
1513 assert(inst->dst.offset % REG_SIZE == 0);
1514 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1515 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1516 reg_offset);
1517
1518 /* Create a temporary register to store *inst's result in.
1519 *
1520 * We have to be careful in MOVing from our temporary result register in
1521 * the scratch write. If we swizzle from channels of the temporary that
1522 * weren't initialized, it will confuse live interval analysis, which will
1523 * make spilling fail to make progress.
1524 */
1525 bool is_64bit = type_sz(inst->dst.type) == 8;
1526 const glsl_type *alloc_type =
1527 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1528 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1529 inst->dst.type),
1530 brw_swizzle_for_mask(inst->dst.writemask));
1531
1532 if (!is_64bit) {
1533 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1534 inst->dst.writemask));
1535 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1536 if (inst->opcode != BRW_OPCODE_SEL)
1537 write->predicate = inst->predicate;
1538 write->ir = inst->ir;
1539 write->annotation = inst->annotation;
1540 inst->insert_after(block, write);
1541 } else {
1542 dst_reg shuffled = dst_reg(this, alloc_type);
1543 vec4_instruction *last =
1544 shuffle_64bit_data(shuffled, temp, true, block, inst);
1545 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1546
1547 uint8_t mask = 0;
1548 if (inst->dst.writemask & WRITEMASK_X)
1549 mask |= WRITEMASK_XY;
1550 if (inst->dst.writemask & WRITEMASK_Y)
1551 mask |= WRITEMASK_ZW;
1552 if (mask) {
1553 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1554
1555 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1556 if (inst->opcode != BRW_OPCODE_SEL)
1557 write->predicate = inst->predicate;
1558 write->ir = inst->ir;
1559 write->annotation = inst->annotation;
1560 last->insert_after(block, write);
1561 }
1562
1563 mask = 0;
1564 if (inst->dst.writemask & WRITEMASK_Z)
1565 mask |= WRITEMASK_XY;
1566 if (inst->dst.writemask & WRITEMASK_W)
1567 mask |= WRITEMASK_ZW;
1568 if (mask) {
1569 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1570
1571 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1572 reg_offset + 1);
1573 vec4_instruction *write =
1574 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1575 if (inst->opcode != BRW_OPCODE_SEL)
1576 write->predicate = inst->predicate;
1577 write->ir = inst->ir;
1578 write->annotation = inst->annotation;
1579 last->insert_after(block, write);
1580 }
1581 }
1582
1583 inst->dst.file = temp.file;
1584 inst->dst.nr = temp.nr;
1585 inst->dst.offset %= REG_SIZE;
1586 inst->dst.reladdr = NULL;
1587 }
1588
1589 /**
1590 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1591 * adds the scratch read(s) before \p inst. The function also checks for
1592 * recursive reladdr scratch accesses, issuing the corresponding scratch
1593 * loads and rewriting reladdr references accordingly.
1594 *
1595 * \return \p src if it did not require a scratch load, otherwise, the
1596 * register holding the result of the scratch load that the caller should
1597 * use to rewrite src.
1598 */
1599 src_reg
1600 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1601 vec4_instruction *inst, src_reg src)
1602 {
1603 /* Resolve recursive reladdr scratch access by calling ourselves
1604 * with src.reladdr
1605 */
1606 if (src.reladdr)
1607 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1608 *src.reladdr);
1609
1610 /* Now handle scratch access on src */
1611 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1612 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1613 glsl_type::dvec4_type : glsl_type::vec4_type);
1614 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1615 src.nr = temp.nr;
1616 src.offset %= REG_SIZE;
1617 src.reladdr = NULL;
1618 }
1619
1620 return src;
1621 }
1622
1623 /**
1624 * We can't generally support array access in GRF space, because a
1625 * single instruction's destination can only span 2 contiguous
1626 * registers. So, we send all GRF arrays that get variable index
1627 * access to scratch space.
1628 */
1629 void
1630 vec4_visitor::move_grf_array_access_to_scratch()
1631 {
1632 int scratch_loc[this->alloc.count];
1633 memset(scratch_loc, -1, sizeof(scratch_loc));
1634
1635 /* First, calculate the set of virtual GRFs that need to be punted
1636 * to scratch due to having any array access on them, and where in
1637 * scratch.
1638 */
1639 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1640 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1641 if (scratch_loc[inst->dst.nr] == -1) {
1642 scratch_loc[inst->dst.nr] = last_scratch;
1643 last_scratch += this->alloc.sizes[inst->dst.nr];
1644 }
1645
1646 for (src_reg *iter = inst->dst.reladdr;
1647 iter->reladdr;
1648 iter = iter->reladdr) {
1649 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1650 scratch_loc[iter->nr] = last_scratch;
1651 last_scratch += this->alloc.sizes[iter->nr];
1652 }
1653 }
1654 }
1655
1656 for (int i = 0 ; i < 3; i++) {
1657 for (src_reg *iter = &inst->src[i];
1658 iter->reladdr;
1659 iter = iter->reladdr) {
1660 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1661 scratch_loc[iter->nr] = last_scratch;
1662 last_scratch += this->alloc.sizes[iter->nr];
1663 }
1664 }
1665 }
1666 }
1667
1668 /* Now, for anything that will be accessed through scratch, rewrite
1669 * it to load/store. Note that this is a _safe list walk, because
1670 * we may generate a new scratch_write instruction after the one
1671 * we're processing.
1672 */
1673 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1674 /* Set up the annotation tracking for new generated instructions. */
1675 base_ir = inst->ir;
1676 current_annotation = inst->annotation;
1677
1678 /* First handle scratch access on the dst. Notice we have to handle
1679 * the case where the dst's reladdr also points to scratch space.
1680 */
1681 if (inst->dst.reladdr)
1682 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1683 *inst->dst.reladdr);
1684
1685 /* Now that we have handled any (possibly recursive) reladdr scratch
1686 * accesses for dst we can safely do the scratch write for dst itself
1687 */
1688 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1689 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1690
1691 /* Now handle scratch access on any src. In this case, since inst->src[i]
1692 * already is a src_reg, we can just call emit_resolve_reladdr with
1693 * inst->src[i] and it will take care of handling scratch loads for
1694 * both src and src.reladdr (recursively).
1695 */
1696 for (int i = 0 ; i < 3; i++) {
1697 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1698 inst->src[i]);
1699 }
1700 }
1701 }
1702
1703 /**
1704 * Emits an instruction before @inst to load the value named by @orig_src
1705 * from the pull constant buffer (surface) at @base_offset to @temp.
1706 */
1707 void
1708 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1709 dst_reg temp, src_reg orig_src,
1710 int base_offset, src_reg indirect)
1711 {
1712 assert(orig_src.offset % 16 == 0);
1713 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1714
1715 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1716 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1717 * that we emit the 32-bit loads into a temporary and we shuffle the result
1718 * into the original destination.
1719 */
1720 dst_reg orig_temp = temp;
1721 bool is_64bit = type_sz(orig_src.type) == 8;
1722 if (is_64bit) {
1723 assert(type_sz(temp.type) == 8);
1724 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1725 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1726 }
1727
1728 src_reg src = orig_src;
1729 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1730 int reg_offset = base_offset + src.offset / 16;
1731
1732 src_reg offset;
1733 if (indirect.file != BAD_FILE) {
1734 offset = src_reg(this, glsl_type::uint_type);
1735 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1736 brw_imm_ud(reg_offset * 16)));
1737 } else if (devinfo->gen >= 8) {
1738 /* Store the offset in a GRF so we can send-from-GRF. */
1739 offset = src_reg(this, glsl_type::uint_type);
1740 emit_before(block, inst, MOV(dst_reg(offset),
1741 brw_imm_ud(reg_offset * 16)));
1742 } else {
1743 offset = brw_imm_d(reg_offset * 16);
1744 }
1745
1746 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1747 brw_imm_ud(index),
1748 offset,
1749 block, inst);
1750
1751 src = byte_offset(src, 16);
1752 }
1753
1754 brw_mark_surface_used(&prog_data->base, index);
1755
1756 if (is_64bit) {
1757 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1758 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1759 }
1760 }
1761
1762 /**
1763 * Implements array access of uniforms by inserting a
1764 * PULL_CONSTANT_LOAD instruction.
1765 *
1766 * Unlike temporary GRF array access (where we don't support it due to
1767 * the difficulty of doing relative addressing on instruction
1768 * destinations), we could potentially do array access of uniforms
1769 * that were loaded in GRF space as push constants. In real-world
1770 * usage we've seen, though, the arrays being used are always larger
1771 * than we could load as push constants, so just always move all
1772 * uniform array access out to a pull constant buffer.
1773 */
1774 void
1775 vec4_visitor::move_uniform_array_access_to_pull_constants()
1776 {
1777 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1778 * everything has to be pushed regardless.
1779 */
1780 if (stage_prog_data->pull_param == NULL) {
1781 split_uniform_registers();
1782 return;
1783 }
1784
1785 int pull_constant_loc[this->uniforms];
1786 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1787
1788 /* First, walk through the instructions and determine which things need to
1789 * be pulled. We mark something as needing to be pulled by setting
1790 * pull_constant_loc to 0.
1791 */
1792 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1793 /* We only care about MOV_INDIRECT of a uniform */
1794 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1795 inst->src[0].file != UNIFORM)
1796 continue;
1797
1798 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1799
1800 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1801 pull_constant_loc[uniform_nr + j] = 0;
1802 }
1803
1804 /* Next, we walk the list of uniforms and assign real pull constant
1805 * locations and set their corresponding entries in pull_param.
1806 */
1807 for (int j = 0; j < this->uniforms; j++) {
1808 if (pull_constant_loc[j] < 0)
1809 continue;
1810
1811 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1812
1813 for (int i = 0; i < 4; i++) {
1814 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1815 = stage_prog_data->param[j * 4 + i];
1816 }
1817 }
1818
1819 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1820 * instructions to actual uniform pulls.
1821 */
1822 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1823 /* We only care about MOV_INDIRECT of a uniform */
1824 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1825 inst->src[0].file != UNIFORM)
1826 continue;
1827
1828 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1829
1830 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1831
1832 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1833 pull_constant_loc[uniform_nr], inst->src[1]);
1834 inst->remove(block);
1835 }
1836
1837 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1838 * no need to track them as larger-than-vec4 objects. This will be
1839 * relied on in cutting out unused uniform vectors from push
1840 * constants.
1841 */
1842 split_uniform_registers();
1843 }
1844
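/* If an unsigned source carries the negate modifier, copy it through a MOV
 * into a temporary so the negation is materialized before the consuming
 * instruction; the IF and CMP builders above call this on their sources.
 */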
1845 void
1846 vec4_visitor::resolve_ud_negate(src_reg *reg)
1847 {
1848 if (reg->type != BRW_REGISTER_TYPE_UD ||
1849 !reg->negate)
1850 return;
1851
1852 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1853 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1854 *reg = temp;
1855 }
1856
1857 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1858 void *log_data,
1859 const struct brw_sampler_prog_key_data *key_tex,
1860 struct brw_vue_prog_data *prog_data,
1861 const nir_shader *shader,
1862 void *mem_ctx,
1863 bool no_spills,
1864 int shader_time_index)
1865 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1866 key_tex(key_tex),
1867 prog_data(prog_data),
1868 fail_msg(NULL),
1869 first_non_payload_grf(0),
1870 need_all_constants_in_pull_buffer(false),
1871 no_spills(no_spills),
1872 shader_time_index(shader_time_index),
1873 last_scratch(0)
1874 {
1875 this->failed = false;
1876
1877 this->base_ir = NULL;
1878 this->current_annotation = NULL;
1879 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1880
1881 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1882
1883 this->virtual_grf_start = NULL;
1884 this->virtual_grf_end = NULL;
1885 this->live_intervals = NULL;
1886
1887 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1888
1889 this->uniforms = 0;
1890 }
1891
1892 vec4_visitor::~vec4_visitor()
1893 {
1894 }
1895
1896
1897 void
1898 vec4_visitor::fail(const char *format, ...)
1899 {
1900 va_list va;
1901 char *msg;
1902
1903 if (failed)
1904 return;
1905
1906 failed = true;
1907
1908 va_start(va, format);
1909 msg = ralloc_vasprintf(mem_ctx, format, va);
1910 va_end(va);
1911 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1912
1913 this->fail_msg = msg;
1914
1915 if (debug_enabled) {
1916 fprintf(stderr, "%s", msg);
1917 }
1918 }
1919
1920 } /* namespace brw */