glsl/types: rename is_dual_slot_double to is_dual_slot_64bit.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
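   /* Overview of the sequence below, as a reading aid: F32TO16 writes the
    * half-float results into the low words of tmp.x and tmp.y, SHL of tmp.y
    * by 16 places the high half in dst as 0xhhhh0000, and the final OR with
    * tmp.x fills in the low half to give 0xhhhhllll.
    */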
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
464
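   /* Unpacking is the reverse of the packing above: AND extracts the low
    * half-float into tmp.x, SHR extracts the high half into tmp.y, and a
    * single F16TO32 converts both halves into the .x and .y channels of dst.
    */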
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
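      *
      * The vector-float immediate used below encodes the values (0.0, 8.0,
      * 16.0, 24.0), so MOVing it into the uvec4 `shift` register converts
      * those into the integer shift counts <0, 8, 16, 24>.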
485 */
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /*
570 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
571 * false) elements needed to pack a type.
572 */
573 static int
574 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
575 {
576 unsigned int i;
577 int size;
578
579 switch (type->base_type) {
580 case GLSL_TYPE_UINT:
581 case GLSL_TYPE_INT:
582 case GLSL_TYPE_FLOAT:
583 case GLSL_TYPE_BOOL:
584 case GLSL_TYPE_DOUBLE:
585 if (type->is_matrix()) {
586 const glsl_type *col_type = type->column_type();
587 unsigned col_slots =
588 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
589 return type->matrix_columns * col_slots;
590 } else {
591 /* Regardless of size of vector, it gets a vec4. This is bad
592 * packing for things like floats, but otherwise arrays become a
593 * mess. Hopefully a later pass over the code can pack scalars
594 * down if appropriate.
595 */
596 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
597 }
598 case GLSL_TYPE_ARRAY:
599 assert(type->length > 0);
600 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
601 case GLSL_TYPE_STRUCT:
602 size = 0;
603 for (i = 0; i < type->length; i++) {
604 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
605 }
606 return size;
607 case GLSL_TYPE_SUBROUTINE:
608 return 1;
609
610 case GLSL_TYPE_SAMPLER:
611 /* Samplers take up no register space, since they're baked in at
612 * link time.
613 */
614 return 0;
615 case GLSL_TYPE_ATOMIC_UINT:
616 return 0;
617 case GLSL_TYPE_IMAGE:
618 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
619 case GLSL_TYPE_VOID:
620 case GLSL_TYPE_ERROR:
621 case GLSL_TYPE_INTERFACE:
622 case GLSL_TYPE_FUNCTION:
623 unreachable("not reached");
624 }
625
626 return 0;
627 }
628
629 /**
630 * Returns the minimum number of vec4 elements needed to pack a type.
631 *
632 * For simple types, it will return 1 (a single vec4); for matrices, the
 633 * number of columns; for arrays and structs, the sum of the vec4 sizes of
 634 * their elements; and for samplers and atomics, zero.
635 *
636 * This method is useful to calculate how much register space is needed to
637 * store a particular type.
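 *
 * For example, under this measure a float, a vec3 and a vec4 each take one
 * slot, a mat3 takes three (one per column), a float[4] array takes four,
 * and a dvec4 takes two.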
638 */
639 extern "C" int
640 type_size_vec4(const struct glsl_type *type)
641 {
642 return type_size_xvec4(type, true);
643 }
644
645 /**
646 * Returns the minimum number of dvec4 elements needed to pack a type.
647 *
648 * For simple types, it will return 1 (a single dvec4); for matrices, the
 649 * number of columns; for arrays and structs, the sum of the dvec4 sizes of
 650 * their elements; and for samplers and atomics, zero.
651 *
652 * This method is useful to calculate how much register space is needed to
653 * store a particular type.
654 *
655 * Measuring double-precision vertex inputs as dvec4 is required because
 656 * ARB_vertex_attrib_64bit states that they use the same number of locations
 657 * as the single-precision version. That is, two consecutive dvec4s would be
 658 * located in locations "x" and "x+1", not "x" and "x+2".
659 *
 660 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 661 * remap_vs_attrs() takes into account both the location and whether the
 662 * type fits in one or two vec4 slots.
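 *
 * For example, a dvec4 counts as a single dvec4 slot here even though
 * type_size_vec4() above counts it as two vec4 slots, and a dmat3 counts as
 * three dvec4 slots (one per column) rather than six.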
663 */
664 extern "C" int
665 type_size_dvec4(const struct glsl_type *type)
666 {
667 return type_size_xvec4(type, false);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
671 {
672 init();
673
674 this->file = VGRF;
675 this->nr = v->alloc.allocate(type_size_vec4(type));
676
677 if (type->is_array() || type->is_record()) {
678 this->swizzle = BRW_SWIZZLE_NOOP;
679 } else {
680 this->swizzle = brw_swizzle_for_size(type->vector_elements);
681 }
682
683 this->type = brw_type_for_base_type(type);
684 }
685
686 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
687 {
688 assert(size > 0);
689
690 init();
691
692 this->file = VGRF;
693 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
694
695 this->swizzle = BRW_SWIZZLE_NOOP;
696
697 this->type = brw_type_for_base_type(type);
698 }
699
700 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
701 {
702 init();
703
704 this->file = VGRF;
705 this->nr = v->alloc.allocate(type_size_vec4(type));
706
707 if (type->is_array() || type->is_record()) {
708 this->writemask = WRITEMASK_XYZW;
709 } else {
710 this->writemask = (1 << type->vector_elements) - 1;
711 }
712
713 this->type = brw_type_for_base_type(type);
714 }
715
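/* Emit a SEL with the given conditional mod.  With BRW_CONDITIONAL_GE this
 * computes max(src0, src1); with BRW_CONDITIONAL_L it computes min(src0, src1),
 * which is how the packing helpers above clamp values to [-1, 1].
 */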
716 vec4_instruction *
717 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
718 src_reg src0, src_reg src1)
719 {
720 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
721 inst->conditional_mod = conditionalmod;
722 return inst;
723 }
724
725 vec4_instruction *
726 vec4_visitor::emit_lrp(const dst_reg &dst,
727 const src_reg &x, const src_reg &y, const src_reg &a)
728 {
729 if (devinfo->gen >= 6) {
730 /* Note that the instruction's argument order is reversed from GLSL
731 * and the IR.
732 */
733 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
734 fix_3src_operand(x)));
735 } else {
736 /* Earlier generations don't support three source operations, so we
737 * need to emit x*(1-a) + y*a.
738 */
739 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
740 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
741 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
742 y_times_a.writemask = dst.writemask;
743 one_minus_a.writemask = dst.writemask;
744 x_times_one_minus_a.writemask = dst.writemask;
745
746 emit(MUL(y_times_a, y, a));
747 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
748 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
749 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
750 }
751 }
752
753 /**
754 * Emits the instructions needed to perform a pull constant load. before_block
 755 * and before_inst can be NULL, in which case the instructions will be appended
756 * to the end of the instruction list.
757 */
758 void
759 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
760 src_reg surf_index,
761 src_reg offset_reg,
762 bblock_t *before_block,
763 vec4_instruction *before_inst)
764 {
765 assert((before_inst == NULL && before_block == NULL) ||
766 (before_inst && before_block));
767
768 vec4_instruction *pull;
769
770 if (devinfo->gen >= 9) {
771 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
772 src_reg header(this, glsl_type::uvec4_type, 2);
773
774 pull = new(mem_ctx)
775 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
776 dst_reg(header));
777
778 if (before_inst)
779 emit_before(before_block, before_inst, pull);
780 else
781 emit(pull);
782
783 dst_reg index_reg = retype(offset(dst_reg(header), 1),
784 offset_reg.type);
785 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
786
787 if (before_inst)
788 emit_before(before_block, before_inst, pull);
789 else
790 emit(pull);
791
792 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
793 dst,
794 surf_index,
795 header);
796 pull->mlen = 2;
797 pull->header_size = 1;
798 } else if (devinfo->gen >= 7) {
799 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
800
801 grf_offset.type = offset_reg.type;
802
803 pull = MOV(grf_offset, offset_reg);
804
805 if (before_inst)
806 emit_before(before_block, before_inst, pull);
807 else
808 emit(pull);
809
810 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
811 dst,
812 surf_index,
813 src_reg(grf_offset));
814 pull->mlen = 1;
815 } else {
816 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
817 dst,
818 surf_index,
819 offset_reg);
820 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
821 pull->mlen = 1;
822 }
823
824 if (before_inst)
825 emit_before(before_block, before_inst, pull);
826 else
827 emit(pull);
828 }
829
830 src_reg
831 vec4_visitor::emit_uniformize(const src_reg &src)
832 {
833 const src_reg chan_index(this, glsl_type::uint_type);
834 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
835 src.type);
836
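   /* Find the index of any enabled channel, then broadcast that channel of
    * src to every channel of dst, yielding a value that is uniform across the
    * SIMD execution.  force_writemask_all keeps both instructions from being
    * affected by the current execution mask.
    */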
837 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
838 ->force_writemask_all = true;
839 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
840 ->force_writemask_all = true;
841
842 return src_reg(dst);
843 }
844
845 src_reg
846 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
847 src_reg coordinate, src_reg surface)
848 {
849 vec4_instruction *inst =
850 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
851 dst_reg(this, glsl_type::uvec4_type));
852 inst->base_mrf = 2;
853 inst->src[1] = surface;
854 inst->src[2] = surface;
855
856 int param_base;
857
858 if (devinfo->gen >= 9) {
859 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
860 vec4_instruction *header_inst = new(mem_ctx)
861 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
862 dst_reg(MRF, inst->base_mrf));
863
864 emit(header_inst);
865
866 inst->mlen = 2;
867 inst->header_size = 1;
868 param_base = inst->base_mrf + 1;
869 } else {
870 inst->mlen = 1;
871 param_base = inst->base_mrf;
872 }
873
 874 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
875 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
876 int zero_mask = 0xf & ~coord_mask;
877
878 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
879 coordinate));
880
881 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
882 brw_imm_d(0)));
883
884 emit(inst);
885 return src_reg(inst->dst);
886 }
887
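/* Returns true when the sampler index might not fit in the 4-bit sampler
 * field of the sampler message descriptor (a dynamically indexed sampler or
 * an immediate index of 16 or more), in which case the index has to be
 * conveyed through the message header.  Only Haswell and gen8+ can hit this.
 */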
888 bool
889 vec4_visitor::is_high_sampler(src_reg sampler)
890 {
891 if (devinfo->gen < 8 && !devinfo->is_haswell)
892 return false;
893
894 return sampler.file != IMM || sampler.ud >= 16;
895 }
896
897 void
898 vec4_visitor::emit_texture(ir_texture_opcode op,
899 dst_reg dest,
900 const glsl_type *dest_type,
901 src_reg coordinate,
902 int coord_components,
903 src_reg shadow_comparitor,
904 src_reg lod, src_reg lod2,
905 src_reg sample_index,
906 uint32_t constant_offset,
907 src_reg offset_value,
908 src_reg mcs,
909 bool is_cube_array,
910 uint32_t surface,
911 src_reg surface_reg,
912 uint32_t sampler,
913 src_reg sampler_reg)
914 {
915 /* The sampler can only meaningfully compute LOD for fragment shader
916 * messages. For all other stages, we change the opcode to TXL and hardcode
917 * the LOD to 0.
918 *
919 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
920 * valid LOD argument.
921 */
922 if (op == ir_tex || op == ir_query_levels) {
923 assert(lod.file == BAD_FILE);
924 lod = brw_imm_f(0.0f);
925 }
926
927 enum opcode opcode;
928 switch (op) {
929 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
930 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
932 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
933 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
934 SHADER_OPCODE_TXF_CMS); break;
935 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
936 case ir_tg4: opcode = offset_value.file != BAD_FILE
937 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
938 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
939 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
940 case ir_txb:
941 unreachable("TXB is not valid for vertex shaders.");
942 case ir_lod:
943 unreachable("LOD is not valid for vertex shaders.");
944 case ir_samples_identical: {
945 /* There are some challenges implementing this for vec4, and it seems
 946 * unlikely to be used anyway. For now, just always return false.
947 */
948 emit(MOV(dest, brw_imm_ud(0u)));
949 return;
950 }
951 default:
952 unreachable("Unrecognized tex op");
953 }
954
955 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
956
957 inst->offset = constant_offset;
958
959 /* The message header is necessary for:
960 * - Gen4 (always)
961 * - Gen9+ for selecting SIMD4x2
962 * - Texel offsets
963 * - Gather channel selection
964 * - Sampler indices too large to fit in a 4-bit value.
965 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
966 */
967 inst->header_size =
968 (devinfo->gen < 5 || devinfo->gen >= 9 ||
969 inst->offset != 0 || op == ir_tg4 ||
970 op == ir_texture_samples ||
971 is_high_sampler(sampler_reg)) ? 1 : 0;
972 inst->base_mrf = 2;
973 inst->mlen = inst->header_size;
974 inst->dst.writemask = WRITEMASK_XYZW;
975 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
976
977 inst->src[1] = surface_reg;
978 inst->src[2] = sampler_reg;
979
980 /* MRF for the first parameter */
981 int param_base = inst->base_mrf + inst->header_size;
982
983 if (op == ir_txs || op == ir_query_levels) {
984 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
985 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
986 inst->mlen++;
987 } else if (op == ir_texture_samples) {
988 inst->dst.writemask = WRITEMASK_X;
989 } else {
990 /* Load the coordinate */
991 /* FINISHME: gl_clamp_mask and saturate */
992 int coord_mask = (1 << coord_components) - 1;
993 int zero_mask = 0xf & ~coord_mask;
994
995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
996 coordinate));
997 inst->mlen++;
998
999 if (zero_mask != 0) {
1000 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001 brw_imm_d(0)));
1002 }
1003 /* Load the shadow comparitor */
1004 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
1006 WRITEMASK_X),
1007 shadow_comparitor));
1008 inst->mlen++;
1009 }
1010
1011 /* Load the LOD info */
1012 if (op == ir_tex || op == ir_txl) {
1013 int mrf, writemask;
1014 if (devinfo->gen >= 5) {
1015 mrf = param_base + 1;
1016 if (shadow_comparitor.file != BAD_FILE) {
1017 writemask = WRITEMASK_Y;
1018 /* mlen already incremented */
1019 } else {
1020 writemask = WRITEMASK_X;
1021 inst->mlen++;
1022 }
1023 } else /* devinfo->gen == 4 */ {
1024 mrf = param_base;
1025 writemask = WRITEMASK_W;
1026 }
1027 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028 } else if (op == ir_txf) {
1029 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030 } else if (op == ir_txf_ms) {
1031 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032 sample_index));
1033 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034 /* MCS data is stored in the first two channels of `mcs`, but we
1035 * need to get it into the .y and .z channels of the second vec4
1036 * of params.
1037 */
1038 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039 emit(MOV(dst_reg(MRF, param_base + 1,
1040 glsl_type::uint_type, WRITEMASK_YZ),
1041 mcs));
1042 } else if (devinfo->gen >= 7) {
1043 /* MCS data is in the first channel of `mcs`, but we need to get it into
1044 * the .y channel of the second vec4 of params, so replicate .x across
1045 * the whole vec4 and then mask off everything except .y
1046 */
1047 mcs.swizzle = BRW_SWIZZLE_XXXX;
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049 mcs));
1050 }
1051 inst->mlen++;
1052 } else if (op == ir_txd) {
1053 const brw_reg_type type = lod.type;
1054
1055 if (devinfo->gen >= 5) {
1056 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060 inst->mlen++;
1061
1062 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1063 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067 inst->mlen++;
1068
1069 if (shadow_comparitor.file != BAD_FILE) {
1070 emit(MOV(dst_reg(MRF, param_base + 2,
1071 shadow_comparitor.type, WRITEMASK_Z),
1072 shadow_comparitor));
1073 }
1074 }
1075 } else /* devinfo->gen == 4 */ {
1076 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078 inst->mlen += 2;
1079 }
1080 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081 if (shadow_comparitor.file != BAD_FILE) {
1082 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1083 shadow_comparitor));
1084 }
1085
1086 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087 offset_value));
1088 inst->mlen++;
1089 }
1090 }
1091
1092 emit(inst);
1093
1094 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095 * spec requires layers.
1096 */
1097 if (op == ir_txs) {
1098 if (is_cube_array) {
1099 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1100 writemask(inst->dst, WRITEMASK_Z),
1101 src_reg(inst->dst), brw_imm_d(6));
1102 } else if (devinfo->gen < 7) {
1103 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1104 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1105 src_reg(inst->dst), brw_imm_d(1));
1106 }
1107 }
1108
1109 if (devinfo->gen == 6 && op == ir_tg4) {
1110 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1111 }
1112
1113 if (op == ir_query_levels) {
1114 /* # levels is in .w */
1115 src_reg swizzled(dest);
1116 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1117 SWIZZLE_W, SWIZZLE_W);
1118 emit(MOV(dest, swizzled));
1119 }
1120 }
1121
1122 /**
1123 * Apply workarounds for Gen6 gather with UINT/SINT
1124 */
1125 void
1126 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1127 {
1128 if (!wa)
1129 return;
1130
1131 int width = (wa & WA_8BIT) ? 8 : 16;
1132 dst_reg dst_f = dst;
1133 dst_f.type = BRW_REGISTER_TYPE_F;
1134
1135 /* Convert from UNORM to UINT */
1136 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1137 emit(MOV(dst, src_reg(dst_f)));
1138
1139 if (wa & WA_SIGN) {
1140 /* Reinterpret the UINT value as a signed INT value by
1141 * shifting the sign bit into place, then shifting back
1142 * preserving sign.
1143 */
1144 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1145 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1146 }
1147 }
1148
1149 void
1150 vec4_visitor::gs_emit_vertex(int stream_id)
1151 {
1152 unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::gs_end_primitive()
1157 {
1158 unreachable("not reached");
1159 }
1160
1161 void
1162 vec4_visitor::emit_ndc_computation()
1163 {
1164 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1165 return;
1166
1167 /* Get the position */
1168 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1169
1170 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1171 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1172 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1173
1174 current_annotation = "NDC";
1175 dst_reg ndc_w = ndc;
1176 ndc_w.writemask = WRITEMASK_W;
1177 src_reg pos_w = pos;
1178 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1179 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1180
1181 dst_reg ndc_xyz = ndc;
1182 ndc_xyz.writemask = WRITEMASK_XYZ;
1183
1184 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1185 }
1186
1187 void
1188 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1189 {
1190 if (devinfo->gen < 6 &&
1191 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1192 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1193 devinfo->has_negative_rhw_bug)) {
1194 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1195 dst_reg header1_w = header1;
1196 header1_w.writemask = WRITEMASK_W;
1197
1198 emit(MOV(header1, brw_imm_ud(0u)));
1199
1200 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1201 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1202
1203 current_annotation = "Point size";
1204 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1205 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1206 }
1207
1208 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1209 current_annotation = "Clipping flags";
1210 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1211 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1212
1213 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1214 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1215 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1216
1217 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1218 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1219 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1220 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1221 }
1222
1223 /* i965 clipping workaround:
1224 * 1) Test for -ve rhw
1225 * 2) If set,
1226 * set ndc = (0,0,0,0)
1227 * set ucp[6] = 1
1228 *
1229 * Later, clipping will detect ucp[6] and ensure the primitive is
1230 * clipped against all fixed planes.
1231 */
1232 if (devinfo->has_negative_rhw_bug &&
1233 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1234 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1235 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1236 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1237 vec4_instruction *inst;
1238 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1241 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1242 inst->predicate = BRW_PREDICATE_NORMAL;
1243 }
1244
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1246 } else if (devinfo->gen < 6) {
1247 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1248 } else {
1249 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1250 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1251 dst_reg reg_w = reg;
1252 reg_w.writemask = WRITEMASK_W;
1253 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1254 reg_as_src.type = reg_w.type;
1255 reg_as_src.swizzle = brw_swizzle_for_size(1);
1256 emit(MOV(reg_w, reg_as_src));
1257 }
1258 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1259 dst_reg reg_y = reg;
1260 reg_y.writemask = WRITEMASK_Y;
1261 reg_y.type = BRW_REGISTER_TYPE_D;
1262 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1263 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1264 }
1265 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1266 dst_reg reg_z = reg;
1267 reg_z.writemask = WRITEMASK_Z;
1268 reg_z.type = BRW_REGISTER_TYPE_D;
1269 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1270 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1271 }
1272 }
1273 }
1274
1275 vec4_instruction *
1276 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1277 {
1278 assert(varying < VARYING_SLOT_MAX);
1279 assert(output_reg[varying].type == reg.type);
1280 current_annotation = output_reg_annotation[varying];
1281 if (output_reg[varying].file != BAD_FILE)
1282 return emit(MOV(reg, src_reg(output_reg[varying])));
1283 else
1284 return NULL;
1285 }
1286
1287 void
1288 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1289 {
1290 reg.type = BRW_REGISTER_TYPE_F;
1291 output_reg[varying].type = reg.type;
1292
1293 switch (varying) {
1294 case VARYING_SLOT_PSIZ:
1295 {
1296 /* PSIZ is always in slot 0, and is coupled with other flags. */
1297 current_annotation = "indices, point width, clip flags";
1298 emit_psiz_and_flags(reg);
1299 break;
1300 }
1301 case BRW_VARYING_SLOT_NDC:
1302 current_annotation = "NDC";
1303 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1304 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1305 break;
1306 case VARYING_SLOT_POS:
1307 current_annotation = "gl_Position";
1308 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1309 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1310 break;
1311 case VARYING_SLOT_EDGE:
1312 /* This is present when doing unfilled polygons. We're supposed to copy
1313 * the edge flag from the user-provided vertex array
1314 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1315 * of that attribute (starts as 1.0f). This is then used in clipping to
1316 * determine which edges should be drawn as wireframe.
1317 */
1318 current_annotation = "edge flag";
1319 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1320 glsl_type::float_type, WRITEMASK_XYZW))));
1321 break;
1322 case BRW_VARYING_SLOT_PAD:
1323 /* No need to write to this slot */
1324 break;
1325 default:
1326 emit_generic_urb_slot(reg, varying);
1327 break;
1328 }
1329 }
1330
1331 static int
1332 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1333 {
1334 if (devinfo->gen >= 6) {
1335 /* URB data written (does not include the message header reg) must
1336 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1337 * section 5.4.3.2.2: URB_INTERLEAVED.
1338 *
1339 * URB entries are allocated on a multiple of 1024 bits, so an
1340 * extra 128 bits written here to make the end align to 256 is
1341 * no problem.
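       * Note that mlen counts the message header register as well, so the
       * payload is (mlen - 1) registers; bumping an even mlen to the next odd
       * value is what makes the payload an even number of registers.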
1342 */
1343 if ((mlen % 2) != 1)
1344 mlen++;
1345 }
1346
1347 return mlen;
1348 }
1349
1350
1351 /**
1352 * Generates the VUE payload plus the necessary URB write instructions to
1353 * output it.
1354 *
1355 * The VUE layout is documented in Volume 2a.
1356 */
1357 void
1358 vec4_visitor::emit_vertex()
1359 {
1360 /* MRF 0 is reserved for the debugger, so start with message header
1361 * in MRF 1.
1362 */
1363 int base_mrf = 1;
1364 int mrf = base_mrf;
1365 /* In the process of generating our URB write message contents, we
1366 * may need to unspill a register or load from an array. Those
1367 * reads would use MRFs 14-15.
1368 */
1369 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1370
1371 /* The following assertion verifies that max_usable_mrf causes an
1372 * even-numbered amount of URB write data, which will meet gen6's
1373 * requirements for length alignment.
1374 */
1375 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1376
1377 /* First mrf is the g0-based message header containing URB handles and
1378 * such.
1379 */
1380 emit_urb_write_header(mrf++);
1381
1382 if (devinfo->gen < 6) {
1383 emit_ndc_computation();
1384 }
1385
1386 /* We may need to split this up into several URB writes, so do them in a
1387 * loop.
1388 */
1389 int slot = 0;
1390 bool complete = false;
1391 do {
1392 /* URB offset is in URB row increments, and each of our MRFs is half of
1393 * one of those, since we're doing interleaved writes.
1394 */
1395 int offset = slot / 2;
1396
1397 mrf = base_mrf + 1;
1398 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1399 emit_urb_slot(dst_reg(MRF, mrf++),
1400 prog_data->vue_map.slot_to_varying[slot]);
1401
1402 /* If this was max_usable_mrf, we can't fit anything more into this
1403 * URB WRITE. Same thing if we reached the maximum length available.
1404 */
1405 if (mrf > max_usable_mrf ||
1406 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1407 slot++;
1408 break;
1409 }
1410 }
1411
1412 complete = slot >= prog_data->vue_map.num_slots;
1413 current_annotation = "URB write";
1414 vec4_instruction *inst = emit_urb_write_opcode(complete);
1415 inst->base_mrf = base_mrf;
1416 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1417 inst->offset += offset;
1418 } while(!complete);
1419 }
1420
1421
1422 src_reg
1423 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1424 src_reg *reladdr, int reg_offset)
1425 {
1426 /* Because we store the values to scratch interleaved like our
1427 * vertex data, we need to scale the vec4 index by 2.
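    * (For example, vec4 offset 3 becomes row offset 6 on gen6+; pre-gen6 the
    * message header wants byte offsets, so the same access becomes
    * 3 * 2 * 16 = 96.)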
1428 */
1429 int message_header_scale = 2;
1430
1431 /* Pre-gen6, the message header uses byte offsets instead of vec4
1432 * (16-byte) offset units.
1433 */
1434 if (devinfo->gen < 6)
1435 message_header_scale *= 16;
1436
1437 if (reladdr) {
1438 src_reg index = src_reg(this, glsl_type::int_type);
1439
1440 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1441 brw_imm_d(reg_offset)));
1442 emit_before(block, inst, MUL(dst_reg(index), index,
1443 brw_imm_d(message_header_scale)));
1444
1445 return index;
1446 } else {
1447 return brw_imm_d(reg_offset * message_header_scale);
1448 }
1449 }
1450
1451 /**
1452 * Emits an instruction before @inst to load the value named by @orig_src
1453 * from scratch space at @base_offset to @temp.
1454 *
1455 * @base_offset is measured in 32-byte units (the size of a register).
1456 */
1457 void
1458 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1459 dst_reg temp, src_reg orig_src,
1460 int base_offset)
1461 {
1462 int reg_offset = base_offset + orig_src.reg_offset;
1463 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1464 reg_offset);
1465
1466 emit_before(block, inst, SCRATCH_READ(temp, index));
1467 }
1468
1469 /**
1470 * Emits an instruction after @inst to store the value to be written
1471 * to @orig_dst to scratch space at @base_offset, from @temp.
1472 *
1473 * @base_offset is measured in 32-byte units (the size of a register).
1474 */
1475 void
1476 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1477 int base_offset)
1478 {
1479 int reg_offset = base_offset + inst->dst.reg_offset;
1480 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1481 reg_offset);
1482
1483 /* Create a temporary register to store *inst's result in.
1484 *
1485 * We have to be careful in MOVing from our temporary result register in
1486 * the scratch write. If we swizzle from channels of the temporary that
1487 * weren't initialized, it will confuse live interval analysis, which will
1488 * make spilling fail to make progress.
1489 */
1490 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1491 inst->dst.type),
1492 brw_swizzle_for_mask(inst->dst.writemask));
1493 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1494 inst->dst.writemask));
1495 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1496 if (inst->opcode != BRW_OPCODE_SEL)
1497 write->predicate = inst->predicate;
1498 write->ir = inst->ir;
1499 write->annotation = inst->annotation;
1500 inst->insert_after(block, write);
1501
1502 inst->dst.file = temp.file;
1503 inst->dst.nr = temp.nr;
1504 inst->dst.reg_offset = temp.reg_offset;
1505 inst->dst.reladdr = NULL;
1506 }
1507
1508 /**
1509 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1510 * adds the scratch read(s) before \p inst. The function also checks for
1511 * recursive reladdr scratch accesses, issuing the corresponding scratch
1512 * loads and rewriting reladdr references accordingly.
1513 *
1514 * \return \p src if it did not require a scratch load, otherwise, the
1515 * register holding the result of the scratch load that the caller should
1516 * use to rewrite src.
1517 */
1518 src_reg
1519 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1520 vec4_instruction *inst, src_reg src)
1521 {
1522 /* Resolve recursive reladdr scratch access by calling ourselves
1523 * with src.reladdr
1524 */
1525 if (src.reladdr)
1526 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1527 *src.reladdr);
1528
1529 /* Now handle scratch access on src */
1530 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1531 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1532 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1533 src.nr = temp.nr;
1534 src.reg_offset = temp.reg_offset;
1535 src.reladdr = NULL;
1536 }
1537
1538 return src;
1539 }
1540
1541 /**
1542 * We can't generally support array access in GRF space, because a
1543 * single instruction's destination can only span 2 contiguous
1544 * registers. So, we send all GRF arrays that get variable index
1545 * access to scratch space.
1546 */
1547 void
1548 vec4_visitor::move_grf_array_access_to_scratch()
1549 {
1550 int scratch_loc[this->alloc.count];
1551 memset(scratch_loc, -1, sizeof(scratch_loc));
1552
1553 /* First, calculate the set of virtual GRFs that need to be punted
1554 * to scratch due to having any array access on them, and where in
1555 * scratch.
1556 */
1557 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1558 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1559 if (scratch_loc[inst->dst.nr] == -1) {
1560 scratch_loc[inst->dst.nr] = last_scratch;
1561 last_scratch += this->alloc.sizes[inst->dst.nr];
1562 }
1563
1564 for (src_reg *iter = inst->dst.reladdr;
1565 iter->reladdr;
1566 iter = iter->reladdr) {
1567 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1568 scratch_loc[iter->nr] = last_scratch;
1569 last_scratch += this->alloc.sizes[iter->nr];
1570 }
1571 }
1572 }
1573
1574 for (int i = 0 ; i < 3; i++) {
1575 for (src_reg *iter = &inst->src[i];
1576 iter->reladdr;
1577 iter = iter->reladdr) {
1578 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1579 scratch_loc[iter->nr] = last_scratch;
1580 last_scratch += this->alloc.sizes[iter->nr];
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Now, for anything that will be accessed through scratch, rewrite
1587 * it to load/store. Note that this is a _safe list walk, because
1588 * we may generate a new scratch_write instruction after the one
1589 * we're processing.
1590 */
1591 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1592 /* Set up the annotation tracking for new generated instructions. */
1593 base_ir = inst->ir;
1594 current_annotation = inst->annotation;
1595
1596 /* First handle scratch access on the dst. Notice we have to handle
1597 * the case where the dst's reladdr also points to scratch space.
1598 */
1599 if (inst->dst.reladdr)
1600 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1601 *inst->dst.reladdr);
1602
1603 /* Now that we have handled any (possibly recursive) reladdr scratch
1604 * accesses for dst we can safely do the scratch write for dst itself
1605 */
1606 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1607 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1608
1609 /* Now handle scratch access on any src. In this case, since inst->src[i]
1610 * already is a src_reg, we can just call emit_resolve_reladdr with
1611 * inst->src[i] and it will take care of handling scratch loads for
1612 * both src and src.reladdr (recursively).
1613 */
1614 for (int i = 0 ; i < 3; i++) {
1615 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1616 inst->src[i]);
1617 }
1618 }
1619 }
1620
1621 /**
1622 * Emits an instruction before @inst to load the value named by @orig_src
1623 * from the pull constant buffer (surface) at @base_offset to @temp.
1624 */
1625 void
1626 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1627 dst_reg temp, src_reg orig_src,
1628 int base_offset, src_reg indirect)
1629 {
1630 int reg_offset = base_offset + orig_src.reg_offset;
1631 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1632
1633 src_reg offset;
1634 if (indirect.file != BAD_FILE) {
1635 offset = src_reg(this, glsl_type::uint_type);
1636
1637 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1638 brw_imm_ud(reg_offset * 16)));
1639 } else if (devinfo->gen >= 8) {
1640 /* Store the offset in a GRF so we can send-from-GRF. */
1641 offset = src_reg(this, glsl_type::uint_type);
1642 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1643 } else {
1644 offset = brw_imm_d(reg_offset * 16);
1645 }
1646
1647 emit_pull_constant_load_reg(temp,
1648 brw_imm_ud(index),
1649 offset,
1650 block, inst);
1651
1652 brw_mark_surface_used(&prog_data->base, index);
1653 }
1654
1655 /**
1656 * Implements array access of uniforms by inserting a
1657 * PULL_CONSTANT_LOAD instruction.
1658 *
1659 * Unlike temporary GRF array access (where we don't support it due to
1660 * the difficulty of doing relative addressing on instruction
1661 * destinations), we could potentially do array access of uniforms
1662 * that were loaded in GRF space as push constants. In real-world
1663 * usage we've seen, though, the arrays being used are always larger
1664 * than we could load as push constants, so just always move all
1665 * uniform array access out to a pull constant buffer.
1666 */
1667 void
1668 vec4_visitor::move_uniform_array_access_to_pull_constants()
1669 {
1670 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1671 * everything has to be pushed regardless.
1672 */
1673 if (stage_prog_data->pull_param == NULL) {
1674 split_uniform_registers();
1675 return;
1676 }
1677
1678 int pull_constant_loc[this->uniforms];
1679 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1680
1681 /* First, walk through the instructions and determine which things need to
1682 * be pulled. We mark something as needing to be pulled by setting
1683 * pull_constant_loc to 0.
1684 */
1685 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1686 /* We only care about MOV_INDIRECT of a uniform */
1687 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1688 inst->src[0].file != UNIFORM)
1689 continue;
1690
1691 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1692
1693 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1694 pull_constant_loc[uniform_nr + j] = 0;
1695 }
1696
1697 /* Next, we walk the list of uniforms and assign real pull constant
1698 * locations and set their corresponding entries in pull_param.
1699 */
1700 for (int j = 0; j < this->uniforms; j++) {
1701 if (pull_constant_loc[j] < 0)
1702 continue;
1703
1704 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1705
1706 for (int i = 0; i < 4; i++) {
1707 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1708 = stage_prog_data->param[j * 4 + i];
1709 }
1710 }
1711
1712 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1713 * instructions to actual uniform pulls.
1714 */
1715 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1716 /* We only care about MOV_INDIRECT of a uniform */
1717 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1718 inst->src[0].file != UNIFORM)
1719 continue;
1720
1721 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1722
1723 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1724
1725 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1726 pull_constant_loc[uniform_nr], inst->src[1]);
1727 inst->remove(block);
1728 }
1729
1730 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1731 * no need to track them as larger-than-vec4 objects. This will be
1732 * relied on in cutting out unused uniform vectors from push
1733 * constants.
1734 */
1735 split_uniform_registers();
1736 }
1737
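/* Used by CMP and the gen6 IF-with-comparison above: if an unsigned source
 * has the negate modifier set, materialize the negation up front with an
 * explicit MOV into a temporary instead of relying on the source modifier.
 */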
1738 void
1739 vec4_visitor::resolve_ud_negate(src_reg *reg)
1740 {
1741 if (reg->type != BRW_REGISTER_TYPE_UD ||
1742 !reg->negate)
1743 return;
1744
1745 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1746 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1747 *reg = temp;
1748 }
1749
1750 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1751 void *log_data,
1752 const struct brw_sampler_prog_key_data *key_tex,
1753 struct brw_vue_prog_data *prog_data,
1754 const nir_shader *shader,
1755 void *mem_ctx,
1756 bool no_spills,
1757 int shader_time_index)
1758 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1759 key_tex(key_tex),
1760 prog_data(prog_data),
1761 fail_msg(NULL),
1762 first_non_payload_grf(0),
1763 need_all_constants_in_pull_buffer(false),
1764 no_spills(no_spills),
1765 shader_time_index(shader_time_index),
1766 last_scratch(0)
1767 {
1768 this->failed = false;
1769
1770 this->base_ir = NULL;
1771 this->current_annotation = NULL;
1772 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1773
1774 this->virtual_grf_start = NULL;
1775 this->virtual_grf_end = NULL;
1776 this->live_intervals = NULL;
1777
1778 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1779
1780 this->uniforms = 0;
1781 }
1782
1783 vec4_visitor::~vec4_visitor()
1784 {
1785 }
1786
1787
1788 void
1789 vec4_visitor::fail(const char *format, ...)
1790 {
1791 va_list va;
1792 char *msg;
1793
1794 if (failed)
1795 return;
1796
1797 failed = true;
1798
1799 va_start(va, format);
1800 msg = ralloc_vasprintf(mem_ctx, format, va);
1801 va_end(va);
1802 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1803
1804 this->fail_msg = msg;
1805
1806 if (debug_enabled) {
1807 fprintf(stderr, "%s", msg);
1808 }
1809 }
1810
1811 } /* namespace brw */