src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp (mesa.git, 'origin/master' merged into the vulkan branch)
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
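/* Note that these ALU helpers only construct a vec4_instruction with
 * new(mem_ctx); they do not add it to the instruction stream. Callers are
 * expected to wrap them in emit(), e.g.
 *
 *    emit(MOV(dst, src));
 *    emit(ADD(dst, src0, src1));
 *
 * which is the pattern used throughout the rest of this file.
 */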
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
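/* Typical usage pairs a CMP that updates the flag register with a predicated
 * instruction that consumes it, as emit_psiz_and_flags() does later in this
 * file. A minimal sketch of that flow, where a, b, src0, src1 and dst stand
 * for whatever registers the caller is comparing and merging:
 *
 *    emit(CMP(dst_null_f(), a, b, BRW_CONDITIONAL_L));
 *    vec4_instruction *inst = emit(OR(dst, src0, src1));
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * The null destination discards the per-channel comparison bits and keeps
 * only the packed flag result.
 */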
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
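/* A worked example of the sequence above, assuming the usual IEEE half-float
 * encoding: for src0 = (1.0f, -2.0f), half(1.0) = 0x3c00 and half(-2.0) =
 * 0xc000, so after the F32TO16 tmp looks like |.|.|0x0000c000|0x00003c00|.
 * The SHL of tmp.yyyy by 16 gives dst = 0xc0000000, and the final OR with
 * tmp.xxxx yields dst = 0xc0003c00, which is packHalf2x16(vec2(1.0, -2.0)).
 */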
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
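/* This is the inverse of the packing example above: for src0 = 0xc0003c00,
 * the AND leaves tmp.x = 0x00003c00 and the SHR leaves tmp.y = 0x0000c000,
 * so the F16TO32 produces dst.xy = (1.0f, -2.0f), matching unpackHalf2x16().
 */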
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
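/* The immediate above leans on the 8-bit "vector float" (VF) encoding, which
 * (assuming the usual 3-bit exponent with bias 3 and 4-bit mantissa) decodes
 * 0x00, 0x60, 0x70 and 0x78 to 0.0, 8.0, 16.0 and 24.0; the type-converting
 * MOV then yields the shift vector <0, 8, 16, 24>. As a worked example,
 * src0 = 0x80ff4000 shifts to <0x80ff4000, 0x0080ff40, 0x000080ff,
 * 0x00000080>, the low byte of each channel is <0x00, 0x40, 0xff, 0x80>, and
 * the MUL by 1/255 gives roughly (0.0, 0.251, 1.0, 0.502), i.e.
 * unpackUnorm4x8().
 */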
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573 * number of columns; for array and struct, the sum of the vec4_size of
574 * each of its elements; and for sampler and atomic, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
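/* As a worked example, a uniform declared as
 *
 *    struct { mat4 m; float f[3]; vec2 uv; };
 *
 * takes 4 (matrix columns) + 3 (one vec4 per array element) + 1 = 8 vec4
 * slots, even though only part of some slots is actually used.
 */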
631
632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
633 {
634 init();
635
636 this->file = VGRF;
637 this->nr = v->alloc.allocate(type_size_vec4(type));
638
639 if (type->is_array() || type->is_record()) {
640 this->swizzle = BRW_SWIZZLE_NOOP;
641 } else {
642 this->swizzle = brw_swizzle_for_size(type->vector_elements);
643 }
644
645 this->type = brw_type_for_base_type(type);
646 }
647
648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
649 {
650 assert(size > 0);
651
652 init();
653
654 this->file = VGRF;
655 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
656
657 this->swizzle = BRW_SWIZZLE_NOOP;
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
663 {
664 init();
665
666 this->file = VGRF;
667 this->nr = v->alloc.allocate(type_size_vec4(type));
668
669 if (type->is_array() || type->is_record()) {
670 this->writemask = WRITEMASK_XYZW;
671 } else {
672 this->writemask = (1 << type->vector_elements) - 1;
673 }
674
675 this->type = brw_type_for_base_type(type);
676 }
677
678 vec4_instruction *
679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
680 src_reg src0, src_reg src1)
681 {
682 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
683 inst->conditional_mod = conditionalmod;
684 return inst;
685 }
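/* SEL with a GE conditional picks the larger operand (a max) and SEL with L
 * picks the smaller (a min), so a clamp is simply the two chained, as
 * emit_pack_snorm_4x8() does above to clamp src0 to [-1.0, 1.0]:
 *
 *    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 *    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 */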
686
687 vec4_instruction *
688 vec4_visitor::emit_lrp(const dst_reg &dst,
689 const src_reg &x, const src_reg &y, const src_reg &a)
690 {
691 if (devinfo->gen >= 6) {
692 /* Note that the instruction's argument order is reversed from GLSL
693 * and the IR.
694 */
695 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
696 fix_3src_operand(x)));
697 } else {
698 /* Earlier generations don't support three source operations, so we
699 * need to emit x*(1-a) + y*a.
700 */
701 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
702 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
703 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
704 y_times_a.writemask = dst.writemask;
705 one_minus_a.writemask = dst.writemask;
706 x_times_one_minus_a.writemask = dst.writemask;
707
708 emit(MUL(y_times_a, y, a));
709 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
710 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
711 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
712 }
713 }
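/* Concretely, GLSL's mix(x, y, a) = x*(1-a) + y*a becomes LRP(dst, a, y, x)
 * on gen6+ because the three-source LRP computes src0*src1 + (1-src0)*src2,
 * while the pre-gen6 branch above expands the same expression as a
 * MUL/ADD/MUL/ADD sequence.
 */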
714
715 /**
716 * Emits the instructions needed to perform a pull constant load. before_block
717 * and before_inst can be NULL, in which case the instruction will be appended
718 * to the end of the instruction list.
719 */
720 void
721 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
722 src_reg surf_index,
723 src_reg offset_reg,
724 bblock_t *before_block,
725 vec4_instruction *before_inst)
726 {
727 assert((before_inst == NULL && before_block == NULL) ||
728 (before_inst && before_block));
729
730 vec4_instruction *pull;
731
732 if (devinfo->gen >= 9) {
733 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
734 src_reg header(this, glsl_type::uvec4_type, 2);
735
736 pull = new(mem_ctx)
737 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
738 dst_reg(header));
739
740 if (before_inst)
741 emit_before(before_block, before_inst, pull);
742 else
743 emit(pull);
744
745 dst_reg index_reg = retype(offset(dst_reg(header), 1),
746 offset_reg.type);
747 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
748
749 if (before_inst)
750 emit_before(before_block, before_inst, pull);
751 else
752 emit(pull);
753
754 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
755 dst,
756 surf_index,
757 header);
758 pull->mlen = 2;
759 pull->header_size = 1;
760 } else if (devinfo->gen >= 7) {
761 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
762
763 grf_offset.type = offset_reg.type;
764
765 pull = MOV(grf_offset, offset_reg);
766
767 if (before_inst)
768 emit_before(before_block, before_inst, pull);
769 else
770 emit(pull);
771
772 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
773 dst,
774 surf_index,
775 src_reg(grf_offset));
776 pull->mlen = 1;
777 } else {
778 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
779 dst,
780 surf_index,
781 offset_reg);
782 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
783 pull->mlen = 1;
784 }
785
786 if (before_inst)
787 emit_before(before_block, before_inst, pull);
788 else
789 emit(pull);
790 }
791
792 src_reg
793 vec4_visitor::emit_uniformize(const src_reg &src)
794 {
795 const src_reg chan_index(this, glsl_type::uint_type);
796 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
797 src.type);
798
799 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
800 ->force_writemask_all = true;
801 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
802 ->force_writemask_all = true;
803
804 return src_reg(dst);
805 }
806
807 src_reg
808 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
809 src_reg coordinate, src_reg surface)
810 {
811 vec4_instruction *inst =
812 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
813 dst_reg(this, glsl_type::uvec4_type));
814 inst->base_mrf = 2;
815 inst->src[1] = surface;
816 inst->src[2] = surface;
817
818 int param_base;
819
820 if (devinfo->gen >= 9) {
821 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
822 vec4_instruction *header_inst = new(mem_ctx)
823 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
824 dst_reg(MRF, inst->base_mrf));
825
826 emit(header_inst);
827
828 inst->mlen = 2;
829 inst->header_size = 1;
830 param_base = inst->base_mrf + 1;
831 } else {
832 inst->mlen = 1;
833 param_base = inst->base_mrf;
834 }
835
836 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
837 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
838 int zero_mask = 0xf & ~coord_mask;
839
840 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
841 coordinate));
842
843 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
844 brw_imm_d(0)));
845
846 emit(inst);
847 return src_reg(inst->dst);
848 }
849
850 bool
851 vec4_visitor::is_high_sampler(src_reg sampler)
852 {
853 if (devinfo->gen < 8 && !devinfo->is_haswell)
854 return false;
855
856 return sampler.file != IMM || sampler.ud >= 16;
857 }
858
859 void
860 vec4_visitor::emit_texture(ir_texture_opcode op,
861 dst_reg dest,
862 const glsl_type *dest_type,
863 src_reg coordinate,
864 int coord_components,
865 src_reg shadow_comparitor,
866 src_reg lod, src_reg lod2,
867 src_reg sample_index,
868 uint32_t constant_offset,
869 src_reg offset_value,
870 src_reg mcs,
871 bool is_cube_array,
872 uint32_t surface,
873 src_reg surface_reg,
874 uint32_t sampler,
875 src_reg sampler_reg)
876 {
877 /* The sampler can only meaningfully compute LOD for fragment shader
878 * messages. For all other stages, we change the opcode to TXL and hardcode
879 * the LOD to 0.
880 *
881 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
882 * valid LOD argument.
883 */
884 if (op == ir_tex || op == ir_query_levels) {
885 assert(lod.file == BAD_FILE);
886 lod = brw_imm_f(0.0f);
887 }
888
889 enum opcode opcode;
890 switch (op) {
891 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
892 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
893 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
894 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
895 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
896 SHADER_OPCODE_TXF_CMS); break;
897 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
898 case ir_tg4: opcode = offset_value.file != BAD_FILE
899 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
900 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
901 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
902 case ir_txb:
903 unreachable("TXB is not valid for vertex shaders.");
904 case ir_lod:
905 unreachable("LOD is not valid for vertex shaders.");
906 case ir_samples_identical: {
907 /* There are some challenges implementing this for vec4, and it seems
908 * unlikely to be used anyway. For now, just always return false.
909 */
910 emit(MOV(dest, brw_imm_ud(0u)));
911 return;
912 }
913 default:
914 unreachable("Unrecognized tex op");
915 }
916
917 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
918
919 inst->offset = constant_offset;
920
921 /* The message header is necessary for:
922 * - Gen4 (always)
923 * - Gen9+ for selecting SIMD4x2
924 * - Texel offsets
925 * - Gather channel selection
926 * - Sampler indices too large to fit in a 4-bit value.
927 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
928 */
929 inst->header_size =
930 (devinfo->gen < 5 || devinfo->gen >= 9 ||
931 inst->offset != 0 || op == ir_tg4 ||
932 op == ir_texture_samples ||
933 is_high_sampler(sampler_reg)) ? 1 : 0;
934 inst->base_mrf = 2;
935 inst->mlen = inst->header_size;
936 inst->dst.writemask = WRITEMASK_XYZW;
937 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
938
939 inst->src[1] = surface_reg;
940 inst->src[2] = sampler_reg;
941
942 /* MRF for the first parameter */
943 int param_base = inst->base_mrf + inst->header_size;
944
945 if (op == ir_txs || op == ir_query_levels) {
946 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
947 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
948 inst->mlen++;
949 } else if (op == ir_texture_samples) {
950 inst->dst.writemask = WRITEMASK_X;
951 } else {
952 /* Load the coordinate */
953 /* FINISHME: gl_clamp_mask and saturate */
954 int coord_mask = (1 << coord_components) - 1;
955 int zero_mask = 0xf & ~coord_mask;
956
957 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
958 coordinate));
959 inst->mlen++;
960
961 if (zero_mask != 0) {
962 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
963 brw_imm_d(0)));
964 }
965 /* Load the shadow comparitor */
966 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
967 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
968 WRITEMASK_X),
969 shadow_comparitor));
970 inst->mlen++;
971 }
972
973 /* Load the LOD info */
974 if (op == ir_tex || op == ir_txl) {
975 int mrf, writemask;
976 if (devinfo->gen >= 5) {
977 mrf = param_base + 1;
978 if (shadow_comparitor.file != BAD_FILE) {
979 writemask = WRITEMASK_Y;
980 /* mlen already incremented */
981 } else {
982 writemask = WRITEMASK_X;
983 inst->mlen++;
984 }
985 } else /* devinfo->gen == 4 */ {
986 mrf = param_base;
987 writemask = WRITEMASK_W;
988 }
989 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
990 } else if (op == ir_txf) {
991 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
992 } else if (op == ir_txf_ms) {
993 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
994 sample_index));
995 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
996 * MCS data is stored in the first two channels of `mcs`, but we
997 * need to get it into the .y and .z channels of the second vec4
998 * of params.
999 */
1000 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1001 emit(MOV(dst_reg(MRF, param_base + 1,
1002 glsl_type::uint_type, WRITEMASK_YZ),
1003 mcs));
1004 } else if (devinfo->gen >= 7) {
1005 /* MCS data is in the first channel of `mcs`, but we need to get it into
1006 * the .y channel of the second vec4 of params, so replicate .x across
1007 * the whole vec4 and then mask off everything except .y
1008 */
1009 mcs.swizzle = BRW_SWIZZLE_XXXX;
1010 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1011 mcs));
1012 }
1013 inst->mlen++;
1014 } else if (op == ir_txd) {
1015 const brw_reg_type type = lod.type;
1016
1017 if (devinfo->gen >= 5) {
1018 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1019 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1020 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1021 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1022 inst->mlen++;
1023
1024 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1025 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1026 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1027 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1028 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1029 inst->mlen++;
1030
1031 if (shadow_comparitor.file != BAD_FILE) {
1032 emit(MOV(dst_reg(MRF, param_base + 2,
1033 shadow_comparitor.type, WRITEMASK_Z),
1034 shadow_comparitor));
1035 }
1036 }
1037 } else /* devinfo->gen == 4 */ {
1038 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1039 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1040 inst->mlen += 2;
1041 }
1042 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1043 if (shadow_comparitor.file != BAD_FILE) {
1044 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1045 shadow_comparitor));
1046 }
1047
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1049 offset_value));
1050 inst->mlen++;
1051 }
1052 }
1053
1054 emit(inst);
1055
1056 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1057 * spec requires layers.
1058 */
1059 if (op == ir_txs && is_cube_array) {
1060 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1061 writemask(inst->dst, WRITEMASK_Z),
1062 src_reg(inst->dst), brw_imm_d(6));
1063 }
1064
1065 if (devinfo->gen == 6 && op == ir_tg4) {
1066 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1067 }
1068
1069 if (op == ir_query_levels) {
1070 /* # levels is in .w */
1071 src_reg swizzled(dest);
1072 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1073 SWIZZLE_W, SWIZZLE_W);
1074 emit(MOV(dest, swizzled));
1075 }
1076 }
1077
1078 /**
1079 * Apply workarounds for Gen6 gather with UINT/SINT
1080 */
1081 void
1082 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1083 {
1084 if (!wa)
1085 return;
1086
1087 int width = (wa & WA_8BIT) ? 8 : 16;
1088 dst_reg dst_f = dst;
1089 dst_f.type = BRW_REGISTER_TYPE_F;
1090
1091 /* Convert from UNORM to UINT */
1092 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1093 emit(MOV(dst, src_reg(dst_f)));
1094
1095 if (wa & WA_SIGN) {
1096 /* Reinterpret the UINT value as a signed INT value by
1097 * shifting the sign bit into place, then shifting back
1098 * preserving sign.
1099 */
1100 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1101 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1102 }
1103 }
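/* A worked example with wa = WA_8BIT | WA_SIGN: an 8-bit texel whose raw
 * value is 0xff comes back from the sampler as UNORM 1.0. The MUL by 255 and
 * the MOV turn that into the integer 255, and the SHL/ASR pair by 24
 * sign-extends bit 7, producing -1, which is what an SINT fetch should have
 * returned in the first place.
 */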
1104
1105 void
1106 vec4_visitor::gs_emit_vertex(int stream_id)
1107 {
1108 unreachable("not reached");
1109 }
1110
1111 void
1112 vec4_visitor::gs_end_primitive()
1113 {
1114 unreachable("not reached");
1115 }
1116
1117 void
1118 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1119 dst_reg dst, src_reg surf_offset,
1120 src_reg src0, src_reg src1)
1121 {
1122 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1123 src_reg src_payload(this, glsl_type::uint_type, mlen);
1124 dst_reg payload(src_payload);
1125 payload.writemask = WRITEMASK_X;
1126
1127 /* Set the atomic operation offset. */
1128 emit(MOV(offset(payload, 0), surf_offset));
1129 unsigned i = 1;
1130
1131 /* Set the atomic operation arguments. */
1132 if (src0.file != BAD_FILE) {
1133 emit(MOV(offset(payload, i), src0));
1134 i++;
1135 }
1136
1137 if (src1.file != BAD_FILE) {
1138 emit(MOV(offset(payload, i), src1));
1139 i++;
1140 }
1141
1142 /* Emit the instruction. Note that this maps to the normal SIMD8
1143 * untyped atomic message on Ivy Bridge, but that's OK because
1144 * unused channels will be masked out.
1145 */
1146 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1147 src_payload,
1148 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1149 inst->mlen = mlen;
1150 }
1151
1152 void
1153 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1154 src_reg surf_offset)
1155 {
1156 dst_reg offset(this, glsl_type::uint_type);
1157 offset.writemask = WRITEMASK_X;
1158
1159 /* Set the surface read offset. */
1160 emit(MOV(offset, surf_offset));
1161
1162 /* Emit the instruction. Note that this maps to the normal SIMD8
1163 * untyped surface read message, but that's OK because unused
1164 * channels will be masked out.
1165 */
1166 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1167 src_reg(offset),
1168 brw_imm_ud(surf_index), brw_imm_d(1));
1169 inst->mlen = 1;
1170 }
1171
1172 void
1173 vec4_visitor::emit_ndc_computation()
1174 {
1175 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1176 return;
1177
1178 /* Get the position */
1179 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1180
1181 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1182 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1183 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1184
1185 current_annotation = "NDC";
1186 dst_reg ndc_w = ndc;
1187 ndc_w.writemask = WRITEMASK_W;
1188 src_reg pos_w = pos;
1189 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1190 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1191
1192 dst_reg ndc_xyz = ndc;
1193 ndc_xyz.writemask = WRITEMASK_XYZ;
1194
1195 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1196 }
1197
1198 void
1199 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1200 {
1201 if (devinfo->gen < 6 &&
1202 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1203 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1204 devinfo->has_negative_rhw_bug)) {
1205 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1206 dst_reg header1_w = header1;
1207 header1_w.writemask = WRITEMASK_W;
1208
1209 emit(MOV(header1, brw_imm_ud(0u)));
1210
1211 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1212 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1213
1214 current_annotation = "Point size";
1215 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1216 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1217 }
1218
1219 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1220 current_annotation = "Clipping flags";
1221 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1222 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1223
1224 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1225 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1226 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1227
1228 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1229 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1230 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1231 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1232 }
1233
1234 /* i965 clipping workaround:
1235 * 1) Test for -ve rhw
1236 * 2) If set,
1237 * set ndc = (0,0,0,0)
1238 * set ucp[6] = 1
1239 *
1240 * Later, clipping will detect ucp[6] and ensure the primitive is
1241 * clipped against all fixed planes.
1242 */
1243 if (devinfo->has_negative_rhw_bug &&
1244 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1245 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1246 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1247 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1248 vec4_instruction *inst;
1249 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1250 inst->predicate = BRW_PREDICATE_NORMAL;
1251 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1252 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1253 inst->predicate = BRW_PREDICATE_NORMAL;
1254 }
1255
1256 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1257 } else if (devinfo->gen < 6) {
1258 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1259 } else {
1260 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1261 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1262 dst_reg reg_w = reg;
1263 reg_w.writemask = WRITEMASK_W;
1264 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1265 reg_as_src.type = reg_w.type;
1266 reg_as_src.swizzle = brw_swizzle_for_size(1);
1267 emit(MOV(reg_w, reg_as_src));
1268 }
1269 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1270 dst_reg reg_y = reg;
1271 reg_y.writemask = WRITEMASK_Y;
1272 reg_y.type = BRW_REGISTER_TYPE_D;
1273 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1274 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1275 }
1276 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1277 dst_reg reg_z = reg;
1278 reg_z.writemask = WRITEMASK_Z;
1279 reg_z.type = BRW_REGISTER_TYPE_D;
1280 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1281 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1282 }
1283 }
1284 }
1285
1286 vec4_instruction *
1287 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1288 {
1289 assert(varying < VARYING_SLOT_MAX);
1290 assert(output_reg[varying].type == reg.type);
1291 current_annotation = output_reg_annotation[varying];
1292 if (output_reg[varying].file != BAD_FILE)
1293 return emit(MOV(reg, src_reg(output_reg[varying])));
1294 else
1295 return NULL;
1296 }
1297
1298 void
1299 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1300 {
1301 reg.type = BRW_REGISTER_TYPE_F;
1302 output_reg[varying].type = reg.type;
1303
1304 switch (varying) {
1305 case VARYING_SLOT_PSIZ:
1306 {
1307 /* PSIZ is always in slot 0, and is coupled with other flags. */
1308 current_annotation = "indices, point width, clip flags";
1309 emit_psiz_and_flags(reg);
1310 break;
1311 }
1312 case BRW_VARYING_SLOT_NDC:
1313 current_annotation = "NDC";
1314 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1315 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1316 break;
1317 case VARYING_SLOT_POS:
1318 current_annotation = "gl_Position";
1319 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1320 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1321 break;
1322 case VARYING_SLOT_EDGE:
1323 /* This is present when doing unfilled polygons. We're supposed to copy
1324 * the edge flag from the user-provided vertex array
1325 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1326 * of that attribute (starts as 1.0f). This is then used in clipping to
1327 * determine which edges should be drawn as wireframe.
1328 */
1329 current_annotation = "edge flag";
1330 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1331 glsl_type::float_type, WRITEMASK_XYZW))));
1332 break;
1333 case BRW_VARYING_SLOT_PAD:
1334 /* No need to write to this slot */
1335 break;
1336 default:
1337 emit_generic_urb_slot(reg, varying);
1338 break;
1339 }
1340 }
1341
1342 static int
1343 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1344 {
1345 if (devinfo->gen >= 6) {
1346 /* URB data written (does not include the message header reg) must
1347 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1348 * section 5.4.3.2.2: URB_INTERLEAVED.
1349 *
1350 * URB entries are allocated on a multiple of 1024 bits, so an
1351 * extra 128 bits written here to make the end align to 256 is
1352 * no problem.
1353 */
1354 if ((mlen % 2) != 1)
1355 mlen++;
1356 }
1357
1358 return mlen;
1359 }
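/* Since mlen includes the single message header register, "mlen % 2 != 1"
 * means the payload (mlen - 1 registers) is odd. For example, mlen = 4
 * (header plus 3 data registers) is padded to 5 so that two full 256-bit
 * rows are written, while mlen = 5 is already aligned and left alone.
 */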
1360
1361
1362 /**
1363 * Generates the VUE payload plus the necessary URB write instructions to
1364 * output it.
1365 *
1366 * The VUE layout is documented in Volume 2a.
1367 */
1368 void
1369 vec4_visitor::emit_vertex()
1370 {
1371 /* MRF 0 is reserved for the debugger, so start with message header
1372 * in MRF 1.
1373 */
1374 int base_mrf = 1;
1375 int mrf = base_mrf;
1376 /* In the process of generating our URB write message contents, we
1377 * may need to unspill a register or load from an array. Those
1378 * reads would use MRFs 14-15.
1379 */
1380 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1381
1382 /* The following assertion verifies that max_usable_mrf causes an
1383 * even-numbered amount of URB write data, which will meet gen6's
1384 * requirements for length alignment.
1385 */
1386 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1387
1388 /* First mrf is the g0-based message header containing URB handles and
1389 * such.
1390 */
1391 emit_urb_write_header(mrf++);
1392
1393 if (devinfo->gen < 6) {
1394 emit_ndc_computation();
1395 }
1396
1397 /* We may need to split this up into several URB writes, so do them in a
1398 * loop.
1399 */
1400 int slot = 0;
1401 bool complete = false;
1402 do {
1403 /* URB offset is in URB row increments, and each of our MRFs is half of
1404 * one of those, since we're doing interleaved writes.
1405 */
1406 int offset = slot / 2;
1407
1408 mrf = base_mrf + 1;
1409 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1410 emit_urb_slot(dst_reg(MRF, mrf++),
1411 prog_data->vue_map.slot_to_varying[slot]);
1412
1413 /* If this was max_usable_mrf, we can't fit anything more into this
1414 * URB WRITE. Same thing if we reached the maximum length available.
1415 */
1416 if (mrf > max_usable_mrf ||
1417 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1418 slot++;
1419 break;
1420 }
1421 }
1422
1423 complete = slot >= prog_data->vue_map.num_slots;
1424 current_annotation = "URB write";
1425 vec4_instruction *inst = emit_urb_write_opcode(complete);
1426 inst->base_mrf = base_mrf;
1427 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1428 inst->offset += offset;
1429 } while(!complete);
1430 }
1431
1432
1433 src_reg
1434 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1435 src_reg *reladdr, int reg_offset)
1436 {
1437 /* Because we store the values to scratch interleaved like our
1438 * vertex data, we need to scale the vec4 index by 2.
1439 */
1440 int message_header_scale = 2;
1441
1442 /* Pre-gen6, the message header uses byte offsets instead of vec4
1443 * (16-byte) offset units.
1444 */
1445 if (devinfo->gen < 6)
1446 message_header_scale *= 16;
1447
1448 if (reladdr) {
1449 src_reg index = src_reg(this, glsl_type::int_type);
1450
1451 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1452 brw_imm_d(reg_offset)));
1453 emit_before(block, inst, MUL(dst_reg(index), index,
1454 brw_imm_d(message_header_scale)));
1455
1456 return index;
1457 } else {
1458 return brw_imm_d(reg_offset * message_header_scale);
1459 }
1460 }
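/* For example, reg_offset = 3 with no reladdr becomes an immediate of 6 on
 * gen6+ (interleaved vec4 units) and 96 on gen4/5 (3 * 2 * 16 bytes).
 */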
1461
1462 /**
1463 * Emits an instruction before @inst to load the value named by @orig_src
1464 * from scratch space at @base_offset to @temp.
1465 *
1466 * @base_offset is measured in 32-byte units (the size of a register).
1467 */
1468 void
1469 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1470 dst_reg temp, src_reg orig_src,
1471 int base_offset)
1472 {
1473 int reg_offset = base_offset + orig_src.reg_offset;
1474 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1475 reg_offset);
1476
1477 emit_before(block, inst, SCRATCH_READ(temp, index));
1478 }
1479
1480 /**
1481 * Emits an instruction after @inst to store the value to be written
1482 * to @orig_dst to scratch space at @base_offset, from @temp.
1483 *
1484 * @base_offset is measured in 32-byte units (the size of a register).
1485 */
1486 void
1487 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1488 int base_offset)
1489 {
1490 int reg_offset = base_offset + inst->dst.reg_offset;
1491 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1492 reg_offset);
1493
1494 /* Create a temporary register to store *inst's result in.
1495 *
1496 * We have to be careful in MOVing from our temporary result register in
1497 * the scratch write. If we swizzle from channels of the temporary that
1498 * weren't initialized, it will confuse live interval analysis, which will
1499 * make spilling fail to make progress.
1500 */
1501 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1502 inst->dst.type),
1503 brw_swizzle_for_mask(inst->dst.writemask));
1504 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1505 inst->dst.writemask));
1506 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1507 if (inst->opcode != BRW_OPCODE_SEL)
1508 write->predicate = inst->predicate;
1509 write->ir = inst->ir;
1510 write->annotation = inst->annotation;
1511 inst->insert_after(block, write);
1512
1513 inst->dst.file = temp.file;
1514 inst->dst.nr = temp.nr;
1515 inst->dst.reg_offset = temp.reg_offset;
1516 inst->dst.reladdr = NULL;
1517 }
1518
1519 /**
1520 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1521 * adds the scratch read(s) before \p inst. The function also checks for
1522 * recursive reladdr scratch accesses, issuing the corresponding scratch
1523 * loads and rewriting reladdr references accordingly.
1524 *
1525 * \return \p src if it did not require a scratch load, otherwise, the
1526 * register holding the result of the scratch load that the caller should
1527 * use to rewrite src.
1528 */
1529 src_reg
1530 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1531 vec4_instruction *inst, src_reg src)
1532 {
1533 /* Resolve recursive reladdr scratch access by calling ourselves
1534 * with src.reladdr
1535 */
1536 if (src.reladdr)
1537 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1538 *src.reladdr);
1539
1540 /* Now handle scratch access on src */
1541 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1542 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1543 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1544 src.nr = temp.nr;
1545 src.reg_offset = temp.reg_offset;
1546 src.reladdr = NULL;
1547 }
1548
1549 return src;
1550 }
1551
1552 /**
1553 * We can't generally support array access in GRF space, because a
1554 * single instruction's destination can only span 2 contiguous
1555 * registers. So, we send all GRF arrays that get variable index
1556 * access to scratch space.
1557 */
1558 void
1559 vec4_visitor::move_grf_array_access_to_scratch()
1560 {
1561 int scratch_loc[this->alloc.count];
1562 memset(scratch_loc, -1, sizeof(scratch_loc));
1563
1564 /* First, calculate the set of virtual GRFs that need to be punted
1565 * to scratch due to having any array access on them, and where in
1566 * scratch.
1567 */
1568 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1569 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1570 if (scratch_loc[inst->dst.nr] == -1) {
1571 scratch_loc[inst->dst.nr] = last_scratch;
1572 last_scratch += this->alloc.sizes[inst->dst.nr];
1573 }
1574
1575 for (src_reg *iter = inst->dst.reladdr;
1576 iter->reladdr;
1577 iter = iter->reladdr) {
1578 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1579 scratch_loc[iter->nr] = last_scratch;
1580 last_scratch += this->alloc.sizes[iter->nr];
1581 }
1582 }
1583 }
1584
1585 for (int i = 0 ; i < 3; i++) {
1586 for (src_reg *iter = &inst->src[i];
1587 iter->reladdr;
1588 iter = iter->reladdr) {
1589 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1590 scratch_loc[iter->nr] = last_scratch;
1591 last_scratch += this->alloc.sizes[iter->nr];
1592 }
1593 }
1594 }
1595 }
1596
1597 /* Now, for anything that will be accessed through scratch, rewrite
1598 * it to load/store. Note that this is a _safe list walk, because
1599 * we may generate a new scratch_write instruction after the one
1600 * we're processing.
1601 */
1602 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1603 /* Set up the annotation tracking for new generated instructions. */
1604 base_ir = inst->ir;
1605 current_annotation = inst->annotation;
1606
1607 /* First handle scratch access on the dst. Notice we have to handle
1608 * the case where the dst's reladdr also points to scratch space.
1609 */
1610 if (inst->dst.reladdr)
1611 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1612 *inst->dst.reladdr);
1613
1614 /* Now that we have handled any (possibly recursive) reladdr scratch
1615 * accesses for dst we can safely do the scratch write for dst itself
1616 */
1617 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1618 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1619
1620 /* Now handle scratch access on any src. In this case, since inst->src[i]
1621 * already is a src_reg, we can just call emit_resolve_reladdr with
1622 * inst->src[i] and it will take care of handling scratch loads for
1623 * both src and src.reladdr (recursively).
1624 */
1625 for (int i = 0 ; i < 3; i++) {
1626 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1627 inst->src[i]);
1628 }
1629 }
1630 }
1631
1632 /**
1633 * Emits an instruction before @inst to load the value named by @orig_src
1634 * from the pull constant buffer (surface) at @base_offset to @temp.
1635 */
1636 void
1637 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1638 dst_reg temp, src_reg orig_src,
1639 int base_offset, src_reg indirect)
1640 {
1641 int reg_offset = base_offset + orig_src.reg_offset;
1642 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1643
1644 src_reg offset;
1645 if (indirect.file != BAD_FILE) {
1646 offset = src_reg(this, glsl_type::int_type);
1647
1648 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1649 brw_imm_d(reg_offset * 16)));
1650 } else if (devinfo->gen >= 8) {
1651 /* Store the offset in a GRF so we can send-from-GRF. */
1652 offset = src_reg(this, glsl_type::int_type);
1653 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
1654 } else {
1655 offset = brw_imm_d(reg_offset * 16);
1656 }
1657
1658 emit_pull_constant_load_reg(temp,
1659 brw_imm_ud(index),
1660 offset,
1661 block, inst);
1662
1663 brw_mark_surface_used(&prog_data->base, index);
1664 }
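/* reg_offset counts vec4 constants, and the * 16 above converts that to a
 * byte offset since each vec4 occupies 16 bytes. For example, base_offset = 2
 * with orig_src.reg_offset = 1 gives reg_offset = 3 and a byte offset of 48.
 */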
1665
1666 /**
1667 * Implements array access of uniforms by inserting a
1668 * PULL_CONSTANT_LOAD instruction.
1669 *
1670 * Unlike temporary GRF array access (where we don't support it due to
1671 * the difficulty of doing relative addressing on instruction
1672 * destinations), we could potentially do array access of uniforms
1673 * that were loaded in GRF space as push constants. In real-world
1674 * usage we've seen, though, the arrays being used are always larger
1675 * than we could load as push constants, so just always move all
1676 * uniform array access out to a pull constant buffer.
1677 */
1678 void
1679 vec4_visitor::move_uniform_array_access_to_pull_constants()
1680 {
1681 int pull_constant_loc[this->uniforms];
1682 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1683
1684 /* First, walk through the instructions and determine which things need to
1685 * be pulled. We mark something as needing to be pulled by setting
1686 * pull_constant_loc to 0.
1687 */
1688 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1689 /* We only care about MOV_INDIRECT of a uniform */
1690 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1691 inst->src[0].file != UNIFORM)
1692 continue;
1693
1694 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1695
1696 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1697 pull_constant_loc[uniform_nr + j] = 0;
1698 }
1699
1700 /* Next, we walk the list of uniforms and assign real pull constant
1701 * locations and set their corresponding entries in pull_param.
1702 */
1703 for (int j = 0; j < this->uniforms; j++) {
1704 if (pull_constant_loc[j] < 0)
1705 continue;
1706
1707 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1708
1709 for (int i = 0; i < 4; i++) {
1710 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1711 = stage_prog_data->param[j * 4 + i];
1712 }
1713 }
1714
1715 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1716 * instructions to actual uniform pulls.
1717 */
1718 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1719 /* We only care about MOV_INDIRECT of a uniform */
1720 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1721 inst->src[0].file != UNIFORM)
1722 continue;
1723
1724 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1725
1726 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1727
1728 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1729 pull_constant_loc[uniform_nr], inst->src[1]);
1730 inst->remove(block);
1731 }
1732
1733 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1734 * no need to track them as larger-than-vec4 objects. This will be
1735 * relied on in cutting out unused uniform vectors from push
1736 * constants.
1737 */
1738 split_uniform_registers();
1739 }
1740
1741 void
1742 vec4_visitor::resolve_ud_negate(src_reg *reg)
1743 {
1744 if (reg->type != BRW_REGISTER_TYPE_UD ||
1745 !reg->negate)
1746 return;
1747
1748 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1749 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1750 *reg = temp;
1751 }
1752
1753 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1754 void *log_data,
1755 const struct brw_sampler_prog_key_data *key_tex,
1756 struct brw_vue_prog_data *prog_data,
1757 const nir_shader *shader,
1758 void *mem_ctx,
1759 bool no_spills,
1760 int shader_time_index)
1761 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1762 key_tex(key_tex),
1763 prog_data(prog_data),
1764 fail_msg(NULL),
1765 first_non_payload_grf(0),
1766 need_all_constants_in_pull_buffer(false),
1767 no_spills(no_spills),
1768 shader_time_index(shader_time_index),
1769 last_scratch(0)
1770 {
1771 this->failed = false;
1772
1773 this->base_ir = NULL;
1774 this->current_annotation = NULL;
1775 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1776
1777 this->virtual_grf_start = NULL;
1778 this->virtual_grf_end = NULL;
1779 this->live_intervals = NULL;
1780
1781 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1782
1783 this->uniforms = 0;
1784 }
1785
1786 vec4_visitor::~vec4_visitor()
1787 {
1788 }
1789
1790
1791 void
1792 vec4_visitor::fail(const char *format, ...)
1793 {
1794 va_list va;
1795 char *msg;
1796
1797 if (failed)
1798 return;
1799
1800 failed = true;
1801
1802 va_start(va, format);
1803 msg = ralloc_vasprintf(mem_ctx, format, va);
1804 va_end(va);
1805 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1806
1807 this->fail_msg = msg;
1808
1809 if (debug_enabled) {
1810 fprintf(stderr, "%s", msg);
1811 }
1812 }
1813
1814 } /* namespace brw */