i965/vec4: fix size_written for doubles
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
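/* size_written is the number of bytes this instruction writes to its
 * destination: exec_size lanes of the destination type. A double-precision
 * destination therefore counts for twice as much as a single-precision one.
 */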
58 this->size_written = (dst.file == BAD_FILE ?
59 0 : this->exec_size * type_sz(dst.type));
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188 ALU1(DIM)
189
190 /** Gen4 predicated IF. */
191 vec4_instruction *
192 vec4_visitor::IF(enum brw_predicate predicate)
193 {
194 vec4_instruction *inst;
195
196 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
197 inst->predicate = predicate;
198
199 return inst;
200 }
201
202 /** Gen6 IF with embedded comparison. */
203 vec4_instruction *
204 vec4_visitor::IF(src_reg src0, src_reg src1,
205 enum brw_conditional_mod condition)
206 {
207 assert(devinfo->gen == 6);
208
209 vec4_instruction *inst;
210
211 resolve_ud_negate(&src0);
212 resolve_ud_negate(&src1);
213
214 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
215 src0, src1);
216 inst->conditional_mod = condition;
217
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 vec4_instruction *
227 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
228 enum brw_conditional_mod condition)
229 {
230 vec4_instruction *inst;
231
232 /* Take the instruction:
233 *
234 * CMP null<d> src0<f> src1<f>
235 *
236 * Original gen4 does type conversion to the destination type before
237 * comparison, producing garbage results for floating point comparisons.
238 *
239 * The destination type doesn't matter on newer generations, so we set the
240 * type to match src0 so we can compact the instruction.
241 */
242 dst.type = src0.type;
243
244 resolve_ud_negate(&src0);
245 resolve_ud_negate(&src1);
246
247 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
248 inst->conditional_mod = condition;
249
250 return inst;
251 }
252
253 vec4_instruction *
254 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
255 {
256 vec4_instruction *inst;
257
258 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
259 dst, index);
260 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
261 inst->mlen = 2;
262
263 return inst;
264 }
265
266 vec4_instruction *
267 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
268 const src_reg &index)
269 {
270 vec4_instruction *inst;
271
272 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
273 dst, src, index);
274 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
275 inst->mlen = 3;
276
277 return inst;
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(const src_reg &src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
303 return src_reg(expanded);
304 }
305
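/**
 * If \p src has the abs or negate modifier set, resolve the modifier into a
 * temporary with a MOV and return the temporary, so the value can be used
 * where source modifiers are not acceptable.
 */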
306 src_reg
307 vec4_visitor::resolve_source_modifiers(const src_reg &src)
308 {
309 if (!src.abs && !src.negate)
310 return src;
311
312 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
313 resolved.type = src.type;
314 emit(MOV(resolved, src));
315
316 return src_reg(resolved);
317 }
318
319 src_reg
320 vec4_visitor::fix_math_operand(const src_reg &src)
321 {
322 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
323 return src;
324
325 /* The gen6 math instruction ignores the source modifiers --
326 * swizzle, abs, negate, and at least some parts of the register
327 * region description.
328 *
329 * Rather than trying to enumerate all these cases, *always* expand the
330 * operand to a temp GRF for gen6.
331 *
332 * For gen7, keep the operand as-is, except if immediate, which gen7 still
333 * can't use.
334 */
335
336 if (devinfo->gen == 7 && src.file != IMM)
337 return src;
338
339 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
340 expanded.type = src.type;
341 emit(MOV(expanded, src));
342 return src_reg(expanded);
343 }
344
345 vec4_instruction *
346 vec4_visitor::emit_math(enum opcode opcode,
347 const dst_reg &dst,
348 const src_reg &src0, const src_reg &src1)
349 {
350 vec4_instruction *math =
351 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
352
353 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
354 /* MATH on Gen6 must be align1, so we can't do writemasks. */
355 math->dst = dst_reg(this, glsl_type::vec4_type);
356 math->dst.type = dst.type;
357 math = emit(MOV(dst, src_reg(math->dst)));
358 } else if (devinfo->gen < 6) {
359 math->base_mrf = 1;
360 math->mlen = src1.file == BAD_FILE ? 1 : 2;
361 }
362
363 return math;
364 }
365
366 void
367 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
368 {
369 if (devinfo->gen < 7) {
370 unreachable("ir_unop_pack_half_2x16 should be lowered");
371 }
372
373 assert(dst.type == BRW_REGISTER_TYPE_UD);
374 assert(src0.type == BRW_REGISTER_TYPE_F);
375
376 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
377 *
378 * Because this instruction does not have a 16-bit floating-point type,
379 * the destination data type must be Word (W).
380 *
381 * The destination must be DWord-aligned and specify a horizontal stride
382 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
383 * each destination channel and the upper word is not modified.
384 *
385 * The above restriction implies that the f32to16 instruction must use
386 * align1 mode, because only in align1 mode is it possible to specify
387 * horizontal stride. We choose here to defy the hardware docs and emit
388 * align16 instructions.
389 *
390 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
391 * instructions. I was partially successful in that the code passed all
392 * tests. However, the code was dubiously correct and fragile, and the
393 * tests were not harsh enough to probe that frailty. Not trusting the
394 * code, I chose instead to remain in align16 mode in defiance of the hw
395 * docs).
396 *
397 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
398 * simulator, emitting a f32to16 in align16 mode with UD as destination
399 * data type is safe. The behavior differs from that specified in the PRM
400 * in that the upper word of each destination channel is cleared to 0.
401 */
402
403 dst_reg tmp_dst(this, glsl_type::uvec2_type);
404 src_reg tmp_src(tmp_dst);
405
406 #if 0
407 /* Verify the undocumented behavior on which the following instructions
408 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
409 * then the result of the bit-or instruction below will be incorrect.
410 *
411 * You should inspect the disasm output in order to verify that the MOV is
412 * not optimized away.
413 */
414 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
415 #endif
416
417 /* Give tmp the form below, where "." means untouched.
418 *
419 * w z y x w z y x
420 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
421 *
422 * That the upper word of each write-channel be 0 is required for the
423 * following bit-shift and bit-or instructions to work. Note that this
424 * relies on the undocumented hardware behavior mentioned above.
425 */
426 tmp_dst.writemask = WRITEMASK_XY;
427 emit(F32TO16(tmp_dst, src0));
428
429 /* Give the write-channels of dst the form:
430 * 0xhhhh0000
431 */
432 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
433 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
434
435 /* Finally, give the write-channels of dst the form of packHalf2x16's
436 * output:
437 * 0xhhhhllll
438 */
439 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
440 emit(OR(dst, src_reg(dst), tmp_src));
441 }
442
443 void
444 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
445 {
446 if (devinfo->gen < 7) {
447 unreachable("ir_unop_unpack_half_2x16 should be lowered");
448 }
449
450 assert(dst.type == BRW_REGISTER_TYPE_F);
451 assert(src0.type == BRW_REGISTER_TYPE_UD);
452
453 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
454 *
455 * Because this instruction does not have a 16-bit floating-point type,
456 * the source data type must be Word (W). The destination type must be
457 * F (Float).
458 *
459 * To use W as the source data type, we must adjust horizontal strides,
460 * which is only possible in align1 mode. All my [chadv] attempts at
461 * emitting align1 instructions for unpackHalf2x16 failed to pass the
462 * Piglit tests, so I gave up.
463 *
464 * I've verified that, on gen7 hardware and the simulator, it is safe to
465 * emit f16to32 in align16 mode with UD as source data type.
466 */
467
468 dst_reg tmp_dst(this, glsl_type::uvec2_type);
469 src_reg tmp_src(tmp_dst);
470
471 tmp_dst.writemask = WRITEMASK_X;
472 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
473
474 tmp_dst.writemask = WRITEMASK_Y;
475 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
476
477 dst.writemask = WRITEMASK_XY;
478 emit(F16TO32(dst, tmp_src));
479 }
480
481 void
482 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
483 {
484 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
485 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
486 * is not suitable to generate the shift values, but we can use the packed
487 * vector float and a type-converting MOV.
488 */
489 dst_reg shift(this, glsl_type::uvec4_type);
490 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
491
492 dst_reg shifted(this, glsl_type::uvec4_type);
493 src0.swizzle = BRW_SWIZZLE_XXXX;
494 emit(SHR(shifted, src0, src_reg(shift)));
495
496 shifted.type = BRW_REGISTER_TYPE_UB;
497 dst_reg f(this, glsl_type::vec4_type);
498 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
499
500 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
501 }
502
503 void
504 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
505 {
506 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
507 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
508 * is not suitable to generate the shift values, but we can use the packed
509 * vector float and a type-converting MOV.
510 */
511 dst_reg shift(this, glsl_type::uvec4_type);
512 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
513
514 dst_reg shifted(this, glsl_type::uvec4_type);
515 src0.swizzle = BRW_SWIZZLE_XXXX;
516 emit(SHR(shifted, src0, src_reg(shift)));
517
518 shifted.type = BRW_REGISTER_TYPE_B;
519 dst_reg f(this, glsl_type::vec4_type);
520 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
521
522 dst_reg scaled(this, glsl_type::vec4_type);
523 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
524
525 dst_reg max(this, glsl_type::vec4_type);
526 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
527 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
528 }
529
530 void
531 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
532 {
533 dst_reg saturated(this, glsl_type::vec4_type);
534 vec4_instruction *inst = emit(MOV(saturated, src0));
535 inst->saturate = true;
536
537 dst_reg scaled(this, glsl_type::vec4_type);
538 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
539
540 dst_reg rounded(this, glsl_type::vec4_type);
541 emit(RNDE(rounded, src_reg(scaled)));
542
543 dst_reg u(this, glsl_type::uvec4_type);
544 emit(MOV(u, src_reg(rounded)));
545
546 src_reg bytes(u);
547 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
548 }
549
550 void
551 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
552 {
553 dst_reg max(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
555
556 dst_reg min(this, glsl_type::vec4_type);
557 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
558
559 dst_reg scaled(this, glsl_type::vec4_type);
560 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
561
562 dst_reg rounded(this, glsl_type::vec4_type);
563 emit(RNDE(rounded, src_reg(scaled)));
564
565 dst_reg i(this, glsl_type::ivec4_type);
566 emit(MOV(i, src_reg(rounded)));
567
568 src_reg bytes(i);
569 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
570 }
571
572 /*
573 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
574 * false) elements needed to pack a type.
575 */
576 static int
577 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 case GLSL_TYPE_DOUBLE:
588 if (type->is_matrix()) {
589 const glsl_type *col_type = type->column_type();
590 unsigned col_slots =
591 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
592 return type->matrix_columns * col_slots;
593 } else {
594 /* Regardless of the size of the vector, it gets a vec4. This is bad
595 * packing for things like floats, but otherwise arrays become a
596 * mess. Hopefully a later pass over the code can pack scalars
597 * down if appropriate.
598 */
599 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
600 }
601 case GLSL_TYPE_ARRAY:
602 assert(type->length > 0);
603 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
604 case GLSL_TYPE_STRUCT:
605 size = 0;
606 for (i = 0; i < type->length; i++) {
607 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
608 }
609 return size;
610 case GLSL_TYPE_SUBROUTINE:
611 return 1;
612
613 case GLSL_TYPE_SAMPLER:
614 /* Samplers take up no register space, since they're baked in at
615 * link time.
616 */
617 return 0;
618 case GLSL_TYPE_ATOMIC_UINT:
619 return 0;
620 case GLSL_TYPE_IMAGE:
621 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
622 case GLSL_TYPE_VOID:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
631
632 /**
633 * Returns the minimum number of vec4 elements needed to pack a type.
634 *
635 * For simple types, it will return 1 (a single vec4); for matrices, the
636 * number of columns; for array and struct, the sum of the vec4_size of
637 * each of its elements; and for sampler and atomic, zero.
638 *
639 * This method is useful to calculate how much register space is needed to
640 * store a particular type.
641 */
642 extern "C" int
643 type_size_vec4(const struct glsl_type *type)
644 {
645 return type_size_xvec4(type, true);
646 }
647
648 /**
649 * Returns the minimum number of dvec4 elements needed to pack a type.
650 *
651 * For simple types, it will return 1 (a single dvec4); for matrices, the
652 * number of columns; for array and struct, the sum of the dvec4_size of
653 * each of its elements; and for sampler and atomic, zero.
654 *
655 * This method is useful to calculate how much register space is needed to
656 * store a particular type.
657 *
658 * Measuring double-precision vertex inputs as dvec4 is required because
659 * ARB_vertex_attrib_64bit states that these use the same number of locations
660 * as the single-precision version. That is, two consecutive dvec4s would be
661 * located in locations "x" and "x+1", not "x+2".
662 *
663 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
664 * remap_vs_attrs() takes into account both the location and whether the
665 * type fits in one or two vec4 slots.
666 */
667 extern "C" int
668 type_size_dvec4(const struct glsl_type *type)
669 {
670 return type_size_xvec4(type, false);
671 }
672
673 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
674 {
675 init();
676
677 this->file = VGRF;
678 this->nr = v->alloc.allocate(type_size_vec4(type));
679
680 if (type->is_array() || type->is_record()) {
681 this->swizzle = BRW_SWIZZLE_NOOP;
682 } else {
683 this->swizzle = brw_swizzle_for_size(type->vector_elements);
684 }
685
686 this->type = brw_type_for_base_type(type);
687 }
688
689 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
690 {
691 assert(size > 0);
692
693 init();
694
695 this->file = VGRF;
696 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
697
698 this->swizzle = BRW_SWIZZLE_NOOP;
699
700 this->type = brw_type_for_base_type(type);
701 }
702
703 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
704 {
705 init();
706
707 this->file = VGRF;
708 this->nr = v->alloc.allocate(type_size_vec4(type));
709
710 if (type->is_array() || type->is_record()) {
711 this->writemask = WRITEMASK_XYZW;
712 } else {
713 this->writemask = (1 << type->vector_elements) - 1;
714 }
715
716 this->type = brw_type_for_base_type(type);
717 }
718
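/**
 * Emit a SEL instruction with the given conditional mod, computing
 * min(src0, src1) for BRW_CONDITIONAL_L and max(src0, src1) for
 * BRW_CONDITIONAL_GE.
 */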
719 vec4_instruction *
720 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
721 src_reg src0, src_reg src1)
722 {
723 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
724 inst->conditional_mod = conditionalmod;
725 return inst;
726 }
727
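/**
 * Emit a linear interpolation of \p x and \p y by \p a, i.e. x*(1-a) + y*a,
 * using the LRP instruction on gen6+ and an expanded MUL/ADD sequence on
 * earlier generations.
 */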
728 vec4_instruction *
729 vec4_visitor::emit_lrp(const dst_reg &dst,
730 const src_reg &x, const src_reg &y, const src_reg &a)
731 {
732 if (devinfo->gen >= 6) {
733 /* Note that the instruction's argument order is reversed from GLSL
734 * and the IR.
735 */
736 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
737 fix_3src_operand(x)));
738 } else {
739 /* Earlier generations don't support three source operations, so we
740 * need to emit x*(1-a) + y*a.
741 */
742 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
743 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
744 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
745 y_times_a.writemask = dst.writemask;
746 one_minus_a.writemask = dst.writemask;
747 x_times_one_minus_a.writemask = dst.writemask;
748
749 emit(MUL(y_times_a, y, a));
750 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
751 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
752 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
753 }
754 }
755
756 /**
757 * Emits the instructions needed to perform a pull constant load. before_block
758 * and before_inst can be NULL, in which case the instruction will be appended
759 * to the end of the instruction list.
760 */
761 void
762 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
763 src_reg surf_index,
764 src_reg offset_reg,
765 bblock_t *before_block,
766 vec4_instruction *before_inst)
767 {
768 assert((before_inst == NULL && before_block == NULL) ||
769 (before_inst && before_block));
770
771 vec4_instruction *pull;
772
773 if (devinfo->gen >= 9) {
774 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
775 src_reg header(this, glsl_type::uvec4_type, 2);
776
777 pull = new(mem_ctx)
778 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
779 dst_reg(header));
780
781 if (before_inst)
782 emit_before(before_block, before_inst, pull);
783 else
784 emit(pull);
785
786 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
787 offset_reg.type);
788 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
789
790 if (before_inst)
791 emit_before(before_block, before_inst, pull);
792 else
793 emit(pull);
794
795 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
796 dst,
797 surf_index,
798 header);
799 pull->mlen = 2;
800 pull->header_size = 1;
801 } else if (devinfo->gen >= 7) {
802 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
803
804 grf_offset.type = offset_reg.type;
805
806 pull = MOV(grf_offset, offset_reg);
807
808 if (before_inst)
809 emit_before(before_block, before_inst, pull);
810 else
811 emit(pull);
812
813 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
814 dst,
815 surf_index,
816 src_reg(grf_offset));
817 pull->mlen = 1;
818 } else {
819 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
820 dst,
821 surf_index,
822 offset_reg);
823 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
824 pull->mlen = 1;
825 }
826
827 if (before_inst)
828 emit_before(before_block, before_inst, pull);
829 else
830 emit(pull);
831 }
832
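/**
 * Copy the value of an arbitrary live channel of \p src into every channel
 * of the result, yielding a value that is uniform across the execution mask.
 */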
833 src_reg
834 vec4_visitor::emit_uniformize(const src_reg &src)
835 {
836 const src_reg chan_index(this, glsl_type::uint_type);
837 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
838 src.type);
839
840 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
841 ->force_writemask_all = true;
842 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
843 ->force_writemask_all = true;
844
845 return src_reg(dst);
846 }
847
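/**
 * Fetch the MCS (multisample control surface) data for the given texel
 * coordinate, for use as a parameter of a subsequent TXF_CMS or TXF_CMS_W
 * message.
 */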
848 src_reg
849 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
850 src_reg coordinate, src_reg surface)
851 {
852 vec4_instruction *inst =
853 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
854 dst_reg(this, glsl_type::uvec4_type));
855 inst->base_mrf = 2;
856 inst->src[1] = surface;
857 inst->src[2] = surface;
858
859 int param_base;
860
861 if (devinfo->gen >= 9) {
862 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
863 vec4_instruction *header_inst = new(mem_ctx)
864 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
865 dst_reg(MRF, inst->base_mrf));
866
867 emit(header_inst);
868
869 inst->mlen = 2;
870 inst->header_size = 1;
871 param_base = inst->base_mrf + 1;
872 } else {
873 inst->mlen = 1;
874 param_base = inst->base_mrf;
875 }
876
877 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
878 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
879 int zero_mask = 0xf & ~coord_mask;
880
881 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
882 coordinate));
883
884 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
885 brw_imm_d(0)));
886
887 emit(inst);
888 return src_reg(inst->dst);
889 }
890
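/**
 * Returns true if \p sampler may refer to a sampler index of 16 or higher,
 * which requires a message header and is only possible on Haswell and later
 * hardware.
 */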
891 bool
892 vec4_visitor::is_high_sampler(src_reg sampler)
893 {
894 if (devinfo->gen < 8 && !devinfo->is_haswell)
895 return false;
896
897 return sampler.file != IMM || sampler.ud >= 16;
898 }
899
900 void
901 vec4_visitor::emit_texture(ir_texture_opcode op,
902 dst_reg dest,
903 const glsl_type *dest_type,
904 src_reg coordinate,
905 int coord_components,
906 src_reg shadow_comparator,
907 src_reg lod, src_reg lod2,
908 src_reg sample_index,
909 uint32_t constant_offset,
910 src_reg offset_value,
911 src_reg mcs,
912 uint32_t surface,
913 src_reg surface_reg,
914 src_reg sampler_reg)
915 {
916 /* The sampler can only meaningfully compute LOD for fragment shader
917 * messages. For all other stages, we change the opcode to TXL and hardcode
918 * the LOD to 0.
919 *
920 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
921 * valid LOD argument.
922 */
923 if (op == ir_tex || op == ir_query_levels) {
924 assert(lod.file == BAD_FILE);
925 lod = brw_imm_f(0.0f);
926 }
927
928 enum opcode opcode;
929 switch (op) {
930 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
932 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
933 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
934 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
935 SHADER_OPCODE_TXF_CMS); break;
936 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
937 case ir_tg4: opcode = offset_value.file != BAD_FILE
938 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
939 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
940 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
941 case ir_txb:
942 unreachable("TXB is not valid for vertex shaders.");
943 case ir_lod:
944 unreachable("LOD is not valid for vertex shaders.");
945 case ir_samples_identical: {
946 /* There are some challenges implementing this for vec4, and it seems
947 * unlikely to be used anyway. For now, just always return false.
948 */
949 emit(MOV(dest, brw_imm_ud(0u)));
950 return;
951 }
952 default:
953 unreachable("Unrecognized tex op");
954 }
955
956 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
957
958 inst->offset = constant_offset;
959
960 /* The message header is necessary for:
961 * - Gen4 (always)
962 * - Gen9+ for selecting SIMD4x2
963 * - Texel offsets
964 * - Gather channel selection
965 * - Sampler indices too large to fit in a 4-bit value.
966 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
967 */
968 inst->header_size =
969 (devinfo->gen < 5 || devinfo->gen >= 9 ||
970 inst->offset != 0 || op == ir_tg4 ||
971 op == ir_texture_samples ||
972 is_high_sampler(sampler_reg)) ? 1 : 0;
973 inst->base_mrf = 2;
974 inst->mlen = inst->header_size;
975 inst->dst.writemask = WRITEMASK_XYZW;
976 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
977
978 inst->src[1] = surface_reg;
979 inst->src[2] = sampler_reg;
980
981 /* MRF for the first parameter */
982 int param_base = inst->base_mrf + inst->header_size;
983
984 if (op == ir_txs || op == ir_query_levels) {
985 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
986 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
987 inst->mlen++;
988 } else if (op == ir_texture_samples) {
989 inst->dst.writemask = WRITEMASK_X;
990 } else {
991 /* Load the coordinate */
992 /* FINISHME: gl_clamp_mask and saturate */
993 int coord_mask = (1 << coord_components) - 1;
994 int zero_mask = 0xf & ~coord_mask;
995
996 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
997 coordinate));
998 inst->mlen++;
999
1000 if (zero_mask != 0) {
1001 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1002 brw_imm_d(0)));
1003 }
1004 /* Load the shadow comparator */
1005 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1006 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1007 WRITEMASK_X),
1008 shadow_comparator));
1009 inst->mlen++;
1010 }
1011
1012 /* Load the LOD info */
1013 if (op == ir_tex || op == ir_txl) {
1014 int mrf, writemask;
1015 if (devinfo->gen >= 5) {
1016 mrf = param_base + 1;
1017 if (shadow_comparator.file != BAD_FILE) {
1018 writemask = WRITEMASK_Y;
1019 /* mlen already incremented */
1020 } else {
1021 writemask = WRITEMASK_X;
1022 inst->mlen++;
1023 }
1024 } else /* devinfo->gen == 4 */ {
1025 mrf = param_base;
1026 writemask = WRITEMASK_W;
1027 }
1028 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1029 } else if (op == ir_txf) {
1030 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1031 } else if (op == ir_txf_ms) {
1032 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1033 sample_index));
1034 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1035 /* MCS data is stored in the first two channels of `mcs`, but we
1036 * need to get it into the .y and .z channels of the second vec4
1037 * of params.
1038 */
1039 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1040 emit(MOV(dst_reg(MRF, param_base + 1,
1041 glsl_type::uint_type, WRITEMASK_YZ),
1042 mcs));
1043 } else if (devinfo->gen >= 7) {
1044 /* MCS data is in the first channel of `mcs`, but we need to get it into
1045 * the .y channel of the second vec4 of params, so replicate .x across
1046 * the whole vec4 and then mask off everything except .y
1047 */
1048 mcs.swizzle = BRW_SWIZZLE_XXXX;
1049 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1050 mcs));
1051 }
1052 inst->mlen++;
1053 } else if (op == ir_txd) {
1054 const brw_reg_type type = lod.type;
1055
1056 if (devinfo->gen >= 5) {
1057 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1060 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1061 inst->mlen++;
1062
1063 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1064 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1065 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1067 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1068 inst->mlen++;
1069
1070 if (shadow_comparator.file != BAD_FILE) {
1071 emit(MOV(dst_reg(MRF, param_base + 2,
1072 shadow_comparator.type, WRITEMASK_Z),
1073 shadow_comparator));
1074 }
1075 }
1076 } else /* devinfo->gen == 4 */ {
1077 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1078 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1079 inst->mlen += 2;
1080 }
1081 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1082 if (shadow_comparator.file != BAD_FILE) {
1083 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1084 shadow_comparator));
1085 }
1086
1087 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1088 offset_value));
1089 inst->mlen++;
1090 }
1091 }
1092
1093 emit(inst);
1094
1095 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1096 * spec requires layers.
1097 */
1098 if (op == ir_txs && devinfo->gen < 7) {
1099 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1100 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1101 src_reg(inst->dst), brw_imm_d(1));
1102 }
1103
1104 if (devinfo->gen == 6 && op == ir_tg4) {
1105 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1106 }
1107
1108 if (op == ir_query_levels) {
1109 /* # levels is in .w */
1110 src_reg swizzled(dest);
1111 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1112 SWIZZLE_W, SWIZZLE_W);
1113 emit(MOV(dest, swizzled));
1114 }
1115 }
1116
1117 /**
1118 * Apply workarounds for Gen6 gather with UINT/SINT
1119 */
1120 void
1121 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1122 {
1123 if (!wa)
1124 return;
1125
1126 int width = (wa & WA_8BIT) ? 8 : 16;
1127 dst_reg dst_f = dst;
1128 dst_f.type = BRW_REGISTER_TYPE_F;
1129
1130 /* Convert from UNORM to UINT */
1131 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1132 emit(MOV(dst, src_reg(dst_f)));
1133
1134 if (wa & WA_SIGN) {
1135 /* Reinterpret the UINT value as a signed INT value by
1136 * shifting the sign bit into place, then shifting back
1137 * preserving sign.
1138 */
1139 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1140 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1141 }
1142 }
1143
1144 void
1145 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1146 {
1147 unreachable("not reached");
1148 }
1149
1150 void
1151 vec4_visitor::gs_end_primitive()
1152 {
1153 unreachable("not reached");
1154 }
1155
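/**
 * Compute the normalized device coordinates (x/w, y/w, z/w, 1/w) from the
 * written gl_Position and store them in the BRW_VARYING_SLOT_NDC output for
 * the pre-gen6 pipeline.
 */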
1156 void
1157 vec4_visitor::emit_ndc_computation()
1158 {
1159 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1160 return;
1161
1162 /* Get the position */
1163 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1164
1165 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1166 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1167 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1168 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1169
1170 current_annotation = "NDC";
1171 dst_reg ndc_w = ndc;
1172 ndc_w.writemask = WRITEMASK_W;
1173 src_reg pos_w = pos;
1174 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1175 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1176
1177 dst_reg ndc_xyz = ndc;
1178 ndc_xyz.writemask = WRITEMASK_XYZ;
1179
1180 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1181 }
1182
1183 void
1184 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1185 {
1186 if (devinfo->gen < 6 &&
1187 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1188 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1189 devinfo->has_negative_rhw_bug)) {
1190 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1191 dst_reg header1_w = header1;
1192 header1_w.writemask = WRITEMASK_W;
1193
1194 emit(MOV(header1, brw_imm_ud(0u)));
1195
1196 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1197 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1198
1199 current_annotation = "Point size";
1200 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1201 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1202 }
1203
1204 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1205 current_annotation = "Clipping flags";
1206 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1207 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1208
1209 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1210 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1211 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1212
1213 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1214 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1215 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1216 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1217 }
1218
1219 /* i965 clipping workaround:
1220 * 1) Test for -ve rhw
1221 * 2) If set,
1222 * set ndc = (0,0,0,0)
1223 * set ucp[6] = 1
1224 *
1225 * Later, clipping will detect ucp[6] and ensure the primitive is
1226 * clipped against all fixed planes.
1227 */
1228 if (devinfo->has_negative_rhw_bug &&
1229 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1230 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1231 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1232 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1233 vec4_instruction *inst;
1234 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1237 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1238 inst->predicate = BRW_PREDICATE_NORMAL;
1239 }
1240
1241 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1242 } else if (devinfo->gen < 6) {
1243 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1244 } else {
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1246 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1247 dst_reg reg_w = reg;
1248 reg_w.writemask = WRITEMASK_W;
1249 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1250 reg_as_src.type = reg_w.type;
1251 reg_as_src.swizzle = brw_swizzle_for_size(1);
1252 emit(MOV(reg_w, reg_as_src));
1253 }
1254 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1255 dst_reg reg_y = reg;
1256 reg_y.writemask = WRITEMASK_Y;
1257 reg_y.type = BRW_REGISTER_TYPE_D;
1258 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1259 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1260 }
1261 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1262 dst_reg reg_z = reg;
1263 reg_z.writemask = WRITEMASK_Z;
1264 reg_z.type = BRW_REGISTER_TYPE_D;
1265 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1266 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1267 }
1268 }
1269 }
1270
1271 vec4_instruction *
1272 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1273 {
1274 assert(varying < VARYING_SLOT_MAX);
1275
1276 unsigned num_comps = output_num_components[varying][component];
1277 if (num_comps == 0)
1278 return NULL;
1279
1280 assert(output_reg[varying][component].type == reg.type);
1281 current_annotation = output_reg_annotation[varying];
1282 if (output_reg[varying][component].file != BAD_FILE) {
1283 src_reg src = src_reg(output_reg[varying][component]);
1284 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1285 reg.writemask =
1286 brw_writemask_for_component_packing(num_comps, component);
1287 return emit(MOV(reg, src));
1288 }
1289 return NULL;
1290 }
1291
1292 void
1293 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1294 {
1295 reg.type = BRW_REGISTER_TYPE_F;
1296 output_reg[varying][0].type = reg.type;
1297
1298 switch (varying) {
1299 case VARYING_SLOT_PSIZ:
1300 {
1301 /* PSIZ is always in slot 0, and is coupled with other flags. */
1302 current_annotation = "indices, point width, clip flags";
1303 emit_psiz_and_flags(reg);
1304 break;
1305 }
1306 case BRW_VARYING_SLOT_NDC:
1307 current_annotation = "NDC";
1308 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1309 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1310 break;
1311 case VARYING_SLOT_POS:
1312 current_annotation = "gl_Position";
1313 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1314 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1315 break;
1316 case VARYING_SLOT_EDGE:
1317 /* This is present when doing unfilled polygons. We're supposed to copy
1318 * the edge flag from the user-provided vertex array
1319 * (glEdgeFlagPointer), or else from the current value of that
1320 * attribute (which starts as 1.0f). This is then used in clipping to
1321 * determine which edges should be drawn as wireframe.
1322 */
1323 current_annotation = "edge flag";
1324 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1325 glsl_type::float_type, WRITEMASK_XYZW))));
1326 break;
1327 case BRW_VARYING_SLOT_PAD:
1328 /* No need to write to this slot */
1329 break;
1330 default:
1331 for (int i = 0; i < 4; i++) {
1332 emit_generic_urb_slot(reg, varying, i);
1333 }
1334 break;
1335 }
1336 }
1337
1338 static int
1339 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1340 {
1341 if (devinfo->gen >= 6) {
1342 /* URB data written (does not include the message header reg) must
1343 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1344 * section 5.4.3.2.2: URB_INTERLEAVED.
1345 *
1346 * URB entries are allocated on a multiple of 1024 bits, so an
1347 * extra 128 bits written here to make the end align to 256 is
1348 * no problem.
1349 */
1350 if ((mlen % 2) != 1)
1351 mlen++;
1352 }
1353
1354 return mlen;
1355 }
1356
1357
1358 /**
1359 * Generates the VUE payload plus the necessary URB write instructions to
1360 * output it.
1361 *
1362 * The VUE layout is documented in Volume 2a.
1363 */
1364 void
1365 vec4_visitor::emit_vertex()
1366 {
1367 /* MRF 0 is reserved for the debugger, so start with message header
1368 * in MRF 1.
1369 */
1370 int base_mrf = 1;
1371 int mrf = base_mrf;
1372 /* In the process of generating our URB write message contents, we
1373 * may need to unspill a register or load from an array. Those
1374 * reads would use MRFs 14-15.
1375 */
1376 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1377
1378 * The following assertion verifies that max_usable_mrf results in an
1379 * even number of URB write data registers, which meets gen6's
1380 * requirements for length alignment.
1381 */
1382 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1383
1384 /* First mrf is the g0-based message header containing URB handles and
1385 * such.
1386 */
1387 emit_urb_write_header(mrf++);
1388
1389 if (devinfo->gen < 6) {
1390 emit_ndc_computation();
1391 }
1392
1393 /* We may need to split this up into several URB writes, so do them in a
1394 * loop.
1395 */
1396 int slot = 0;
1397 bool complete = false;
1398 do {
1399 /* URB offset is in URB row increments, and each of our MRFs is half of
1400 * one of those, since we're doing interleaved writes.
1401 */
1402 int offset = slot / 2;
1403
1404 mrf = base_mrf + 1;
1405 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1406 emit_urb_slot(dst_reg(MRF, mrf++),
1407 prog_data->vue_map.slot_to_varying[slot]);
1408
1409 /* If this was max_usable_mrf, we can't fit anything more into this
1410 * URB WRITE. Same thing if we reached the maximum length available.
1411 */
1412 if (mrf > max_usable_mrf ||
1413 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1414 slot++;
1415 break;
1416 }
1417 }
1418
1419 complete = slot >= prog_data->vue_map.num_slots;
1420 current_annotation = "URB write";
1421 vec4_instruction *inst = emit_urb_write_opcode(complete);
1422 inst->base_mrf = base_mrf;
1423 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1424 inst->offset += offset;
1425 } while(!complete);
1426 }
1427
1428
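/**
 * Compute the offset to use in the scratch read/write message header for
 * vec4 offset \p reg_offset, scaling it to the units the message expects and
 * adding the dynamic \p reladdr component when present.
 */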
1429 src_reg
1430 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1431 src_reg *reladdr, int reg_offset)
1432 {
1433 /* Because we store the values to scratch interleaved like our
1434 * vertex data, we need to scale the vec4 index by 2.
1435 */
1436 int message_header_scale = 2;
1437
1438 /* Pre-gen6, the message header uses byte offsets instead of vec4
1439 * (16-byte) offset units.
1440 */
1441 if (devinfo->gen < 6)
1442 message_header_scale *= 16;
1443
1444 if (reladdr) {
1445 src_reg index = src_reg(this, glsl_type::int_type);
1446
1447 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1448 brw_imm_d(reg_offset)));
1449 emit_before(block, inst, MUL(dst_reg(index), index,
1450 brw_imm_d(message_header_scale)));
1451
1452 return index;
1453 } else {
1454 return brw_imm_d(reg_offset * message_header_scale);
1455 }
1456 }
1457
1458 /**
1459 * Emits an instruction before @inst to load the value named by @orig_src
1460 * from scratch space at @base_offset to @temp.
1461 *
1462 * @base_offset is measured in 32-byte units (the size of a register).
1463 */
1464 void
1465 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1466 dst_reg temp, src_reg orig_src,
1467 int base_offset)
1468 {
1469 assert(orig_src.offset % REG_SIZE == 0);
1470 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1471 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1472 reg_offset);
1473
1474 emit_before(block, inst, SCRATCH_READ(temp, index));
1475 }
1476
1477 /**
1478 * Emits an instruction after @inst to store the value to be written
1479 * to @orig_dst to scratch space at @base_offset, from @temp.
1480 *
1481 * @base_offset is measured in 32-byte units (the size of a register).
1482 */
1483 void
1484 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1485 int base_offset)
1486 {
1487 assert(inst->dst.offset % REG_SIZE == 0);
1488 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1489 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1490 reg_offset);
1491
1492 /* Create a temporary register to store *inst's result in.
1493 *
1494 * We have to be careful in MOVing from our temporary result register in
1495 * the scratch write. If we swizzle from channels of the temporary that
1496 * weren't initialized, it will confuse live interval analysis, which will
1497 * make spilling fail to make progress.
1498 */
1499 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1500 inst->dst.type),
1501 brw_swizzle_for_mask(inst->dst.writemask));
1502 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1503 inst->dst.writemask));
1504 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1505 if (inst->opcode != BRW_OPCODE_SEL)
1506 write->predicate = inst->predicate;
1507 write->ir = inst->ir;
1508 write->annotation = inst->annotation;
1509 inst->insert_after(block, write);
1510
1511 inst->dst.file = temp.file;
1512 inst->dst.nr = temp.nr;
1513 inst->dst.offset %= REG_SIZE;
1514 inst->dst.reladdr = NULL;
1515 }
1516
1517 /**
1518 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1519 * adds the scratch read(s) before \p inst. The function also checks for
1520 * recursive reladdr scratch accesses, issuing the corresponding scratch
1521 * loads and rewriting reladdr references accordingly.
1522 *
1523 * \return \p src if it did not require a scratch load, otherwise, the
1524 * register holding the result of the scratch load that the caller should
1525 * use to rewrite src.
1526 */
1527 src_reg
1528 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1529 vec4_instruction *inst, src_reg src)
1530 {
1531 /* Resolve recursive reladdr scratch access by calling ourselves
1532 * with src.reladdr
1533 */
1534 if (src.reladdr)
1535 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1536 *src.reladdr);
1537
1538 /* Now handle scratch access on src */
1539 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1540 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1541 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1542 src.nr = temp.nr;
1543 src.offset %= REG_SIZE;
1544 src.reladdr = NULL;
1545 }
1546
1547 return src;
1548 }
1549
1550 /**
1551 * We can't generally support array access in GRF space, because a
1552 * single instruction's destination can only span 2 contiguous
1553 * registers. So, we send all GRF arrays that get variable index
1554 * access to scratch space.
1555 */
1556 void
1557 vec4_visitor::move_grf_array_access_to_scratch()
1558 {
1559 int scratch_loc[this->alloc.count];
1560 memset(scratch_loc, -1, sizeof(scratch_loc));
1561
1562 /* First, calculate the set of virtual GRFs that need to be punted
1563 * to scratch due to having any array access on them, and where in
1564 * scratch.
1565 */
1566 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1567 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1568 if (scratch_loc[inst->dst.nr] == -1) {
1569 scratch_loc[inst->dst.nr] = last_scratch;
1570 last_scratch += this->alloc.sizes[inst->dst.nr];
1571 }
1572
1573 for (src_reg *iter = inst->dst.reladdr;
1574 iter->reladdr;
1575 iter = iter->reladdr) {
1576 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1577 scratch_loc[iter->nr] = last_scratch;
1578 last_scratch += this->alloc.sizes[iter->nr];
1579 }
1580 }
1581 }
1582
1583 for (int i = 0 ; i < 3; i++) {
1584 for (src_reg *iter = &inst->src[i];
1585 iter->reladdr;
1586 iter = iter->reladdr) {
1587 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1588 scratch_loc[iter->nr] = last_scratch;
1589 last_scratch += this->alloc.sizes[iter->nr];
1590 }
1591 }
1592 }
1593 }
1594
1595 /* Now, for anything that will be accessed through scratch, rewrite
1596 * it to load/store. Note that this is a _safe list walk, because
1597 * we may generate a new scratch_write instruction after the one
1598 * we're processing.
1599 */
1600 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1601 /* Set up the annotation tracking for new generated instructions. */
1602 base_ir = inst->ir;
1603 current_annotation = inst->annotation;
1604
1605 /* First handle scratch access on the dst. Notice we have to handle
1606 * the case where the dst's reladdr also points to scratch space.
1607 */
1608 if (inst->dst.reladdr)
1609 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1610 *inst->dst.reladdr);
1611
1612 /* Now that we have handled any (possibly recursive) reladdr scratch
1613 * accesses for dst we can safely do the scratch write for dst itself
1614 */
1615 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1616 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1617
1618 /* Now handle scratch access on any src. In this case, since inst->src[i]
1619 * already is a src_reg, we can just call emit_resolve_reladdr with
1620 * inst->src[i] and it will take care of handling scratch loads for
1621 * both src and src.reladdr (recursively).
1622 */
1623 for (int i = 0 ; i < 3; i++) {
1624 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1625 inst->src[i]);
1626 }
1627 }
1628 }
1629
1630 /**
1631 * Emits an instruction before @inst to load the value named by @orig_src
1632 * from the pull constant buffer (surface) at @base_offset to @temp.
1633 */
1634 void
1635 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1636 dst_reg temp, src_reg orig_src,
1637 int base_offset, src_reg indirect)
1638 {
1639 assert(orig_src.offset % 16 == 0);
1640 int reg_offset = base_offset + orig_src.offset / 16;
1641 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1642
1643 src_reg offset;
1644 if (indirect.file != BAD_FILE) {
1645 offset = src_reg(this, glsl_type::uint_type);
1646
1647 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1648 brw_imm_ud(reg_offset * 16)));
1649 } else if (devinfo->gen >= 8) {
1650 /* Store the offset in a GRF so we can send-from-GRF. */
1651 offset = src_reg(this, glsl_type::uint_type);
1652 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1653 } else {
1654 offset = brw_imm_d(reg_offset * 16);
1655 }
1656
1657 emit_pull_constant_load_reg(temp,
1658 brw_imm_ud(index),
1659 offset,
1660 block, inst);
1661
1662 brw_mark_surface_used(&prog_data->base, index);
1663 }
1664
1665 /**
1666 * Implements array access of uniforms by inserting a
1667 * PULL_CONSTANT_LOAD instruction.
1668 *
1669 * Unlike temporary GRF array access (which we don't support due to
1670 * the difficulty of doing relative addressing on instruction
1671 * destinations), we could potentially do array access of uniforms
1672 * that were loaded in GRF space as push constants. In real-world
1673 * usage we've seen, though, the arrays being used are always larger
1674 * than we could load as push constants, so just always move all
1675 * uniform array access out to a pull constant buffer.
1676 */
1677 void
1678 vec4_visitor::move_uniform_array_access_to_pull_constants()
1679 {
1680 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1681 * everything has to be pushed regardless.
1682 */
1683 if (stage_prog_data->pull_param == NULL) {
1684 split_uniform_registers();
1685 return;
1686 }
1687
1688 int pull_constant_loc[this->uniforms];
1689 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1690
1691 /* First, walk through the instructions and determine which things need to
1692 * be pulled. We mark something as needing to be pulled by setting
1693 * pull_constant_loc to 0.
1694 */
1695 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1696 /* We only care about MOV_INDIRECT of a uniform */
1697 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1698 inst->src[0].file != UNIFORM)
1699 continue;
1700
1701 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1702
1703 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1704 pull_constant_loc[uniform_nr + j] = 0;
1705 }
1706
1707 /* Next, we walk the list of uniforms and assign real pull constant
1708 * locations and set their corresponding entries in pull_param.
1709 */
1710 for (int j = 0; j < this->uniforms; j++) {
1711 if (pull_constant_loc[j] < 0)
1712 continue;
1713
1714 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1715
1716 for (int i = 0; i < 4; i++) {
1717 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1718 = stage_prog_data->param[j * 4 + i];
1719 }
1720 }
1721
1722 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1723 * instructions to actual uniform pulls.
1724 */
1725 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1726 /* We only care about MOV_INDIRECT of a uniform */
1727 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1728 inst->src[0].file != UNIFORM)
1729 continue;
1730
1731 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1732
1733 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1734
1735 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1736 pull_constant_loc[uniform_nr], inst->src[1]);
1737 inst->remove(block);
1738 }
1739
1740 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1741 * no need to track them as larger-than-vec4 objects. This will be
1742 * relied on in cutting out unused uniform vectors from push
1743 * constants.
1744 */
1745 split_uniform_registers();
1746 }
1747
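/* If \p reg is an unsigned (UD) source with the negate modifier set,
 * materialize the negated value into a temporary with an explicit MOV and
 * rewrite \p reg to point at it.
 */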
1748 void
1749 vec4_visitor::resolve_ud_negate(src_reg *reg)
1750 {
1751 if (reg->type != BRW_REGISTER_TYPE_UD ||
1752 !reg->negate)
1753 return;
1754
1755 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1756 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1757 *reg = temp;
1758 }
1759
1760 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1761 void *log_data,
1762 const struct brw_sampler_prog_key_data *key_tex,
1763 struct brw_vue_prog_data *prog_data,
1764 const nir_shader *shader,
1765 void *mem_ctx,
1766 bool no_spills,
1767 int shader_time_index)
1768 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1769 key_tex(key_tex),
1770 prog_data(prog_data),
1771 fail_msg(NULL),
1772 first_non_payload_grf(0),
1773 need_all_constants_in_pull_buffer(false),
1774 no_spills(no_spills),
1775 shader_time_index(shader_time_index),
1776 last_scratch(0)
1777 {
1778 this->failed = false;
1779
1780 this->base_ir = NULL;
1781 this->current_annotation = NULL;
1782 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1783
1784 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1785
1786 this->virtual_grf_start = NULL;
1787 this->virtual_grf_end = NULL;
1788 this->live_intervals = NULL;
1789
1790 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1791
1792 this->uniforms = 0;
1793 }
1794
1795 vec4_visitor::~vec4_visitor()
1796 {
1797 }
1798
1799
1800 void
1801 vec4_visitor::fail(const char *format, ...)
1802 {
1803 va_list va;
1804 char *msg;
1805
1806 if (failed)
1807 return;
1808
1809 failed = true;
1810
1811 va_start(va, format);
1812 msg = ralloc_vasprintf(mem_ctx, format, va);
1813 va_end(va);
1814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1815
1816 this->fail_msg = msg;
1817
1818 if (debug_enabled) {
1819 fprintf(stderr, "%s", msg);
1820 }
1821 }
1822
1823 } /* namespace brw */