[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
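/* A minimal usage sketch (kept in #if 0, like the verification snippet in
 * emit_pack_half_2x16() below): CMP is typically paired with a predicated
 * SEL, which is exactly what emit_minmax() does for pre-Gen6 hardware. The
 * register names here are placeholders.
 */
#if 0
   emit(CMP(dst_null_d(), src0, src1, BRW_CONDITIONAL_GE));
   vec4_instruction *sel = emit(BRW_OPCODE_SEL, dst, src0, src1);
   sel->predicate = BRW_PREDICATE_NORMAL;
#endif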
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
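/* For example, emit_ndc_computation() below computes 1/w with
 *
 *    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
 *
 * and emit_texture() routes SHADER_OPCODE_INT_QUOTIENT through this helper
 * to divide the cube-array depth by 6.
 */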
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
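/* A CPU-side reference for the packing above -- a sketch only, assuming
 * util/half_float.h's _mesa_float_to_half() is visible here. Per the GLSL
 * rules the first component lands in the low 16 bits, so (1.0, -2.0) packs
 * to 0xc0003c00.
 */
#if 0
static uint32_t
pack_half_2x16_ref(float x, float y)
{
   return (uint32_t)_mesa_float_to_half(x) |
          ((uint32_t)_mesa_float_to_half(y) << 16);
}
#endif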
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
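   /* Decoding the restricted 8-bit vector-float format (1 sign bit, 3
    * exponent bits biased by 3, 4 mantissa bits, with 0x00 meaning 0.0):
    * 0x00 -> 0.0, 0x60 -> 8.0, 0x70 -> 16.0, 0x78 -> 24.0, so the
    * type-converting MOV above produces the shift vector <0, 8, 16, 24>.
    */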
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
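/* A worked example of the sequence above: per the GLSL packSnorm4x8() rules,
 * vec4(1.0, -1.0, 0.0, 0.5) clamps to [-1, 1], scales by 127, rounds to even
 * (63.5 -> 64) and packs the bytes 0x7f, 0x81, 0x00, 0x40, first component
 * in the least significant byte.
 */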
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573 * number of columns; for array and struct, the sum of the vec4_size of
574 * each of its elements; and for sampler and atomic, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
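/* For example: float, vec3 and bvec2 each take one vec4 slot; mat3 takes 3
 * (one per column); float[4] takes 4; and struct { vec3 a; float b[2]; }
 * takes 1 + 2 = 3.
 */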
631
632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
633 {
634 init();
635
636 this->file = VGRF;
637 this->nr = v->alloc.allocate(type_size_vec4(type));
638
639 if (type->is_array() || type->is_record()) {
640 this->swizzle = BRW_SWIZZLE_NOOP;
641 } else {
642 this->swizzle = brw_swizzle_for_size(type->vector_elements);
643 }
644
645 this->type = brw_type_for_base_type(type);
646 }
647
648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
649 {
650 assert(size > 0);
651
652 init();
653
654 this->file = VGRF;
655 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
656
657 this->swizzle = BRW_SWIZZLE_NOOP;
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
663 {
664 init();
665
666 this->file = VGRF;
667 this->nr = v->alloc.allocate(type_size_vec4(type));
668
669 if (type->is_array() || type->is_record()) {
670 this->writemask = WRITEMASK_XYZW;
671 } else {
672 this->writemask = (1 << type->vector_elements) - 1;
673 }
674
675 this->type = brw_type_for_base_type(type);
676 }
677
678 vec4_instruction *
679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
680 src_reg src0, src_reg src1)
681 {
682 vec4_instruction *inst;
683
684 if (devinfo->gen >= 6) {
685 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
686 inst->conditional_mod = conditionalmod;
687 } else {
688 emit(CMP(dst, src0, src1, conditionalmod));
689
690 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
691 inst->predicate = BRW_PREDICATE_NORMAL;
692 }
693
694 return inst;
695 }
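/* The conditional mod selects the operation: BRW_CONDITIONAL_GE yields
 * max(src0, src1) and BRW_CONDITIONAL_L yields min(src0, src1), which is how
 * emit_unpack_snorm_4x8() and emit_pack_snorm_4x8() above clamp to
 * [-1.0, 1.0].
 */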
696
697 vec4_instruction *
698 vec4_visitor::emit_lrp(const dst_reg &dst,
699 const src_reg &x, const src_reg &y, const src_reg &a)
700 {
701 if (devinfo->gen >= 6) {
702 /* Note that the instruction's argument order is reversed from GLSL
703 * and the IR.
704 */
705 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
706 fix_3src_operand(x)));
707 } else {
708 /* Earlier generations don't support three source operations, so we
709 * need to emit x*(1-a) + y*a.
710 */
711 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
712 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
713 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
714 y_times_a.writemask = dst.writemask;
715 one_minus_a.writemask = dst.writemask;
716 x_times_one_minus_a.writemask = dst.writemask;
717
718 emit(MUL(y_times_a, y, a));
719 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
720 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
721 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
722 }
723 }
724
725 /**
726 * Emits the instructions needed to perform a pull constant load. before_block
727 * and before_inst can be NULL in which case the instruction will be appended
728 * to the end of the instruction list.
729 */
730 void
731 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
732 src_reg surf_index,
733 src_reg offset_reg,
734 bblock_t *before_block,
735 vec4_instruction *before_inst)
736 {
737 assert((before_inst == NULL && before_block == NULL) ||
738 (before_inst && before_block));
739
740 vec4_instruction *pull;
741
742 if (devinfo->gen >= 9) {
743 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
744 src_reg header(this, glsl_type::uvec4_type, 2);
745
746 pull = new(mem_ctx)
747 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
748 dst_reg(header));
749
750 if (before_inst)
751 emit_before(before_block, before_inst, pull);
752 else
753 emit(pull);
754
755 dst_reg index_reg = retype(offset(dst_reg(header), 1),
756 offset_reg.type);
757 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
758
759 if (before_inst)
760 emit_before(before_block, before_inst, pull);
761 else
762 emit(pull);
763
764 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
765 dst,
766 surf_index,
767 header);
768 pull->mlen = 2;
769 pull->header_size = 1;
770 } else if (devinfo->gen >= 7) {
771 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
772
773 grf_offset.type = offset_reg.type;
774
775 pull = MOV(grf_offset, offset_reg);
776
777 if (before_inst)
778 emit_before(before_block, before_inst, pull);
779 else
780 emit(pull);
781
782 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
783 dst,
784 surf_index,
785 src_reg(grf_offset));
786 pull->mlen = 1;
787 } else {
788 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
789 dst,
790 surf_index,
791 offset_reg);
792 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
793 pull->mlen = 1;
794 }
795
796 if (before_inst)
797 emit_before(before_block, before_inst, pull);
798 else
799 emit(pull);
800 }
801
802 src_reg
803 vec4_visitor::emit_uniformize(const src_reg &src)
804 {
805 const src_reg chan_index(this, glsl_type::uint_type);
806 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
807 src.type);
808
809 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
810 ->force_writemask_all = true;
811 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
812 ->force_writemask_all = true;
813
814 return src_reg(dst);
815 }
816
817 src_reg
818 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
819 src_reg coordinate, src_reg surface)
820 {
821 vec4_instruction *inst =
822 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
823 dst_reg(this, glsl_type::uvec4_type));
824 inst->base_mrf = 2;
825 inst->src[1] = surface;
826 inst->src[2] = surface;
827
828 int param_base;
829
830 if (devinfo->gen >= 9) {
831 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
832 vec4_instruction *header_inst = new(mem_ctx)
833 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
834 dst_reg(MRF, inst->base_mrf));
835
836 emit(header_inst);
837
838 inst->mlen = 2;
839 inst->header_size = 1;
840 param_base = inst->base_mrf + 1;
841 } else {
842 inst->mlen = 1;
843 param_base = inst->base_mrf;
844 }
845
846 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
847 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
848 int zero_mask = 0xf & ~coord_mask;
849
850 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
851 coordinate));
852
853 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
854 brw_imm_d(0)));
855
856 emit(inst);
857 return src_reg(inst->dst);
858 }
859
860 bool
861 vec4_visitor::is_high_sampler(src_reg sampler)
862 {
863 if (devinfo->gen < 8 && !devinfo->is_haswell)
864 return false;
865
866 return sampler.file != IMM || sampler.ud >= 16;
867 }
868
869 void
870 vec4_visitor::emit_texture(ir_texture_opcode op,
871 dst_reg dest,
872 const glsl_type *dest_type,
873 src_reg coordinate,
874 int coord_components,
875 src_reg shadow_comparitor,
876 src_reg lod, src_reg lod2,
877 src_reg sample_index,
878 uint32_t constant_offset,
879 src_reg offset_value,
880 src_reg mcs,
881 bool is_cube_array,
882 uint32_t surface,
883 src_reg surface_reg,
884 uint32_t sampler,
885 src_reg sampler_reg)
886 {
887 /* The sampler can only meaningfully compute LOD for fragment shader
888 * messages. For all other stages, we change the opcode to TXL and hardcode
889 * the LOD to 0.
890 *
891 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
892 * valid LOD argument.
893 */
894 if (op == ir_tex || op == ir_query_levels) {
895 assert(lod.file == BAD_FILE);
896 lod = brw_imm_f(0.0f);
897 }
898
899 enum opcode opcode;
900 switch (op) {
901 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
902 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
903 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
904 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
905 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
906 SHADER_OPCODE_TXF_CMS); break;
907 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
908 case ir_tg4: opcode = offset_value.file != BAD_FILE
909 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
910 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
911 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
912 case ir_txb:
913 unreachable("TXB is not valid for vertex shaders.");
914 case ir_lod:
915 unreachable("LOD is not valid for vertex shaders.");
916 case ir_samples_identical: {
917 /* There are some challenges implementing this for vec4, and it seems
918 * unlikely to be used anyway. For now, just always return false.
919 */
920 emit(MOV(dest, brw_imm_ud(0u)));
921 return;
922 }
923 default:
924 unreachable("Unrecognized tex op");
925 }
926
927 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
928
929 inst->offset = constant_offset;
930
931 /* The message header is necessary for:
932 * - Gen4 (always)
933 * - Gen9+ for selecting SIMD4x2
934 * - Texel offsets
935 * - Gather channel selection
936 * - Sampler indices too large to fit in a 4-bit value.
937 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
938 */
939 inst->header_size =
940 (devinfo->gen < 5 || devinfo->gen >= 9 ||
941 inst->offset != 0 || op == ir_tg4 ||
942 op == ir_texture_samples ||
943 is_high_sampler(sampler_reg)) ? 1 : 0;
944 inst->base_mrf = 2;
945 inst->mlen = inst->header_size;
946 inst->dst.writemask = WRITEMASK_XYZW;
947 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
948
949 inst->src[1] = surface_reg;
950 inst->src[2] = sampler_reg;
951
952 /* MRF for the first parameter */
953 int param_base = inst->base_mrf + inst->header_size;
954
955 if (op == ir_txs || op == ir_query_levels) {
956 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
957 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
958 inst->mlen++;
959 } else if (op == ir_texture_samples) {
960 inst->dst.writemask = WRITEMASK_X;
961 } else {
962 /* Load the coordinate */
963 /* FINISHME: gl_clamp_mask and saturate */
964 int coord_mask = (1 << coord_components) - 1;
965 int zero_mask = 0xf & ~coord_mask;
966
967 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
968 coordinate));
969 inst->mlen++;
970
971 if (zero_mask != 0) {
972 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
973 brw_imm_d(0)));
974 }
975 /* Load the shadow comparitor */
976 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
977 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
978 WRITEMASK_X),
979 shadow_comparitor));
980 inst->mlen++;
981 }
982
983 /* Load the LOD info */
984 if (op == ir_tex || op == ir_txl) {
985 int mrf, writemask;
986 if (devinfo->gen >= 5) {
987 mrf = param_base + 1;
988 if (shadow_comparitor.file != BAD_FILE) {
989 writemask = WRITEMASK_Y;
990 /* mlen already incremented */
991 } else {
992 writemask = WRITEMASK_X;
993 inst->mlen++;
994 }
995 } else /* devinfo->gen == 4 */ {
996 mrf = param_base;
997 writemask = WRITEMASK_W;
998 }
999 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1000 } else if (op == ir_txf) {
1001 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1002 } else if (op == ir_txf_ms) {
1003 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1004 sample_index));
1005 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1006 * MCS data is stored in the first two channels of `mcs`, but we
1007 * need to get it into the .y and .z channels of the second vec4
1008 * of params.
1009 */
1010 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1011 emit(MOV(dst_reg(MRF, param_base + 1,
1012 glsl_type::uint_type, WRITEMASK_YZ),
1013 mcs));
1014 } else if (devinfo->gen >= 7) {
1015 /* MCS data is in the first channel of `mcs`, but we need to get it into
1016 * the .y channel of the second vec4 of params, so replicate .x across
1017 * the whole vec4 and then mask off everything except .y
1018 */
1019 mcs.swizzle = BRW_SWIZZLE_XXXX;
1020 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1021 mcs));
1022 }
1023 inst->mlen++;
1024 } else if (op == ir_txd) {
1025 const brw_reg_type type = lod.type;
1026
1027 if (devinfo->gen >= 5) {
1028 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1029 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1030 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1031 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1032 inst->mlen++;
1033
1034 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1035 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1036 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1037 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1038 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1039 inst->mlen++;
1040
1041 if (shadow_comparitor.file != BAD_FILE) {
1042 emit(MOV(dst_reg(MRF, param_base + 2,
1043 shadow_comparitor.type, WRITEMASK_Z),
1044 shadow_comparitor));
1045 }
1046 }
1047 } else /* devinfo->gen == 4 */ {
1048 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1049 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1050 inst->mlen += 2;
1051 }
1052 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1053 if (shadow_comparitor.file != BAD_FILE) {
1054 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1055 shadow_comparitor));
1056 }
1057
1058 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1059 offset_value));
1060 inst->mlen++;
1061 }
1062 }
1063
1064 emit(inst);
1065
1066 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1067 * spec requires layers.
1068 */
1069 if (op == ir_txs && is_cube_array) {
1070 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1071 writemask(inst->dst, WRITEMASK_Z),
1072 src_reg(inst->dst), brw_imm_d(6));
1073 }
1074
1075 if (devinfo->gen == 6 && op == ir_tg4) {
1076 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1077 }
1078
1079 if (op == ir_query_levels) {
1080 /* # levels is in .w */
1081 src_reg swizzled(dest);
1082 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1083 SWIZZLE_W, SWIZZLE_W);
1084 emit(MOV(dest, swizzled));
1085 }
1086 }
1087
1088 /**
1089 * Apply workarounds for Gen6 gather with UINT/SINT
1090 */
1091 void
1092 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1093 {
1094 if (!wa)
1095 return;
1096
1097 int width = (wa & WA_8BIT) ? 8 : 16;
1098 dst_reg dst_f = dst;
1099 dst_f.type = BRW_REGISTER_TYPE_F;
1100
1101 /* Convert from UNORM to UINT */
1102 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1103 emit(MOV(dst, src_reg(dst_f)));
1104
1105 if (wa & WA_SIGN) {
1106 /* Reinterpret the UINT value as a signed INT value by
1107 * shifting the sign bit into place, then shifting back
1108 * preserving sign.
1109 */
1110 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1111 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1112 }
1113 }
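/* For example, with WA_8BIT | WA_SIGN the UNORM return value is rescaled by
 * 255 to recover the 8-bit integer, then shifted left and arithmetically
 * right by 24 bits to sign-extend it into a full 32-bit signed result.
 */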
1114
1115 void
1116 vec4_visitor::gs_emit_vertex(int stream_id)
1117 {
1118 unreachable("not reached");
1119 }
1120
1121 void
1122 vec4_visitor::gs_end_primitive()
1123 {
1124 unreachable("not reached");
1125 }
1126
1127 void
1128 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1129 dst_reg dst, src_reg surf_offset,
1130 src_reg src0, src_reg src1)
1131 {
1132 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1133 src_reg src_payload(this, glsl_type::uint_type, mlen);
1134 dst_reg payload(src_payload);
1135 payload.writemask = WRITEMASK_X;
1136
1137 /* Set the atomic operation offset. */
1138 emit(MOV(offset(payload, 0), surf_offset));
1139 unsigned i = 1;
1140
1141 /* Set the atomic operation arguments. */
1142 if (src0.file != BAD_FILE) {
1143 emit(MOV(offset(payload, i), src0));
1144 i++;
1145 }
1146
1147 if (src1.file != BAD_FILE) {
1148 emit(MOV(offset(payload, i), src1));
1149 i++;
1150 }
1151
1152 /* Emit the instruction. Note that this maps to the normal SIMD8
1153 * untyped atomic message on Ivy Bridge, but that's OK because
1154 * unused channels will be masked out.
1155 */
1156 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1157 src_payload,
1158 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1159 inst->mlen = mlen;
1160 }
1161
1162 void
1163 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1164 src_reg surf_offset)
1165 {
1166 dst_reg offset(this, glsl_type::uint_type);
1167 offset.writemask = WRITEMASK_X;
1168
1169 /* Set the surface read offset. */
1170 emit(MOV(offset, surf_offset));
1171
1172 /* Emit the instruction. Note that this maps to the normal SIMD8
1173 * untyped surface read message, but that's OK because unused
1174 * channels will be masked out.
1175 */
1176 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1177 src_reg(offset),
1178 brw_imm_ud(surf_index), brw_imm_d(1));
1179 inst->mlen = 1;
1180 }
1181
1182 void
1183 vec4_visitor::emit_ndc_computation()
1184 {
1185 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1186 return;
1187
1188 /* Get the position */
1189 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1190
1191 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1192 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1193 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1194
1195 current_annotation = "NDC";
1196 dst_reg ndc_w = ndc;
1197 ndc_w.writemask = WRITEMASK_W;
1198 src_reg pos_w = pos;
1199 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1200 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1201
1202 dst_reg ndc_xyz = ndc;
1203 ndc_xyz.writemask = WRITEMASK_XYZ;
1204
1205 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1206 }
1207
1208 void
1209 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1210 {
1211 if (devinfo->gen < 6 &&
1212 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1213 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1214 devinfo->has_negative_rhw_bug)) {
1215 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1216 dst_reg header1_w = header1;
1217 header1_w.writemask = WRITEMASK_W;
1218
1219 emit(MOV(header1, brw_imm_ud(0u)));
1220
1221 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1222 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1223
1224 current_annotation = "Point size";
1225 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1226 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1227 }
1228
1229 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1230 current_annotation = "Clipping flags";
1231 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1232 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1233
1234 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1235 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1236 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1237
1238 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1239 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1240 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1241 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1242 }
1243
1244 /* i965 clipping workaround:
1245 * 1) Test for -ve rhw
1246 * 2) If set,
1247 * set ndc = (0,0,0,0)
1248 * set ucp[6] = 1
1249 *
1250 * Later, clipping will detect ucp[6] and ensure the primitive is
1251 * clipped against all fixed planes.
1252 */
1253 if (devinfo->has_negative_rhw_bug &&
1254 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1255 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1256 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1257 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1258 vec4_instruction *inst;
1259 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1260 inst->predicate = BRW_PREDICATE_NORMAL;
1261 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1262 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 }
1265
1266 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1267 } else if (devinfo->gen < 6) {
1268 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1269 } else {
1270 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1271 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1272 dst_reg reg_w = reg;
1273 reg_w.writemask = WRITEMASK_W;
1274 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1275 reg_as_src.type = reg_w.type;
1276 reg_as_src.swizzle = brw_swizzle_for_size(1);
1277 emit(MOV(reg_w, reg_as_src));
1278 }
1279 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1280 dst_reg reg_y = reg;
1281 reg_y.writemask = WRITEMASK_Y;
1282 reg_y.type = BRW_REGISTER_TYPE_D;
1283 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1284 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1285 }
1286 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1287 dst_reg reg_z = reg;
1288 reg_z.writemask = WRITEMASK_Z;
1289 reg_z.type = BRW_REGISTER_TYPE_D;
1290 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1291 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1292 }
1293 }
1294 }
1295
1296 vec4_instruction *
1297 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1298 {
1299 assert(varying < VARYING_SLOT_MAX);
1300 assert(output_reg[varying].type == reg.type);
1301 current_annotation = output_reg_annotation[varying];
1302 if (output_reg[varying].file != BAD_FILE)
1303 return emit(MOV(reg, src_reg(output_reg[varying])));
1304 else
1305 return NULL;
1306 }
1307
1308 void
1309 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1310 {
1311 reg.type = BRW_REGISTER_TYPE_F;
1312 output_reg[varying].type = reg.type;
1313
1314 switch (varying) {
1315 case VARYING_SLOT_PSIZ:
1316 {
1317 /* PSIZ is always in slot 0, and is coupled with other flags. */
1318 current_annotation = "indices, point width, clip flags";
1319 emit_psiz_and_flags(reg);
1320 break;
1321 }
1322 case BRW_VARYING_SLOT_NDC:
1323 current_annotation = "NDC";
1324 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1325 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1326 break;
1327 case VARYING_SLOT_POS:
1328 current_annotation = "gl_Position";
1329 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1330 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1331 break;
1332 case VARYING_SLOT_EDGE:
1333 /* This is present when doing unfilled polygons. We're supposed to copy
1334 * the edge flag from the user-provided vertex array
1335 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1336 * of that attribute (starts as 1.0f). This is then used in clipping to
1337 * determine which edges should be drawn as wireframe.
1338 */
1339 current_annotation = "edge flag";
1340 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1341 glsl_type::float_type, WRITEMASK_XYZW))));
1342 break;
1343 case BRW_VARYING_SLOT_PAD:
1344 /* No need to write to this slot */
1345 break;
1346 default:
1347 emit_generic_urb_slot(reg, varying);
1348 break;
1349 }
1350 }
1351
1352 static int
1353 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1354 {
1355 if (devinfo->gen >= 6) {
1356 /* URB data written (does not include the message header reg) must
1357 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1358 * section 5.4.3.2.2: URB_INTERLEAVED.
1359 *
1360 * URB entries are allocated on a multiple of 1024 bits, so an
1361 * extra 128 bits written here to make the end align to 256 is
1362 * no problem.
1363 */
1364 if ((mlen % 2) != 1)
1365 mlen++;
1366 }
1367
1368 return mlen;
1369 }
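/* Worked example: mlen includes the message header register, so the URB data
 * length is mlen - 1 and must stay even on Gen6+. A write of a header plus
 * three slot registers (mlen == 4) is therefore padded to mlen == 5.
 */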
1370
1371
1372 /**
1373 * Generates the VUE payload plus the necessary URB write instructions to
1374 * output it.
1375 *
1376 * The VUE layout is documented in Volume 2a.
1377 */
1378 void
1379 vec4_visitor::emit_vertex()
1380 {
1381 /* MRF 0 is reserved for the debugger, so start with message header
1382 * in MRF 1.
1383 */
1384 int base_mrf = 1;
1385 int mrf = base_mrf;
1386 /* In the process of generating our URB write message contents, we
1387 * may need to unspill a register or load from an array. Those
1388 * reads would use MRFs 14-15.
1389 */
1390 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1391
1392 /* The following assertion verifies that max_usable_mrf causes an
1393 * even-numbered amount of URB write data, which will meet gen6's
1394 * requirements for length alignment.
1395 */
1396 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1397
1398 /* First mrf is the g0-based message header containing URB handles and
1399 * such.
1400 */
1401 emit_urb_write_header(mrf++);
1402
1403 if (devinfo->gen < 6) {
1404 emit_ndc_computation();
1405 }
1406
1407 /* We may need to split this up into several URB writes, so do them in a
1408 * loop.
1409 */
1410 int slot = 0;
1411 bool complete = false;
1412 do {
1413 /* URB offset is in URB row increments, and each of our MRFs is half of
1414 * one of those, since we're doing interleaved writes.
1415 */
1416 int offset = slot / 2;
1417
1418 mrf = base_mrf + 1;
1419 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1420 emit_urb_slot(dst_reg(MRF, mrf++),
1421 prog_data->vue_map.slot_to_varying[slot]);
1422
1423 /* If this was max_usable_mrf, we can't fit anything more into this
1424 * URB WRITE. Same thing if we reached the maximum length available.
1425 */
1426 if (mrf > max_usable_mrf ||
1427 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1428 slot++;
1429 break;
1430 }
1431 }
1432
1433 complete = slot >= prog_data->vue_map.num_slots;
1434 current_annotation = "URB write";
1435 vec4_instruction *inst = emit_urb_write_opcode(complete);
1436 inst->base_mrf = base_mrf;
1437 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1438 inst->offset += offset;
1439 } while(!complete);
1440 }
1441
1442
1443 src_reg
1444 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1445 src_reg *reladdr, int reg_offset)
1446 {
1447 /* Because we store the values to scratch interleaved like our
1448 * vertex data, we need to scale the vec4 index by 2.
1449 */
1450 int message_header_scale = 2;
1451
1452 /* Pre-gen6, the message header uses byte offsets instead of vec4
1453 * (16-byte) offset units.
1454 */
1455 if (devinfo->gen < 6)
1456 message_header_scale *= 16;
1457
1458 if (reladdr) {
1459 src_reg index = src_reg(this, glsl_type::int_type);
1460
1461 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1462 brw_imm_d(reg_offset)));
1463 emit_before(block, inst, MUL(dst_reg(index), index,
1464 brw_imm_d(message_header_scale)));
1465
1466 return index;
1467 } else {
1468 return brw_imm_d(reg_offset * message_header_scale);
1469 }
1470 }
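/* Worked example: with no reladdr and reg_offset == 3 this returns
 * brw_imm_d(6) on Gen6+ (interleaved vec4 rows, hence the scale of 2) and
 * brw_imm_d(96) on Gen4/5, where the header takes byte offsets
 * (3 * 2 * 16 bytes).
 */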
1471
1472 /**
1473 * Emits an instruction before @inst to load the value named by @orig_src
1474 * from scratch space at @base_offset to @temp.
1475 *
1476 * @base_offset is measured in 32-byte units (the size of a register).
1477 */
1478 void
1479 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1480 dst_reg temp, src_reg orig_src,
1481 int base_offset)
1482 {
1483 int reg_offset = base_offset + orig_src.reg_offset;
1484 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1485 reg_offset);
1486
1487 emit_before(block, inst, SCRATCH_READ(temp, index));
1488 }
1489
1490 /**
1491 * Emits an instruction after @inst to store the value to be written
1492 * to @orig_dst to scratch space at @base_offset, from @temp.
1493 *
1494 * @base_offset is measured in 32-byte units (the size of a register).
1495 */
1496 void
1497 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1498 int base_offset)
1499 {
1500 int reg_offset = base_offset + inst->dst.reg_offset;
1501 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1502 reg_offset);
1503
1504 /* Create a temporary register to store *inst's result in.
1505 *
1506 * We have to be careful in MOVing from our temporary result register in
1507 * the scratch write. If we swizzle from channels of the temporary that
1508 * weren't initialized, it will confuse live interval analysis, which will
1509 * make spilling fail to make progress.
1510 */
1511 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1512 inst->dst.type),
1513 brw_swizzle_for_mask(inst->dst.writemask));
1514 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1515 inst->dst.writemask));
1516 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1517 if (inst->opcode != BRW_OPCODE_SEL)
1518 write->predicate = inst->predicate;
1519 write->ir = inst->ir;
1520 write->annotation = inst->annotation;
1521 inst->insert_after(block, write);
1522
1523 inst->dst.file = temp.file;
1524 inst->dst.nr = temp.nr;
1525 inst->dst.reg_offset = temp.reg_offset;
1526 inst->dst.reladdr = NULL;
1527 }
1528
1529 /**
1530 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1531 * adds the scratch read(s) before \p inst. The function also checks for
1532 * recursive reladdr scratch accesses, issuing the corresponding scratch
1533 * loads and rewriting reladdr references accordingly.
1534 *
1535 * \return \p src if it did not require a scratch load, otherwise, the
1536 * register holding the result of the scratch load that the caller should
1537 * use to rewrite src.
1538 */
1539 src_reg
1540 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1541 vec4_instruction *inst, src_reg src)
1542 {
1543 /* Resolve recursive reladdr scratch access by calling ourselves
1544 * with src.reladdr
1545 */
1546 if (src.reladdr)
1547 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1548 *src.reladdr);
1549
1550 /* Now handle scratch access on src */
1551 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1552 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1553 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1554 src.nr = temp.nr;
1555 src.reg_offset = temp.reg_offset;
1556 src.reladdr = NULL;
1557 }
1558
1559 return src;
1560 }
1561
1562 /**
1563 * We can't generally support array access in GRF space, because a
1564 * single instruction's destination can only span 2 contiguous
1565 * registers. So, we send all GRF arrays that get variable index
1566 * access to scratch space.
1567 */
1568 void
1569 vec4_visitor::move_grf_array_access_to_scratch()
1570 {
1571 int scratch_loc[this->alloc.count];
1572 memset(scratch_loc, -1, sizeof(scratch_loc));
1573
1574 /* First, calculate the set of virtual GRFs that need to be punted
1575 * to scratch due to having any array access on them, and where in
1576 * scratch.
1577 */
1578 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1579 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1580 if (scratch_loc[inst->dst.nr] == -1) {
1581 scratch_loc[inst->dst.nr] = last_scratch;
1582 last_scratch += this->alloc.sizes[inst->dst.nr];
1583 }
1584
1585 for (src_reg *iter = inst->dst.reladdr;
1586 iter->reladdr;
1587 iter = iter->reladdr) {
1588 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1589 scratch_loc[iter->nr] = last_scratch;
1590 last_scratch += this->alloc.sizes[iter->nr];
1591 }
1592 }
1593 }
1594
1595 for (int i = 0 ; i < 3; i++) {
1596 for (src_reg *iter = &inst->src[i];
1597 iter->reladdr;
1598 iter = iter->reladdr) {
1599 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1600 scratch_loc[iter->nr] = last_scratch;
1601 last_scratch += this->alloc.sizes[iter->nr];
1602 }
1603 }
1604 }
1605 }
1606
1607 /* Now, for anything that will be accessed through scratch, rewrite
1608 * it to load/store. Note that this is a _safe list walk, because
1609 * we may generate a new scratch_write instruction after the one
1610 * we're processing.
1611 */
1612 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1613 /* Set up the annotation tracking for new generated instructions. */
1614 base_ir = inst->ir;
1615 current_annotation = inst->annotation;
1616
1617 /* First handle scratch access on the dst. Notice we have to handle
1618 * the case where the dst's reladdr also points to scratch space.
1619 */
1620 if (inst->dst.reladdr)
1621 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1622 *inst->dst.reladdr);
1623
1624 /* Now that we have handled any (possibly recursive) reladdr scratch
1625 * accesses for dst we can safely do the scratch write for dst itself
1626 */
1627 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1628 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1629
1630 /* Now handle scratch access on any src. In this case, since inst->src[i]
1631 * already is a src_reg, we can just call emit_resolve_reladdr with
1632 * inst->src[i] and it will take care of handling scratch loads for
1633 * both src and src.reladdr (recursively).
1634 */
1635 for (int i = 0 ; i < 3; i++) {
1636 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1637 inst->src[i]);
1638 }
1639 }
1640 }
1641
1642 /**
1643 * Emits an instruction before @inst to load the value named by @orig_src
1644 * from the pull constant buffer (surface) at @base_offset to @temp.
1645 */
1646 void
1647 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1648 dst_reg temp, src_reg orig_src,
1649 int base_offset, src_reg indirect)
1650 {
1651 int reg_offset = base_offset + orig_src.reg_offset;
1652 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1653
1654 src_reg offset;
1655 if (indirect.file != BAD_FILE) {
1656 offset = src_reg(this, glsl_type::int_type);
1657
1658 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1659 brw_imm_d(reg_offset * 16)));
1660 } else if (devinfo->gen >= 8) {
1661 /* Store the offset in a GRF so we can send-from-GRF. */
1662 offset = src_reg(this, glsl_type::int_type);
1663 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
1664 } else {
1665 offset = brw_imm_d(reg_offset * 16);
1666 }
1667
1668 emit_pull_constant_load_reg(temp,
1669 brw_imm_ud(index),
1670 offset,
1671 block, inst);
1672
1673 brw_mark_surface_used(&prog_data->base, index);
1674 }
1675
1676 /**
1677 * Implements array access of uniforms by inserting a
1678 * PULL_CONSTANT_LOAD instruction.
1679 *
1680 * Unlike temporary GRF array access (where we don't support it due to
1681 * the difficulty of doing relative addressing on instruction
1682 * destinations), we could potentially do array access of uniforms
1683 * that were loaded in GRF space as push constants. In real-world
1684 * usage we've seen, though, the arrays being used are always larger
1685 * than we could load as push constants, so just always move all
1686 * uniform array access out to a pull constant buffer.
1687 */
1688 void
1689 vec4_visitor::move_uniform_array_access_to_pull_constants()
1690 {
1691 int pull_constant_loc[this->uniforms];
1692 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1693
1694 /* First, walk through the instructions and determine which things need to
1695 * be pulled. We mark something as needing to be pulled by setting
1696 * pull_constant_loc to 0.
1697 */
1698 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1699 /* We only care about MOV_INDIRECT of a uniform */
1700 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1701 inst->src[0].file != UNIFORM)
1702 continue;
1703
1704 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1705
1706 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1707 pull_constant_loc[uniform_nr + j] = 0;
1708 }
1709
1710 /* Next, we walk the list of uniforms and assign real pull constant
1711 * locations and set their corresponding entries in pull_param.
1712 */
1713 for (int j = 0; j < this->uniforms; j++) {
1714 if (pull_constant_loc[j] < 0)
1715 continue;
1716
1717 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1718
1719 for (int i = 0; i < 4; i++) {
1720 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1721 = stage_prog_data->param[j * 4 + i];
1722 }
1723 }
1724
1725 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1726 * instructions to actual uniform pulls.
1727 */
1728 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1729 /* We only care about MOV_INDIRECT of a uniform */
1730 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1731 inst->src[0].file != UNIFORM)
1732 continue;
1733
1734 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1735
1736 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1737
1738 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1739 pull_constant_loc[uniform_nr], inst->src[1]);
1740 inst->remove(block);
1741 }
1742
1743 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1744 * no need to track them as larger-than-vec4 objects. This will be
1745 * relied on in cutting out unused uniform vectors from push
1746 * constants.
1747 */
1748 split_uniform_registers();
1749 }
1750
1751 void
1752 vec4_visitor::resolve_ud_negate(src_reg *reg)
1753 {
1754 if (reg->type != BRW_REGISTER_TYPE_UD ||
1755 !reg->negate)
1756 return;
1757
1758 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1759 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1760 *reg = temp;
1761 }
1762
1763 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1764 void *log_data,
1765 const struct brw_sampler_prog_key_data *key_tex,
1766 struct brw_vue_prog_data *prog_data,
1767 const nir_shader *shader,
1768 void *mem_ctx,
1769 bool no_spills,
1770 int shader_time_index)
1771 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1772 key_tex(key_tex),
1773 prog_data(prog_data),
1774 fail_msg(NULL),
1775 first_non_payload_grf(0),
1776 need_all_constants_in_pull_buffer(false),
1777 no_spills(no_spills),
1778 shader_time_index(shader_time_index),
1779 last_scratch(0)
1780 {
1781 this->failed = false;
1782
1783 this->base_ir = NULL;
1784 this->current_annotation = NULL;
1785 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1786
1787 this->virtual_grf_start = NULL;
1788 this->virtual_grf_end = NULL;
1789 this->live_intervals = NULL;
1790
1791 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1792
1793 this->uniforms = 0;
1794 }
1795
1796 vec4_visitor::~vec4_visitor()
1797 {
1798 }
1799
1800
1801 void
1802 vec4_visitor::fail(const char *format, ...)
1803 {
1804 va_list va;
1805 char *msg;
1806
1807 if (failed)
1808 return;
1809
1810 failed = true;
1811
1812 va_start(va, format);
1813 msg = ralloc_vasprintf(mem_ctx, format, va);
1814 va_end(va);
1815 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1816
1817 this->fail_msg = msg;
1818
1819 if (debug_enabled) {
1820 fprintf(stderr, "%s", msg);
1821 }
1822 }
1823
1824 } /* namespace brw */