src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_program.h"
  27 #include "glsl/ir_uniform.h"
  28 #include "program/sampler.h"
  29
  30 namespace brw {
  31
  32 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  33                                    const src_reg &src0, const src_reg &src1,
  34                                    const src_reg &src2)
  35 {
  36    this->opcode = opcode;
  37    this->dst = dst;
  38    this->src[0] = src0;
  39    this->src[1] = src1;
  40    this->src[2] = src2;
  41    this->saturate = false;
  42    this->force_writemask_all = false;
  43    this->no_dd_clear = false;
  44    this->no_dd_check = false;
  45    this->writes_accumulator = false;
  46    this->conditional_mod = BRW_CONDITIONAL_NONE;
  47    this->predicate = BRW_PREDICATE_NONE;
  48    this->predicate_inverse = false;
  49    this->target = 0;
  50    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  51    this->shadow_compare = false;
  52    this->ir = NULL;
  53    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  54    this->header_size = 0;
  55    this->flag_subreg = 0;
  56    this->mlen = 0;
  57    this->base_mrf = 0;
  58    this->offset = 0;
  59    this->annotation = NULL;
  60 }
  61
  62 vec4_instruction *
  63 vec4_visitor::emit(vec4_instruction *inst)
  64 {
  65    inst->ir = this->base_ir;
  66    inst->annotation = this->current_annotation;
  67
  68    this->instructions.push_tail(inst);
  69
  70    return inst;
  71 }
  72
  73 vec4_instruction *
  74 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  75                           vec4_instruction *new_inst)
  76 {
  77    new_inst->ir = inst->ir;
  78    new_inst->annotation = inst->annotation;
  79
  80    inst->insert_before(block, new_inst);
  81
  82    return inst;
  83 }
  84
  85 vec4_instruction *
  86 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  87                    const src_reg &src1, const src_reg &src2)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  90 }
  91
  92
  93 vec4_instruction *
  94 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  95                    const src_reg &src1)
  96 {
  97    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  98 }
  99
 100 vec4_instruction *
 101 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 102 {
 103    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 104 }
 105
 106 vec4_instruction *
 107 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 108 {
 109    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 110 }
 111
 112 vec4_instruction *
 113 vec4_visitor::emit(enum opcode opcode)
 114 {
 115    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 116 }
 117
/*
 * Helper macros that stamp out the per-opcode builder methods of
 * vec4_visitor.  Each generated method allocates a vec4_instruction for
 * BRW_OPCODE_<op> out of mem_ctx and returns it; none of them calls emit(),
 * so the caller decides where the instruction ends up.
 */

/* One-source builder: vec4_visitor::op(dst, src0). */
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

/* Two-source builder: vec4_visitor::op(dst, src0, src1). */
#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

/* Same as ALU2, but additionally flags the instruction as writing the
 * accumulator (writes_accumulator = true).
 */
#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

/* Three-source builder: vec4_visitor::op(dst, src0, src1, src2).  Asserts
 * that the device is gen6 or newer.
 */
#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                                \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }
 154
/*
 * Instantiate the builder methods.  Each line below defines
 * vec4_visitor::<NAME>() for the matching BRW_OPCODE_<NAME>.  The ALU2_ACC
 * entries (MACH, ADDC, SUBB) mark their instruction as writing the
 * accumulator; the ALU3 entries (LRP, BFE, BFI2, MAD) assert gen >= 6.
 */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
 187
 188 /** Gen4 predicated IF. */
 189 vec4_instruction *
 190 vec4_visitor::IF(enum brw_predicate predicate)
 191 {
 192    vec4_instruction *inst;
 193
 194    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 195    inst->predicate = predicate;
 196
 197    return inst;
 198 }
 199
 200 /** Gen6 IF with embedded comparison. */
 201 vec4_instruction *
 202 vec4_visitor::IF(src_reg src0, src_reg src1,
 203                  enum brw_conditional_mod condition)
 204 {
 205    assert(devinfo->gen == 6);
 206
 207    vec4_instruction *inst;
 208
 209    resolve_ud_negate(&src0);
 210    resolve_ud_negate(&src1);
 211
 212    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 213                                         src0, src1);
 214    inst->conditional_mod = condition;
 215
 216    return inst;
 217 }
 218
 219 /**
 220  * CMP: Sets the low bit of the destination channels with the result
 221  * of the comparison, while the upper bits are undefined, and updates
 222  * the flag register with the packed 16 bits of the result.
 223  */
 224 vec4_instruction *
 225 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 226                   enum brw_conditional_mod condition)
 227 {
 228    vec4_instruction *inst;
 229
 230    /* Take the instruction:
 231     *
 232     * CMP null<d> src0<f> src1<f>
 233     *
 234     * Original gen4 does type conversion to the destination type before
 235     * comparison, producing garbage results for floating point comparisons.
 236     *
 237     * The destination type doesn't matter on newer generations, so we set the
 238     * type to match src0 so we can compact the instruction.
 239     */
 240    dst.type = src0.type;
 241
 242    resolve_ud_negate(&src0);
 243    resolve_ud_negate(&src1);
 244
 245    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 246    inst->conditional_mod = condition;
 247
 248    return inst;
 249 }
 250
 251 vec4_instruction *
 252 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 253 {
 254    vec4_instruction *inst;
 255
 256    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 257                                         dst, index);
 258    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 259    inst->mlen = 2;
 260
 261    return inst;
 262 }
 263
 264 vec4_instruction *
 265 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 266                             const src_reg &index)
 267 {
 268    vec4_instruction *inst;
 269
 270    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 271                                         dst, src, index);
 272    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 273    inst->mlen = 3;
 274
 275    return inst;
 276 }
 277
 278 src_reg
 279 vec4_visitor::fix_3src_operand(const src_reg &src)
 280 {
 281    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 282     * able to use vertical stride of zero to replicate the vec4 uniform, like
 283     *
 284     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 285     *
 286     * But you can't, since vertical stride is always four in three-source
 287     * instructions. Instead, insert a MOV instruction to do the replication so
 288     * that the three-source instruction can consume it.
 289     */
 290
 291    /* The MOV is only needed if the source is a uniform or immediate. */
 292    if (src.file != UNIFORM && src.file != IMM)
 293       return src;
 294
 295    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 296       return src;
 297
 298    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 299    expanded.type = src.type;
 300    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 301    return src_reg(expanded);
 302 }
 303
 304 src_reg
 305 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 306 {
 307    if (!src.abs && !src.negate)
 308       return src;
 309
 310    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 311    resolved.type = src.type;
 312    emit(MOV(resolved, src));
 313
 314    return src_reg(resolved);
 315 }
 316
 317 src_reg
 318 vec4_visitor::fix_math_operand(const src_reg &src)
 319 {
 320    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 321       return src;
 322
 323    /* The gen6 math instruction ignores the source modifiers --
 324     * swizzle, abs, negate, and at least some parts of the register
 325     * region description.
 326     *
 327     * Rather than trying to enumerate all these cases, *always* expand the
 328     * operand to a temp GRF for gen6.
 329     *
 330     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 331     * can't use.
 332     */
 333
 334    if (devinfo->gen == 7 && src.file != IMM)
 335       return src;
 336
 337    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 338    expanded.type = src.type;
 339    emit(MOV(expanded, src));
 340    return src_reg(expanded);
 341 }
 342
 343 vec4_instruction *
 344 vec4_visitor::emit_math(enum opcode opcode,
 345                         const dst_reg &dst,
 346                         const src_reg &src0, const src_reg &src1)
 347 {
 348    vec4_instruction *math =
 349       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 350
 351    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 352       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 353       math->dst = dst_reg(this, glsl_type::vec4_type);
 354       math->dst.type = dst.type;
 355       math = emit(MOV(dst, src_reg(math->dst)));
 356    } else if (devinfo->gen < 6) {
 357       math->base_mrf = 1;
 358       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 359    }
 360
 361    return math;
 362 }
 363
 364 void
 365 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 366 {
 367    if (devinfo->gen < 7) {
 368       unreachable("ir_unop_pack_half_2x16 should be lowered");
 369    }
 370
 371    assert(dst.type == BRW_REGISTER_TYPE_UD);
 372    assert(src0.type == BRW_REGISTER_TYPE_F);
 373
 374    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 375     *
 376     *   Because this instruction does not have a 16-bit floating-point type,
 377     *   the destination data type must be Word (W).
 378     *
 379     *   The destination must be DWord-aligned and specify a horizontal stride
 380     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 381     *   each destination channel and the upper word is not modified.
 382     *
 383     * The above restriction implies that the f32to16 instruction must use
 384     * align1 mode, because only in align1 mode is it possible to specify
 385     * horizontal stride.  We choose here to defy the hardware docs and emit
 386     * align16 instructions.
 387     *
 388     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 389     * instructions. I was partially successful in that the code passed all
 390     * tests.  However, the code was dubiously correct and fragile, and the
 391     * tests were not harsh enough to probe that frailty. Not trusting the
 392     * code, I chose instead to remain in align16 mode in defiance of the hw
 393     * docs).
 394     *
 395     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 396     * simulator, emitting a f32to16 in align16 mode with UD as destination
 397     * data type is safe. The behavior differs from that specified in the PRM
 398     * in that the upper word of each destination channel is cleared to 0.
 399     */
 400
 401    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 402    src_reg tmp_src(tmp_dst);
 403
 404 #if 0
 405    /* Verify the undocumented behavior on which the following instructions
 406     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 407     * then the result of the bit-or instruction below will be incorrect.
 408     *
 409     * You should inspect the disasm output in order to verify that the MOV is
 410     * not optimized away.
 411     */
 412    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
 413 #endif
 414
 415    /* Give tmp the form below, where "." means untouched.
 416     *
 417     *     w z          y          x w z          y          x
 418     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 419     *
 420     * That the upper word of each write-channel be 0 is required for the
 421     * following bit-shift and bit-or instructions to work. Note that this
 422     * relies on the undocumented hardware behavior mentioned above.
 423     */
 424    tmp_dst.writemask = WRITEMASK_XY;
 425    emit(F32TO16(tmp_dst, src0));
 426
 427    /* Give the write-channels of dst the form:
 428     *   0xhhhh0000
 429     */
 430    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 431    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
 432
 433    /* Finally, give the write-channels of dst the form of packHalf2x16's
 434     * output:
 435     *   0xhhhhllll
 436     */
 437    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 438    emit(OR(dst, src_reg(dst), tmp_src));
 439 }
 440
 441 void
 442 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 443 {
 444    if (devinfo->gen < 7) {
 445       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 446    }
 447
 448    assert(dst.type == BRW_REGISTER_TYPE_F);
 449    assert(src0.type == BRW_REGISTER_TYPE_UD);
 450
 451    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 452     *
 453     *   Because this instruction does not have a 16-bit floating-point type,
 454     *   the source data type must be Word (W). The destination type must be
 455     *   F (Float).
 456     *
 457     * To use W as the source data type, we must adjust horizontal strides,
 458     * which is only possible in align1 mode. All my [chadv] attempts at
 459     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 460     * Piglit tests, so I gave up.
 461     *
 462     * I've verified that, on gen7 hardware and the simulator, it is safe to
 463     * emit f16to32 in align16 mode with UD as source data type.
 464     */
 465
 466    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 467    src_reg tmp_src(tmp_dst);
 468
 469    tmp_dst.writemask = WRITEMASK_X;
 470    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
 471
 472    tmp_dst.writemask = WRITEMASK_Y;
 473    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
 474
 475    dst.writemask = WRITEMASK_XY;
 476    emit(F16TO32(dst, tmp_src));
 477 }
 478
 479 void
 480 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 481 {
 482    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 483     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 484     * is not suitable to generate the shift values, but we can use the packed
 485     * vector float and a type-converting MOV.
 486     */
 487    dst_reg shift(this, glsl_type::uvec4_type);
 488    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 489
 490    dst_reg shifted(this, glsl_type::uvec4_type);
 491    src0.swizzle = BRW_SWIZZLE_XXXX;
 492    emit(SHR(shifted, src0, src_reg(shift)));
 493
 494    shifted.type = BRW_REGISTER_TYPE_UB;
 495    dst_reg f(this, glsl_type::vec4_type);
 496    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 497
 498    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 499 }
 500
 501 void
 502 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 503 {
 504    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 505     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 506     * is not suitable to generate the shift values, but we can use the packed
 507     * vector float and a type-converting MOV.
 508     */
 509    dst_reg shift(this, glsl_type::uvec4_type);
 510    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
 511
 512    dst_reg shifted(this, glsl_type::uvec4_type);
 513    src0.swizzle = BRW_SWIZZLE_XXXX;
 514    emit(SHR(shifted, src0, src_reg(shift)));
 515
 516    shifted.type = BRW_REGISTER_TYPE_B;
 517    dst_reg f(this, glsl_type::vec4_type);
 518    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 519
 520    dst_reg scaled(this, glsl_type::vec4_type);
 521    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
 522
 523    dst_reg max(this, glsl_type::vec4_type);
 524    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
 525    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 526 }
 527
 528 void
 529 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 530 {
 531    dst_reg saturated(this, glsl_type::vec4_type);
 532    vec4_instruction *inst = emit(MOV(saturated, src0));
 533    inst->saturate = true;
 534
 535    dst_reg scaled(this, glsl_type::vec4_type);
 536    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
 537
 538    dst_reg rounded(this, glsl_type::vec4_type);
 539    emit(RNDE(rounded, src_reg(scaled)));
 540
 541    dst_reg u(this, glsl_type::uvec4_type);
 542    emit(MOV(u, src_reg(rounded)));
 543
 544    src_reg bytes(u);
 545    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 546 }
 547
 548 void
 549 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 550 {
 551    dst_reg max(this, glsl_type::vec4_type);
 552    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 553
 554    dst_reg min(this, glsl_type::vec4_type);
 555    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 556
 557    dst_reg scaled(this, glsl_type::vec4_type);
 558    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
 559
 560    dst_reg rounded(this, glsl_type::vec4_type);
 561    emit(RNDE(rounded, src_reg(scaled)));
 562
 563    dst_reg i(this, glsl_type::ivec4_type);
 564    emit(MOV(i, src_reg(rounded)));
 565
 566    src_reg bytes(i);
 567    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 568 }
 569
 570 /**
 571  * Returns the minimum number of vec4 elements needed to pack a type.
 572  *
 573  * For simple types, it will return 1 (a single vec4); for matrices, the
 574  * number of columns; for array and struct, the sum of the vec4_size of
 575  * each of its elements; and for sampler and atomic, zero.
 576  *
 577  * This method is useful to calculate how much register space is needed to
 578  * store a particular type.
 579  */
 580 extern "C" int
 581 type_size_vec4(const struct glsl_type *type)
 582 {
 583    unsigned int i;
 584    int size;
 585
 586    switch (type->base_type) {
 587    case GLSL_TYPE_UINT:
 588    case GLSL_TYPE_INT:
 589    case GLSL_TYPE_FLOAT:
 590    case GLSL_TYPE_BOOL:
 591       if (type->is_matrix()) {
 592          return type->matrix_columns;
 593       } else {
 594          /* Regardless of size of vector, it gets a vec4. This is bad
 595           * packing for things like floats, but otherwise arrays become a
 596           * mess.  Hopefully a later pass over the code can pack scalars
 597           * down if appropriate.
 598           */
 599          return 1;
 600       }
 601    case GLSL_TYPE_ARRAY:
 602       assert(type->length > 0);
 603       return type_size_vec4(type->fields.array) * type->length;
 604    case GLSL_TYPE_STRUCT:
 605       size = 0;
 606       for (i = 0; i < type->length; i++) {
 607          size += type_size_vec4(type->fields.structure[i].type);
 608       }
 609       return size;
 610    case GLSL_TYPE_SUBROUTINE:
 611       return 1;
 612
 613    case GLSL_TYPE_SAMPLER:
 614       /* Samplers take up no register space, since they're baked in at
 615        * link time.
 616        */
 617       return 0;
 618    case GLSL_TYPE_ATOMIC_UINT:
 619       return 0;
 620    case GLSL_TYPE_IMAGE:
 621       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 622    case GLSL_TYPE_VOID:
 623    case GLSL_TYPE_DOUBLE:
 624    case GLSL_TYPE_ERROR:
 625    case GLSL_TYPE_INTERFACE:
 626       unreachable("not reached");
 627    }
 628
 629    return 0;
 630 }
 631
 632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 633 {
 634    init();
 635
 636    this->file = VGRF;
 637    this->nr = v->alloc.allocate(type_size_vec4(type));
 638
 639    if (type->is_array() || type->is_record()) {
 640       this->swizzle = BRW_SWIZZLE_NOOP;
 641    } else {
 642       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 643    }
 644
 645    this->type = brw_type_for_base_type(type);
 646 }
 647
 648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 649 {
 650    assert(size > 0);
 651
 652    init();
 653
 654    this->file = VGRF;
 655    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
 656
 657    this->swizzle = BRW_SWIZZLE_NOOP;
 658
 659    this->type = brw_type_for_base_type(type);
 660 }
 661
 662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 663 {
 664    init();
 665
 666    this->file = VGRF;
 667    this->nr = v->alloc.allocate(type_size_vec4(type));
 668
 669    if (type->is_array() || type->is_record()) {
 670       this->writemask = WRITEMASK_XYZW;
 671    } else {
 672       this->writemask = (1 << type->vector_elements) - 1;
 673    }
 674
 675    this->type = brw_type_for_base_type(type);
 676 }
 677
 678 vec4_instruction *
 679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 680                           src_reg src0, src_reg src1)
 681 {
 682    vec4_instruction *inst;
 683
 684    if (devinfo->gen >= 6) {
 685       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 686       inst->conditional_mod = conditionalmod;
 687    } else {
 688       emit(CMP(dst, src0, src1, conditionalmod));
 689
 690       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 691       inst->predicate = BRW_PREDICATE_NORMAL;
 692    }
 693
 694    return inst;
 695 }
 696
 697 vec4_instruction *
 698 vec4_visitor::emit_lrp(const dst_reg &dst,
 699                        const src_reg &x, const src_reg &y, const src_reg &a)
 700 {
 701    if (devinfo->gen >= 6) {
 702       /* Note that the instruction's argument order is reversed from GLSL
 703        * and the IR.
 704        */
 705      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 706                      fix_3src_operand(x)));
 707    } else {
 708       /* Earlier generations don't support three source operations, so we
 709        * need to emit x*(1-a) + y*a.
 710        */
 711       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 712       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 713       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 714       y_times_a.writemask           = dst.writemask;
 715       one_minus_a.writemask         = dst.writemask;
 716       x_times_one_minus_a.writemask = dst.writemask;
 717
 718       emit(MUL(y_times_a, y, a));
 719       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
 720       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 721       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 722    }
 723 }
 724
 725 /**
 726  * Emits the instructions needed to perform a pull constant load. before_block
 727  * and before_inst can be NULL in which case the instruction will be appended
 728  * to the end of the instruction list.
 729  */
 730 void
 731 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 732                                           src_reg surf_index,
 733                                           src_reg offset_reg,
 734                                           bblock_t *before_block,
 735                                           vec4_instruction *before_inst)
 736 {
 737    assert((before_inst == NULL && before_block == NULL) ||
 738           (before_inst && before_block));
 739
 740    vec4_instruction *pull;
 741
 742    if (devinfo->gen >= 9) {
 743       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 744       src_reg header(this, glsl_type::uvec4_type, 2);
 745
 746       pull = new(mem_ctx)
 747          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 748                           dst_reg(header));
 749
 750       if (before_inst)
 751          emit_before(before_block, before_inst, pull);
 752       else
 753          emit(pull);
 754
 755       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 756                                  offset_reg.type);
 757       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 758
 759       if (before_inst)
 760          emit_before(before_block, before_inst, pull);
 761       else
 762          emit(pull);
 763
 764       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 765                                            dst,
 766                                            surf_index,
 767                                            header);
 768       pull->mlen = 2;
 769       pull->header_size = 1;
 770    } else if (devinfo->gen >= 7) {
 771       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 772
 773       grf_offset.type = offset_reg.type;
 774
 775       pull = MOV(grf_offset, offset_reg);
 776
 777       if (before_inst)
 778          emit_before(before_block, before_inst, pull);
 779       else
 780          emit(pull);
 781
 782       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 783                                            dst,
 784                                            surf_index,
 785                                            src_reg(grf_offset));
 786       pull->mlen = 1;
 787    } else {
 788       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 789                                            dst,
 790                                            surf_index,
 791                                            offset_reg);
 792       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 793       pull->mlen = 1;
 794    }
 795
 796    if (before_inst)
 797       emit_before(before_block, before_inst, pull);
 798    else
 799       emit(pull);
 800 }
 801
 802 src_reg
 803 vec4_visitor::emit_uniformize(const src_reg &src)
 804 {
 805    const src_reg chan_index(this, glsl_type::uint_type);
 806    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 807                               src.type);
 808
 809    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 810       ->force_writemask_all = true;
 811    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 812       ->force_writemask_all = true;
 813
 814    return src_reg(dst);
 815 }
 816
 817 src_reg
 818 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 819                              src_reg coordinate, src_reg sampler)
 820 {
 821    vec4_instruction *inst =
 822       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 823                                     dst_reg(this, glsl_type::uvec4_type));
 824    inst->base_mrf = 2;
 825    inst->src[1] = sampler;
 826
 827    int param_base;
 828
 829    if (devinfo->gen >= 9) {
 830       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 831       vec4_instruction *header_inst = new(mem_ctx)
 832          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 833                           dst_reg(MRF, inst->base_mrf));
 834
 835       emit(header_inst);
 836
 837       inst->mlen = 2;
 838       inst->header_size = 1;
 839       param_base = inst->base_mrf + 1;
 840    } else {
 841       inst->mlen = 1;
 842       param_base = inst->base_mrf;
 843    }
 844
 845    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 846    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 847    int zero_mask = 0xf & ~coord_mask;
 848
 849    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 850             coordinate));
 851
 852    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 853             brw_imm_d(0)));
 854
 855    emit(inst);
 856    return src_reg(inst->dst);
 857 }
 858
 859 bool
 860 vec4_visitor::is_high_sampler(src_reg sampler)
 861 {
 862    if (devinfo->gen < 8 && !devinfo->is_haswell)
 863       return false;
 864
 865    return sampler.file != IMM || sampler.ud >= 16;
 866 }
 867
/**
 * Emits the sampler message for texture operation \p op.
 *
 * The message payload is assembled in MRFs starting at MRF 2 (an optional
 * header register first, then the parameter registers), the sampling
 * instruction writing \p dest is emitted, and finally a few per-opcode
 * fixups are applied to the result.
 *
 * \param op                 texture operation to perform
 * \param dest               destination of the sampler instruction
 * \param dest_type          only consulted for ir_txd, to decide whether a
 *                           third derivative register is needed
 * \param coordinate         texture coordinate
 * \param coord_components   number of channels of \p coordinate to load; the
 *                           remaining channels are zeroed
 * \param shadow_comparitor  shadow reference value, or BAD_FILE if unused
 * \param lod, lod2          LOD for ir_txl/ir_txf/ir_txs, or the two
 *                           derivative vectors for ir_txd
 * \param sample_index       sample index for ir_txf_ms
 * \param constant_offset    stored in the instruction's offset field; a
 *                           non-zero value forces a message header
 * \param offset_value       offset operand for ir_tg4, or BAD_FILE
 * \param mcs                MCS data for ir_txf_ms
 * \param is_cube_array      whether the ir_txs layer count needs dividing by 6
 * \param sampler            index into key_tex->gen6_gather_wa
 * \param sampler_reg        sampler operand, stored as src[1]
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparitor,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           bool is_cube_array,
                           uint32_t sampler,
                           src_reg sampler_reg)
{
   /* The sampler can only meaningfully compute LOD for fragment shader
    * messages. For all other stages, we change the opcode to TXL and hardcode
    * the LOD to 0.
    *
    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
    * valid LOD argument.
    */
   if (op == ir_tex || op == ir_query_levels) {
      assert(lod.file == BAD_FILE);
      lod = brw_imm_f(0.0f);
   }

   /* Translate the GLSL IR texture opcode into the backend opcode. */
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just return false always.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   /* mlen starts at the header size and grows by one for every parameter
    * register loaded below.
    */
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;

   inst->src[1] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      /* The only parameter is the LOD: in .w on Gen4, in .x otherwise. */
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      /* No parameters are loaded; only the .x channel of the result is
       * written.
       */
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      /* Zero the channels of the first parameter register that the
       * coordinate did not fill.
       */
      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparitor.  TXD and TG4-with-offset place it in a
       * different channel and handle it in their own branches below.
       */
      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparitor.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         /* The LOD shares the coordinate register, in its .w channel. */
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of `mcs`, but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            /* Interleave the two derivative vectors: lod goes to .x/.z and
             * lod2 to .y/.w of the second parameter register.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            /* A third parameter register carries the .z derivatives and,
             * when present, the shadow comparitor in its .z channel.
             */
            if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparitor.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparitor.type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         /* For gather with an offset, the comparitor goes in .w of the
          * coordinate register and the offset in the next register.
          */
         if (shadow_comparitor.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && is_cube_array) {
      emit_math(SHADER_OPCODE_INT_QUOTIENT,
                writemask(inst->dst, WRITEMASK_Z),
                src_reg(inst->dst), brw_imm_d(6));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}
1083
1084 /**
1085  * Apply workarounds for Gen6 gather with UINT/SINT
1086  */
1087 void
1088 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1089 {
1090    if (!wa)
1091       return;
1092
1093    int width = (wa & WA_8BIT) ? 8 : 16;
1094    dst_reg dst_f = dst;
1095    dst_f.type = BRW_REGISTER_TYPE_F;
1096
1097    /* Convert from UNORM to UINT */
1098    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1099    emit(MOV(dst, src_reg(dst_f)));
1100
1101    if (wa & WA_SIGN) {
1102       /* Reinterpret the UINT value as a signed INT value by
1103        * shifting the sign bit into place, then shifting back
1104        * preserving sign.
1105        */
1106       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1107       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1108    }
1109 }
1110
1111 void
1112 vec4_visitor::gs_emit_vertex(int stream_id)
1113 {
1114    unreachable("not reached");
1115 }
1116
1117 void
1118 vec4_visitor::gs_end_primitive()
1119 {
1120    unreachable("not reached");
1121 }
1122
1123 void
1124 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1125                                   dst_reg dst, src_reg surf_offset,
1126                                   src_reg src0, src_reg src1)
1127 {
1128    unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1129    src_reg src_payload(this, glsl_type::uint_type, mlen);
1130    dst_reg payload(src_payload);
1131    payload.writemask = WRITEMASK_X;
1132
1133    /* Set the atomic operation offset. */
1134    emit(MOV(offset(payload, 0), surf_offset));
1135    unsigned i = 1;
1136
1137    /* Set the atomic operation arguments. */
1138    if (src0.file != BAD_FILE) {
1139       emit(MOV(offset(payload, i), src0));
1140       i++;
1141    }
1142
1143    if (src1.file != BAD_FILE) {
1144       emit(MOV(offset(payload, i), src1));
1145       i++;
1146    }
1147
1148    /* Emit the instruction.  Note that this maps to the normal SIMD8
1149     * untyped atomic message on Ivy Bridge, but that's OK because
1150     * unused channels will be masked out.
1151     */
1152    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1153                                  src_payload,
1154                                  brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1155    inst->mlen = mlen;
1156 }
1157
1158 void
1159 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1160                                         src_reg surf_offset)
1161 {
1162    dst_reg offset(this, glsl_type::uint_type);
1163    offset.writemask = WRITEMASK_X;
1164
1165    /* Set the surface read offset. */
1166    emit(MOV(offset, surf_offset));
1167
1168    /* Emit the instruction.  Note that this maps to the normal SIMD8
1169     * untyped surface read message, but that's OK because unused
1170     * channels will be masked out.
1171     */
1172    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1173                                  src_reg(offset),
1174                                  brw_imm_ud(surf_index), brw_imm_d(1));
1175    inst->mlen = 1;
1176 }
1177
1178 void
1179 vec4_visitor::emit_ndc_computation()
1180 {
1181    if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1182       return;
1183
1184    /* Get the position */
1185    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1186
1187    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1188    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1189    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1190
1191    current_annotation = "NDC";
1192    dst_reg ndc_w = ndc;
1193    ndc_w.writemask = WRITEMASK_W;
1194    src_reg pos_w = pos;
1195    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1196    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1197
1198    dst_reg ndc_xyz = ndc;
1199    ndc_xyz.writemask = WRITEMASK_XYZ;
1200
1201    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1202 }
1203
1204 void
1205 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1206 {
1207    if (devinfo->gen < 6 &&
1208        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1209         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1210         devinfo->has_negative_rhw_bug)) {
1211       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1212       dst_reg header1_w = header1;
1213       header1_w.writemask = WRITEMASK_W;
1214
1215       emit(MOV(header1, brw_imm_ud(0u)));
1216
1217       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1218          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1219
1220          current_annotation = "Point size";
1221          emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1222          emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1223       }
1224
1225       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1226          current_annotation = "Clipping flags";
1227          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1228          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1229
1230          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1231          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1232          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1233
1234          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1235          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1236          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1237          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1238       }
1239
1240       /* i965 clipping workaround:
1241        * 1) Test for -ve rhw
1242        * 2) If set,
1243        *      set ndc = (0,0,0,0)
1244        *      set ucp[6] = 1
1245        *
1246        * Later, clipping will detect ucp[6] and ensure the primitive is
1247        * clipped against all fixed planes.
1248        */
1249       if (devinfo->has_negative_rhw_bug &&
1250           output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1251          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1252          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1253          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1254          vec4_instruction *inst;
1255          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1256          inst->predicate = BRW_PREDICATE_NORMAL;
1257          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1258          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1259          inst->predicate = BRW_PREDICATE_NORMAL;
1260       }
1261
1262       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1263    } else if (devinfo->gen < 6) {
1264       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1265    } else {
1266       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1267       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1268          dst_reg reg_w = reg;
1269          reg_w.writemask = WRITEMASK_W;
1270          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1271          reg_as_src.type = reg_w.type;
1272          reg_as_src.swizzle = brw_swizzle_for_size(1);
1273          emit(MOV(reg_w, reg_as_src));
1274       }
1275       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1276          dst_reg reg_y = reg;
1277          reg_y.writemask = WRITEMASK_Y;
1278          reg_y.type = BRW_REGISTER_TYPE_D;
1279          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1280          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1281       }
1282       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1283          dst_reg reg_z = reg;
1284          reg_z.writemask = WRITEMASK_Z;
1285          reg_z.type = BRW_REGISTER_TYPE_D;
1286          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1287          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1288       }
1289    }
1290 }
1291
1292 vec4_instruction *
1293 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1294 {
1295    assert(varying < VARYING_SLOT_MAX);
1296    assert(output_reg[varying].type == reg.type);
1297    current_annotation = output_reg_annotation[varying];
1298    if (output_reg[varying].file != BAD_FILE)
1299       return emit(MOV(reg, src_reg(output_reg[varying])));
1300    else
1301       return NULL;
1302 }
1303
1304 void
1305 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1306 {
1307    reg.type = BRW_REGISTER_TYPE_F;
1308    output_reg[varying].type = reg.type;
1309
1310    switch (varying) {
1311    case VARYING_SLOT_PSIZ:
1312    {
1313       /* PSIZ is always in slot 0, and is coupled with other flags. */
1314       current_annotation = "indices, point width, clip flags";
1315       emit_psiz_and_flags(reg);
1316       break;
1317    }
1318    case BRW_VARYING_SLOT_NDC:
1319       current_annotation = "NDC";
1320       if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1321          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1322       break;
1323    case VARYING_SLOT_POS:
1324       current_annotation = "gl_Position";
1325       if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1326          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1327       break;
1328    case VARYING_SLOT_EDGE:
1329       /* This is present when doing unfilled polygons.  We're supposed to copy
1330        * the edge flag from the user-provided vertex array
1331        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1332        * of that attribute (starts as 1.0f).  This is then used in clipping to
1333        * determine which edges should be drawn as wireframe.
1334        */
1335       current_annotation = "edge flag";
1336       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1337                                     glsl_type::float_type, WRITEMASK_XYZW))));
1338       break;
1339    case BRW_VARYING_SLOT_PAD:
1340       /* No need to write to this slot */
1341       break;
1342    default:
1343       emit_generic_urb_slot(reg, varying);
1344       break;
1345    }
1346 }
1347
1348 static int
1349 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1350 {
1351    if (devinfo->gen >= 6) {
1352       /* URB data written (does not include the message header reg) must
1353        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1354        * section 5.4.3.2.2: URB_INTERLEAVED.
1355        *
1356        * URB entries are allocated on a multiple of 1024 bits, so an
1357        * extra 128 bits written here to make the end align to 256 is
1358        * no problem.
1359        */
1360       if ((mlen % 2) != 1)
1361          mlen++;
1362    }
1363
1364    return mlen;
1365 }
1366
1367
1368 /**
1369  * Generates the VUE payload plus the necessary URB write instructions to
1370  * output it.
1371  *
1372  * The VUE layout is documented in Volume 2a.
1373  */
1374 void
1375 vec4_visitor::emit_vertex()
1376 {
1377    /* MRF 0 is reserved for the debugger, so start with message header
1378     * in MRF 1.
1379     */
1380    int base_mrf = 1;
1381    int mrf = base_mrf;
1382    /* In the process of generating our URB write message contents, we
1383     * may need to unspill a register or load from an array.  Those
1384     * reads would use MRFs 14-15.
1385     */
1386    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1387
1388    /* The following assertion verifies that max_usable_mrf causes an
1389     * even-numbered amount of URB write data, which will meet gen6's
1390     * requirements for length alignment.
1391     */
1392    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1393
1394    /* First mrf is the g0-based message header containing URB handles and
1395     * such.
1396     */
1397    emit_urb_write_header(mrf++);
1398
1399    if (devinfo->gen < 6) {
1400       emit_ndc_computation();
1401    }
1402
1403    /* We may need to split this up into several URB writes, so do them in a
1404     * loop.
1405     */
1406    int slot = 0;
1407    bool complete = false;
1408    do {
1409       /* URB offset is in URB row increments, and each of our MRFs is half of
1410        * one of those, since we're doing interleaved writes.
1411        */
1412       int offset = slot / 2;
1413
1414       mrf = base_mrf + 1;
1415       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1416          emit_urb_slot(dst_reg(MRF, mrf++),
1417                        prog_data->vue_map.slot_to_varying[slot]);
1418
1419          /* If this was max_usable_mrf, we can't fit anything more into this
1420           * URB WRITE. Same thing if we reached the maximum length available.
1421           */
1422          if (mrf > max_usable_mrf ||
1423              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1424             slot++;
1425             break;
1426          }
1427       }
1428
1429       complete = slot >= prog_data->vue_map.num_slots;
1430       current_annotation = "URB write";
1431       vec4_instruction *inst = emit_urb_write_opcode(complete);
1432       inst->base_mrf = base_mrf;
1433       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1434       inst->offset += offset;
1435    } while(!complete);
1436 }
1437
1438
1439 src_reg
1440 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1441                                  src_reg *reladdr, int reg_offset)
1442 {
1443    /* Because we store the values to scratch interleaved like our
1444     * vertex data, we need to scale the vec4 index by 2.
1445     */
1446    int message_header_scale = 2;
1447
1448    /* Pre-gen6, the message header uses byte offsets instead of vec4
1449     * (16-byte) offset units.
1450     */
1451    if (devinfo->gen < 6)
1452       message_header_scale *= 16;
1453
1454    if (reladdr) {
1455       src_reg index = src_reg(this, glsl_type::int_type);
1456
1457       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1458                                    brw_imm_d(reg_offset)));
1459       emit_before(block, inst, MUL(dst_reg(index), index,
1460                                    brw_imm_d(message_header_scale)));
1461
1462       return index;
1463    } else {
1464       return brw_imm_d(reg_offset * message_header_scale);
1465    }
1466 }
1467
1468 src_reg
1469 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1470                                        src_reg *reladdr, int reg_offset)
1471 {
1472    if (reladdr) {
1473       src_reg index = src_reg(this, glsl_type::int_type);
1474
1475       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1476                                    brw_imm_d(reg_offset)));
1477
1478       /* Pre-gen6, the message header uses byte offsets instead of vec4
1479        * (16-byte) offset units.
1480        */
1481       if (devinfo->gen < 6) {
1482          emit_before(block, inst, MUL(dst_reg(index), index, brw_imm_d(16)));
1483       }
1484
1485       return index;
1486    } else if (devinfo->gen >= 8) {
1487       /* Store the offset in a GRF so we can send-from-GRF. */
1488       src_reg offset = src_reg(this, glsl_type::int_type);
1489       emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset)));
1490       return offset;
1491    } else {
1492       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1493       return brw_imm_d(reg_offset * message_header_scale);
1494    }
1495 }
1496
1497 /**
1498  * Emits an instruction before @inst to load the value named by @orig_src
1499  * from scratch space at @base_offset to @temp.
1500  *
1501  * @base_offset is measured in 32-byte units (the size of a register).
1502  */
1503 void
1504 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1505                                 dst_reg temp, src_reg orig_src,
1506                                 int base_offset)
1507 {
1508    int reg_offset = base_offset + orig_src.reg_offset;
1509    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1510                                       reg_offset);
1511
1512    emit_before(block, inst, SCRATCH_READ(temp, index));
1513 }
1514
1515 /**
1516  * Emits an instruction after @inst to store the value to be written
1517  * to @orig_dst to scratch space at @base_offset, from @temp.
1518  *
1519  * @base_offset is measured in 32-byte units (the size of a register).
1520  */
1521 void
1522 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1523                                  int base_offset)
1524 {
1525    int reg_offset = base_offset + inst->dst.reg_offset;
1526    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1527                                       reg_offset);
1528
1529    /* Create a temporary register to store *inst's result in.
1530     *
1531     * We have to be careful in MOVing from our temporary result register in
1532     * the scratch write.  If we swizzle from channels of the temporary that
1533     * weren't initialized, it will confuse live interval analysis, which will
1534     * make spilling fail to make progress.
1535     */
1536    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1537                                        inst->dst.type),
1538                                 brw_swizzle_for_mask(inst->dst.writemask));
1539    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1540                                        inst->dst.writemask));
1541    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1542    if (inst->opcode != BRW_OPCODE_SEL)
1543       write->predicate = inst->predicate;
1544    write->ir = inst->ir;
1545    write->annotation = inst->annotation;
1546    inst->insert_after(block, write);
1547
1548    inst->dst.file = temp.file;
1549    inst->dst.nr = temp.nr;
1550    inst->dst.reg_offset = temp.reg_offset;
1551    inst->dst.reladdr = NULL;
1552 }
1553
1554 /**
1555  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1556  * adds the scratch read(s) before \p inst. The function also checks for
1557  * recursive reladdr scratch accesses, issuing the corresponding scratch
1558  * loads and rewriting reladdr references accordingly.
1559  *
1560  * \return \p src if it did not require a scratch load, otherwise, the
1561  * register holding the result of the scratch load that the caller should
1562  * use to rewrite src.
1563  */
1564 src_reg
1565 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1566                                    vec4_instruction *inst, src_reg src)
1567 {
1568    /* Resolve recursive reladdr scratch access by calling ourselves
1569     * with src.reladdr
1570     */
1571    if (src.reladdr)
1572       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1573                                           *src.reladdr);
1574
1575    /* Now handle scratch access on src */
1576    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1577       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1578       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1579       src.nr = temp.nr;
1580       src.reg_offset = temp.reg_offset;
1581       src.reladdr = NULL;
1582    }
1583
1584    return src;
1585 }
1586
1587 /**
1588  * We can't generally support array access in GRF space, because a
1589  * single instruction's destination can only span 2 contiguous
1590  * registers.  So, we send all GRF arrays that get variable index
1591  * access to scratch space.
1592  */
1593 void
1594 vec4_visitor::move_grf_array_access_to_scratch()
1595 {
1596    int scratch_loc[this->alloc.count];
1597    memset(scratch_loc, -1, sizeof(scratch_loc));
1598
1599    /* First, calculate the set of virtual GRFs that need to be punted
1600     * to scratch due to having any array access on them, and where in
1601     * scratch.
1602     */
1603    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1604       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1605          if (scratch_loc[inst->dst.nr] == -1) {
1606             scratch_loc[inst->dst.nr] = last_scratch;
1607             last_scratch += this->alloc.sizes[inst->dst.nr];
1608          }
1609
1610          for (src_reg *iter = inst->dst.reladdr;
1611               iter->reladdr;
1612               iter = iter->reladdr) {
1613             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1614                scratch_loc[iter->nr] = last_scratch;
1615                last_scratch += this->alloc.sizes[iter->nr];
1616             }
1617          }
1618       }
1619
1620       for (int i = 0 ; i < 3; i++) {
1621          for (src_reg *iter = &inst->src[i];
1622               iter->reladdr;
1623               iter = iter->reladdr) {
1624             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1625                scratch_loc[iter->nr] = last_scratch;
1626                last_scratch += this->alloc.sizes[iter->nr];
1627             }
1628          }
1629       }
1630    }
1631
1632    /* Now, for anything that will be accessed through scratch, rewrite
1633     * it to load/store.  Note that this is a _safe list walk, because
1634     * we may generate a new scratch_write instruction after the one
1635     * we're processing.
1636     */
1637    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1638       /* Set up the annotation tracking for new generated instructions. */
1639       base_ir = inst->ir;
1640       current_annotation = inst->annotation;
1641
1642       /* First handle scratch access on the dst. Notice we have to handle
1643        * the case where the dst's reladdr also points to scratch space.
1644        */
1645       if (inst->dst.reladdr)
1646          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1647                                                    *inst->dst.reladdr);
1648
1649       /* Now that we have handled any (possibly recursive) reladdr scratch
1650        * accesses for dst we can safely do the scratch write for dst itself
1651        */
1652       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1653          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1654
1655       /* Now handle scratch access on any src. In this case, since inst->src[i]
1656        * already is a src_reg, we can just call emit_resolve_reladdr with
1657        * inst->src[i] and it will take care of handling scratch loads for
1658        * both src and src.reladdr (recursively).
1659        */
1660       for (int i = 0 ; i < 3; i++) {
1661          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1662                                              inst->src[i]);
1663       }
1664    }
1665 }
1666
1667 /**
1668  * Emits an instruction before @inst to load the value named by @orig_src
1669  * from the pull constant buffer (surface) at @base_offset to @temp.
1670  */
1671 void
1672 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1673                                       dst_reg temp, src_reg orig_src,
1674                                       int base_offset)
1675 {
1676    int reg_offset = base_offset + orig_src.reg_offset;
1677    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1678    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1679                                              reg_offset);
1680
1681    emit_pull_constant_load_reg(temp,
1682                                brw_imm_ud(index),
1683                                offset,
1684                                block, inst);
1685
1686    brw_mark_surface_used(&prog_data->base, index);
1687 }
1688
1689 /**
1690  * Implements array access of uniforms by inserting a
1691  * PULL_CONSTANT_LOAD instruction.
1692  *
1693  * Unlike temporary GRF array access (where we don't support it due to
1694  * the difficulty of doing relative addressing on instruction
1695  * destinations), we could potentially do array access of uniforms
1696  * that were loaded in GRF space as push constants.  In real-world
1697  * usage we've seen, though, the arrays being used are always larger
1698  * than we could load as push constants, so just always move all
1699  * uniform array access out to a pull constant buffer.
1700  */
1701 void
1702 vec4_visitor::move_uniform_array_access_to_pull_constants()
1703 {
1704    int pull_constant_loc[this->uniforms];
1705    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1706    bool nested_reladdr;
1707
1708    /* Walk through and find array access of uniforms.  Put a copy of that
1709     * uniform in the pull constant buffer.
1710     *
1711     * Note that we don't move constant-indexed accesses to arrays.  No
1712     * testing has been done of the performance impact of this choice.
1713     */
1714    do {
1715       nested_reladdr = false;
1716
1717       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1718          for (int i = 0 ; i < 3; i++) {
1719             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1720                continue;
1721
1722             int uniform = inst->src[i].nr;
1723
1724             if (inst->src[i].reladdr->reladdr)
1725                nested_reladdr = true;  /* will need another pass */
1726
1727             /* If this array isn't already present in the pull constant buffer,
1728              * add it.
1729              */
1730             if (pull_constant_loc[uniform] == -1) {
1731                const gl_constant_value **values =
1732                   &stage_prog_data->param[uniform * 4];
1733
1734                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1735
1736                assert(uniform < uniform_array_size);
1737                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1738                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1739                      = values[j];
1740                }
1741             }
1742
1743             /* Set up the annotation tracking for new generated instructions. */
1744             base_ir = inst->ir;
1745             current_annotation = inst->annotation;
1746
1747             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1748
1749             emit_pull_constant_load(block, inst, temp, inst->src[i],
1750                                     pull_constant_loc[uniform]);
1751
1752             inst->src[i].file = temp.file;
1753             inst->src[i].nr = temp.nr;
1754             inst->src[i].reg_offset = temp.reg_offset;
1755             inst->src[i].reladdr = NULL;
1756          }
1757       }
1758    } while (nested_reladdr);
1759
1760    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1761     * no need to track them as larger-than-vec4 objects.  This will be
1762     * relied on in cutting out unused uniform vectors from push
1763     * constants.
1764     */
1765    split_uniform_registers();
1766 }
1767
1768 void
1769 vec4_visitor::resolve_ud_negate(src_reg *reg)
1770 {
1771    if (reg->type != BRW_REGISTER_TYPE_UD ||
1772        !reg->negate)
1773       return;
1774
1775    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1776    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1777    *reg = temp;
1778 }
1779
1780 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1781                            void *log_data,
1782                            const struct brw_sampler_prog_key_data *key_tex,
1783                            struct brw_vue_prog_data *prog_data,
1784                            const nir_shader *shader,
1785                            void *mem_ctx,
1786                            bool no_spills,
1787                            int shader_time_index)
1788    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1789      key_tex(key_tex),
1790      prog_data(prog_data),
1791      fail_msg(NULL),
1792      first_non_payload_grf(0),
1793      need_all_constants_in_pull_buffer(false),
1794      no_spills(no_spills),
1795      shader_time_index(shader_time_index),
1796      last_scratch(0)
1797 {
1798    this->failed = false;
1799
1800    this->base_ir = NULL;
1801    this->current_annotation = NULL;
1802    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1803
1804    this->virtual_grf_start = NULL;
1805    this->virtual_grf_end = NULL;
1806    this->live_intervals = NULL;
1807
1808    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1809
1810    this->uniforms = 0;
1811
1812    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1813     * at least one. See setup_uniforms() in brw_vec4.cpp.
1814     */
1815    this->uniform_array_size = 1;
1816    if (prog_data) {
1817       this->uniform_array_size =
1818          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1819    }
1820
1821    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1822 }
1823
1824 vec4_visitor::~vec4_visitor()
1825 {
1826 }
1827
1828
1829 void
1830 vec4_visitor::fail(const char *format, ...)
1831 {
1832    va_list va;
1833    char *msg;
1834
1835    if (failed)
1836       return;
1837
1838    failed = true;
1839
1840    va_start(va, format);
1841    msg = ralloc_vasprintf(mem_ctx, format, va);
1842    va_end(va);
1843    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1844
1845    this->fail_msg = msg;
1846
1847    if (debug_enabled) {
1848       fprintf(stderr, "%s",  msg);
1849    }
1850 }
1851
1852 } /* namespace brw */