src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  34                                    const src_reg &src0, const src_reg &src1,
  35                                    const src_reg &src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->saturate = false;
  43    this->force_writemask_all = false;
  44    this->no_dd_clear = false;
  45    this->no_dd_check = false;
  46    this->writes_accumulator = false;
  47    this->conditional_mod = BRW_CONDITIONAL_NONE;
  48    this->predicate = BRW_PREDICATE_NONE;
  49    this->predicate_inverse = false;
  50    this->target = 0;
  51    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  52    this->shadow_compare = false;
  53    this->ir = NULL;
  54    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  55    this->header_size = 0;
  56    this->flag_subreg = 0;
  57    this->mlen = 0;
  58    this->base_mrf = 0;
  59    this->offset = 0;
  60    this->annotation = NULL;
  61 }
  62
  63 vec4_instruction *
  64 vec4_visitor::emit(vec4_instruction *inst)
  65 {
  66    inst->ir = this->base_ir;
  67    inst->annotation = this->current_annotation;
  68
  69    this->instructions.push_tail(inst);
  70
  71    return inst;
  72 }
  73
  74 vec4_instruction *
  75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  76                           vec4_instruction *new_inst)
  77 {
  78    new_inst->ir = inst->ir;
  79    new_inst->annotation = inst->annotation;
  80
  81    inst->insert_before(block, new_inst);
  82
  83    return inst;
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  88                    const src_reg &src1, const src_reg &src2)
  89 {
  90    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  91 }
  92
  93
  94 vec4_instruction *
  95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  96                    const src_reg &src1)
  97 {
  98    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  99 }
 100
 101 vec4_instruction *
 102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 103 {
 104    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 105 }
 106
 107 vec4_instruction *
 108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 109 {
 110    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 111 }
 112
 113 vec4_instruction *
 114 vec4_visitor::emit(enum opcode opcode)
 115 {
 116    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 117 }
 118
 119 #define ALU1(op)                                                        \
 120    vec4_instruction *                                                   \
 121    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 122    {                                                                    \
 123       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 124    }
 125
 126 #define ALU2(op)                                                        \
 127    vec4_instruction *                                                   \
 128    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 129                     const src_reg &src1)                                \
 130    {                                                                    \
 131       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 132                                            src0, src1);                 \
 133    }
 134
 135 #define ALU2_ACC(op)                                                    \
 136    vec4_instruction *                                                   \
 137    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 138                     const src_reg &src1)                                \
 139    {                                                                    \
 140       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 141                        BRW_OPCODE_##op, dst, src0, src1);               \
 142       inst->writes_accumulator = true;                                  \
 143       return inst;                                                      \
 144    }
 145
 146 #define ALU3(op)                                                        \
 147    vec4_instruction *                                                   \
 148    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 149                     const src_reg &src1, const src_reg &src2)           \
 150    {                                                                    \
 151       assert(devinfo->gen >= 6);                                                \
 152       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 153                                            src0, src1, src2);           \
 154    }
 155
 156 ALU1(NOT)
 157 ALU1(MOV)
 158 ALU1(FRC)
 159 ALU1(RNDD)
 160 ALU1(RNDE)
 161 ALU1(RNDZ)
 162 ALU1(F32TO16)
 163 ALU1(F16TO32)
 164 ALU2(ADD)
 165 ALU2(MUL)
 166 ALU2_ACC(MACH)
 167 ALU2(AND)
 168 ALU2(OR)
 169 ALU2(XOR)
 170 ALU2(DP3)
 171 ALU2(DP4)
 172 ALU2(DPH)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2_ACC(ADDC)
 186 ALU2_ACC(SUBB)
 187 ALU2(MAC)
 188
 189 /** Gen4 predicated IF. */
 190 vec4_instruction *
 191 vec4_visitor::IF(enum brw_predicate predicate)
 192 {
 193    vec4_instruction *inst;
 194
 195    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 196    inst->predicate = predicate;
 197
 198    return inst;
 199 }
 200
 201 /** Gen6 IF with embedded comparison. */
 202 vec4_instruction *
 203 vec4_visitor::IF(src_reg src0, src_reg src1,
 204                  enum brw_conditional_mod condition)
 205 {
 206    assert(devinfo->gen == 6);
 207
 208    vec4_instruction *inst;
 209
 210    resolve_ud_negate(&src0);
 211    resolve_ud_negate(&src1);
 212
 213    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 214                                         src0, src1);
 215    inst->conditional_mod = condition;
 216
 217    return inst;
 218 }
 219
 220 /**
 221  * CMP: Sets the low bit of the destination channels with the result
 222  * of the comparison, while the upper bits are undefined, and updates
 223  * the flag register with the packed 16 bits of the result.
 224  */
 225 vec4_instruction *
 226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 227                   enum brw_conditional_mod condition)
 228 {
 229    vec4_instruction *inst;
 230
 231    /* Take the instruction:
 232     *
 233     * CMP null<d> src0<f> src1<f>
 234     *
 235     * Original gen4 does type conversion to the destination type before
 236     * comparison, producing garbage results for floating point comparisons.
 237     *
 238     * The destination type doesn't matter on newer generations, so we set the
 239     * type to match src0 so we can compact the instruction.
 240     */
 241    dst.type = src0.type;
 242    if (dst.file == HW_REG)
 243       dst.fixed_hw_reg.type = dst.type;
 244
 245    resolve_ud_negate(&src0);
 246    resolve_ud_negate(&src1);
 247
 248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 249    inst->conditional_mod = condition;
 250
 251    return inst;
 252 }
 253
 254 vec4_instruction *
 255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 256 {
 257    vec4_instruction *inst;
 258
 259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 260                                         dst, index);
 261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 262    inst->mlen = 2;
 263
 264    return inst;
 265 }
 266
 267 vec4_instruction *
 268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 269                             const src_reg &index)
 270 {
 271    vec4_instruction *inst;
 272
 273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 274                                         dst, src, index);
 275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 276    inst->mlen = 3;
 277
 278    return inst;
 279 }
 280
 281 void
 282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 283 {
 284    static enum opcode dot_opcodes[] = {
 285       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 286    };
 287
 288    emit(dot_opcodes[elements - 2], dst, src0, src1);
 289 }
 290
 291 src_reg
 292 vec4_visitor::fix_3src_operand(const src_reg &src)
 293 {
 294    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 295     * able to use vertical stride of zero to replicate the vec4 uniform, like
 296     *
 297     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 298     *
 299     * But you can't, since vertical stride is always four in three-source
 300     * instructions. Instead, insert a MOV instruction to do the replication so
 301     * that the three-source instruction can consume it.
 302     */
 303
 304    /* The MOV is only needed if the source is a uniform or immediate. */
 305    if (src.file != UNIFORM && src.file != IMM)
 306       return src;
 307
 308    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 309       return src;
 310
 311    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 312    expanded.type = src.type;
 313    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 314    return src_reg(expanded);
 315 }
 316
 317 src_reg
 318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 319 {
 320    if (!src.abs && !src.negate)
 321       return src;
 322
 323    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 324    resolved.type = src.type;
 325    emit(MOV(resolved, src));
 326
 327    return src_reg(resolved);
 328 }
 329
 330 src_reg
 331 vec4_visitor::fix_math_operand(const src_reg &src)
 332 {
 333    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 334       return src;
 335
 336    /* The gen6 math instruction ignores the source modifiers --
 337     * swizzle, abs, negate, and at least some parts of the register
 338     * region description.
 339     *
 340     * Rather than trying to enumerate all these cases, *always* expand the
 341     * operand to a temp GRF for gen6.
 342     *
 343     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 344     * can't use.
 345     */
 346
 347    if (devinfo->gen == 7 && src.file != IMM)
 348       return src;
 349
 350    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 351    expanded.type = src.type;
 352    emit(MOV(expanded, src));
 353    return src_reg(expanded);
 354 }
 355
 356 vec4_instruction *
 357 vec4_visitor::emit_math(enum opcode opcode,
 358                         const dst_reg &dst,
 359                         const src_reg &src0, const src_reg &src1)
 360 {
 361    vec4_instruction *math =
 362       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 363
 364    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 365       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 366       math->dst = dst_reg(this, glsl_type::vec4_type);
 367       math->dst.type = dst.type;
 368       math = emit(MOV(dst, src_reg(math->dst)));
 369    } else if (devinfo->gen < 6) {
 370       math->base_mrf = 1;
 371       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 372    }
 373
 374    return math;
 375 }
 376
 377 void
 378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 379 {
 380    if (devinfo->gen < 7) {
 381       unreachable("ir_unop_pack_half_2x16 should be lowered");
 382    }
 383
 384    assert(dst.type == BRW_REGISTER_TYPE_UD);
 385    assert(src0.type == BRW_REGISTER_TYPE_F);
 386
 387    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 388     *
 389     *   Because this instruction does not have a 16-bit floating-point type,
 390     *   the destination data type must be Word (W).
 391     *
 392     *   The destination must be DWord-aligned and specify a horizontal stride
 393     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 394     *   each destination channel and the upper word is not modified.
 395     *
 396     * The above restriction implies that the f32to16 instruction must use
 397     * align1 mode, because only in align1 mode is it possible to specify
 398     * horizontal stride.  We choose here to defy the hardware docs and emit
 399     * align16 instructions.
 400     *
 401     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 402     * instructions. I was partially successful in that the code passed all
 403     * tests.  However, the code was dubiously correct and fragile, and the
 404     * tests were not harsh enough to probe that frailty. Not trusting the
 405     * code, I chose instead to remain in align16 mode in defiance of the hw
 406     * docs).
 407     *
 408     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 409     * simulator, emitting a f32to16 in align16 mode with UD as destination
 410     * data type is safe. The behavior differs from that specified in the PRM
 411     * in that the upper word of each destination channel is cleared to 0.
 412     */
 413
 414    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 415    src_reg tmp_src(tmp_dst);
 416
 417 #if 0
 418    /* Verify the undocumented behavior on which the following instructions
 419     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 420     * then the result of the bit-or instruction below will be incorrect.
 421     *
 422     * You should inspect the disasm output in order to verify that the MOV is
 423     * not optimized away.
 424     */
 425    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 426 #endif
 427
 428    /* Give tmp the form below, where "." means untouched.
 429     *
 430     *     w z          y          x w z          y          x
 431     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 432     *
 433     * That the upper word of each write-channel be 0 is required for the
 434     * following bit-shift and bit-or instructions to work. Note that this
 435     * relies on the undocumented hardware behavior mentioned above.
 436     */
 437    tmp_dst.writemask = WRITEMASK_XY;
 438    emit(F32TO16(tmp_dst, src0));
 439
 440    /* Give the write-channels of dst the form:
 441     *   0xhhhh0000
 442     */
 443    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 444    emit(SHL(dst, tmp_src, src_reg(16u)));
 445
 446    /* Finally, give the write-channels of dst the form of packHalf2x16's
 447     * output:
 448     *   0xhhhhllll
 449     */
 450    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 451    emit(OR(dst, src_reg(dst), tmp_src));
 452 }
 453
 454 void
 455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 456 {
 457    if (devinfo->gen < 7) {
 458       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 459    }
 460
 461    assert(dst.type == BRW_REGISTER_TYPE_F);
 462    assert(src0.type == BRW_REGISTER_TYPE_UD);
 463
 464    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 465     *
 466     *   Because this instruction does not have a 16-bit floating-point type,
 467     *   the source data type must be Word (W). The destination type must be
 468     *   F (Float).
 469     *
 470     * To use W as the source data type, we must adjust horizontal strides,
 471     * which is only possible in align1 mode. All my [chadv] attempts at
 472     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 473     * Piglit tests, so I gave up.
 474     *
 475     * I've verified that, on gen7 hardware and the simulator, it is safe to
 476     * emit f16to32 in align16 mode with UD as source data type.
 477     */
 478
 479    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 480    src_reg tmp_src(tmp_dst);
 481
 482    tmp_dst.writemask = WRITEMASK_X;
 483    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 484
 485    tmp_dst.writemask = WRITEMASK_Y;
 486    emit(SHR(tmp_dst, src0, src_reg(16u)));
 487
 488    dst.writemask = WRITEMASK_XY;
 489    emit(F16TO32(dst, tmp_src));
 490 }
 491
 492 void
 493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 494 {
 495    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 496     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 497     * is not suitable to generate the shift values, but we can use the packed
 498     * vector float and a type-converting MOV.
 499     */
 500    dst_reg shift(this, glsl_type::uvec4_type);
 501    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 502
 503    dst_reg shifted(this, glsl_type::uvec4_type);
 504    src0.swizzle = BRW_SWIZZLE_XXXX;
 505    emit(SHR(shifted, src0, src_reg(shift)));
 506
 507    shifted.type = BRW_REGISTER_TYPE_UB;
 508    dst_reg f(this, glsl_type::vec4_type);
 509    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 510
 511    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 512 }
 513
 514 void
 515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 516 {
 517    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 518     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 519     * is not suitable to generate the shift values, but we can use the packed
 520     * vector float and a type-converting MOV.
 521     */
 522    dst_reg shift(this, glsl_type::uvec4_type);
 523    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 524
 525    dst_reg shifted(this, glsl_type::uvec4_type);
 526    src0.swizzle = BRW_SWIZZLE_XXXX;
 527    emit(SHR(shifted, src0, src_reg(shift)));
 528
 529    shifted.type = BRW_REGISTER_TYPE_B;
 530    dst_reg f(this, glsl_type::vec4_type);
 531    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 532
 533    dst_reg scaled(this, glsl_type::vec4_type);
 534    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 535
 536    dst_reg max(this, glsl_type::vec4_type);
 537    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 538    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 539 }
 540
 541 void
 542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 543 {
 544    dst_reg saturated(this, glsl_type::vec4_type);
 545    vec4_instruction *inst = emit(MOV(saturated, src0));
 546    inst->saturate = true;
 547
 548    dst_reg scaled(this, glsl_type::vec4_type);
 549    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 550
 551    dst_reg rounded(this, glsl_type::vec4_type);
 552    emit(RNDE(rounded, src_reg(scaled)));
 553
 554    dst_reg u(this, glsl_type::uvec4_type);
 555    emit(MOV(u, src_reg(rounded)));
 556
 557    src_reg bytes(u);
 558    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 559 }
 560
 561 void
 562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 563 {
 564    dst_reg max(this, glsl_type::vec4_type);
 565    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 566
 567    dst_reg min(this, glsl_type::vec4_type);
 568    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 569
 570    dst_reg scaled(this, glsl_type::vec4_type);
 571    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 572
 573    dst_reg rounded(this, glsl_type::vec4_type);
 574    emit(RNDE(rounded, src_reg(scaled)));
 575
 576    dst_reg i(this, glsl_type::ivec4_type);
 577    emit(MOV(i, src_reg(rounded)));
 578
 579    src_reg bytes(i);
 580    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 581 }
 582
 583 /**
 584  * Returns the minimum number of vec4 elements needed to pack a type.
 585  *
 586  * For simple types, it will return 1 (a single vec4); for matrices, the
 587  * number of columns; for array and struct, the sum of the vec4_size of
 588  * each of its elements; and for sampler and atomic, zero.
 589  *
 590  * This method is useful to calculate how much register space is needed to
 591  * store a particular type.
 592  */
 593 extern "C" int
 594 type_size_vec4(const struct glsl_type *type)
 595 {
 596    unsigned int i;
 597    int size;
 598
 599    switch (type->base_type) {
 600    case GLSL_TYPE_UINT:
 601    case GLSL_TYPE_INT:
 602    case GLSL_TYPE_FLOAT:
 603    case GLSL_TYPE_BOOL:
 604       if (type->is_matrix()) {
 605          return type->matrix_columns;
 606       } else {
 607          /* Regardless of size of vector, it gets a vec4. This is bad
 608           * packing for things like floats, but otherwise arrays become a
 609           * mess.  Hopefully a later pass over the code can pack scalars
 610           * down if appropriate.
 611           */
 612          return 1;
 613       }
 614    case GLSL_TYPE_ARRAY:
 615       assert(type->length > 0);
 616       return type_size_vec4(type->fields.array) * type->length;
 617    case GLSL_TYPE_STRUCT:
 618       size = 0;
 619       for (i = 0; i < type->length; i++) {
 620          size += type_size_vec4(type->fields.structure[i].type);
 621       }
 622       return size;
 623    case GLSL_TYPE_SUBROUTINE:
 624       return 1;
 625
 626    case GLSL_TYPE_SAMPLER:
 627       /* Samplers take up no register space, since they're baked in at
 628        * link time.
 629        */
 630       return 0;
 631    case GLSL_TYPE_ATOMIC_UINT:
 632       return 0;
 633    case GLSL_TYPE_IMAGE:
 634       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 635    case GLSL_TYPE_VOID:
 636    case GLSL_TYPE_DOUBLE:
 637    case GLSL_TYPE_ERROR:
 638    case GLSL_TYPE_INTERFACE:
 639    case GLSL_TYPE_FUNCTION:
 640       unreachable("not reached");
 641    }
 642
 643    return 0;
 644 }
 645
 646 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 647 {
 648    init();
 649
 650    this->file = GRF;
 651    this->reg = v->alloc.allocate(type_size_vec4(type));
 652
 653    if (type->is_array() || type->is_record()) {
 654       this->swizzle = BRW_SWIZZLE_NOOP;
 655    } else {
 656       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 657    }
 658
 659    this->type = brw_type_for_base_type(type);
 660 }
 661
 662 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 663 {
 664    assert(size > 0);
 665
 666    init();
 667
 668    this->file = GRF;
 669    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 670
 671    this->swizzle = BRW_SWIZZLE_NOOP;
 672
 673    this->type = brw_type_for_base_type(type);
 674 }
 675
 676 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 677 {
 678    init();
 679
 680    this->file = GRF;
 681    this->reg = v->alloc.allocate(type_size_vec4(type));
 682
 683    if (type->is_array() || type->is_record()) {
 684       this->writemask = WRITEMASK_XYZW;
 685    } else {
 686       this->writemask = (1 << type->vector_elements) - 1;
 687    }
 688
 689    this->type = brw_type_for_base_type(type);
 690 }
 691
 692 vec4_instruction *
 693 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 694                           src_reg src0, src_reg src1)
 695 {
 696    vec4_instruction *inst;
 697
 698    if (devinfo->gen >= 6) {
 699       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 700       inst->conditional_mod = conditionalmod;
 701    } else {
 702       emit(CMP(dst, src0, src1, conditionalmod));
 703
 704       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 705       inst->predicate = BRW_PREDICATE_NORMAL;
 706    }
 707
 708    return inst;
 709 }
 710
 711 vec4_instruction *
 712 vec4_visitor::emit_lrp(const dst_reg &dst,
 713                        const src_reg &x, const src_reg &y, const src_reg &a)
 714 {
 715    if (devinfo->gen >= 6) {
 716       /* Note that the instruction's argument order is reversed from GLSL
 717        * and the IR.
 718        */
 719      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 720                      fix_3src_operand(x)));
 721    } else {
 722       /* Earlier generations don't support three source operations, so we
 723        * need to emit x*(1-a) + y*a.
 724        */
 725       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 726       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 727       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 728       y_times_a.writemask           = dst.writemask;
 729       one_minus_a.writemask         = dst.writemask;
 730       x_times_one_minus_a.writemask = dst.writemask;
 731
 732       emit(MUL(y_times_a, y, a));
 733       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
 734       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 735       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 736    }
 737 }
 738
 739 /**
 740  * Emits the instructions needed to perform a pull constant load. before_block
 741  * and before_inst can be NULL in which case the instruction will be appended
 742  * to the end of the instruction list.
 743  */
 744 void
 745 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 746                                           src_reg surf_index,
 747                                           src_reg offset_reg,
 748                                           bblock_t *before_block,
 749                                           vec4_instruction *before_inst)
 750 {
 751    assert((before_inst == NULL && before_block == NULL) ||
 752           (before_inst && before_block));
 753
 754    vec4_instruction *pull;
 755
 756    if (devinfo->gen >= 9) {
 757       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 758       src_reg header(this, glsl_type::uvec4_type, 2);
 759
 760       pull = new(mem_ctx)
 761          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 762                           dst_reg(header));
 763
 764       if (before_inst)
 765          emit_before(before_block, before_inst, pull);
 766       else
 767          emit(pull);
 768
 769       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 770                                  offset_reg.type);
 771       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 772
 773       if (before_inst)
 774          emit_before(before_block, before_inst, pull);
 775       else
 776          emit(pull);
 777
 778       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 779                                            dst,
 780                                            surf_index,
 781                                            header);
 782       pull->mlen = 2;
 783       pull->header_size = 1;
 784    } else if (devinfo->gen >= 7) {
 785       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 786
 787       grf_offset.type = offset_reg.type;
 788
 789       pull = MOV(grf_offset, offset_reg);
 790
 791       if (before_inst)
 792          emit_before(before_block, before_inst, pull);
 793       else
 794          emit(pull);
 795
 796       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 797                                            dst,
 798                                            surf_index,
 799                                            src_reg(grf_offset));
 800       pull->mlen = 1;
 801    } else {
 802       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 803                                            dst,
 804                                            surf_index,
 805                                            offset_reg);
 806       pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 807       pull->mlen = 1;
 808    }
 809
 810    if (before_inst)
 811       emit_before(before_block, before_inst, pull);
 812    else
 813       emit(pull);
 814 }
 815
 816 src_reg
 817 vec4_visitor::emit_uniformize(const src_reg &src)
 818 {
 819    const src_reg chan_index(this, glsl_type::uint_type);
 820    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 821                               src.type);
 822
 823    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 824       ->force_writemask_all = true;
 825    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 826       ->force_writemask_all = true;
 827
 828    return src_reg(dst);
 829 }
 830
 831 src_reg
 832 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 833                              src_reg coordinate, src_reg sampler)
 834 {
 835    vec4_instruction *inst =
 836       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 837                                     dst_reg(this, glsl_type::uvec4_type));
 838    inst->base_mrf = 2;
 839    inst->src[1] = sampler;
 840
 841    int param_base;
 842
 843    if (devinfo->gen >= 9) {
 844       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 845       vec4_instruction *header_inst = new(mem_ctx)
 846          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 847                           dst_reg(MRF, inst->base_mrf));
 848
 849       emit(header_inst);
 850
 851       inst->mlen = 2;
 852       inst->header_size = 1;
 853       param_base = inst->base_mrf + 1;
 854    } else {
 855       inst->mlen = 1;
 856       param_base = inst->base_mrf;
 857    }
 858
 859    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 860    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 861    int zero_mask = 0xf & ~coord_mask;
 862
 863    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 864             coordinate));
 865
 866    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 867             src_reg(0)));
 868
 869    emit(inst);
 870    return src_reg(inst->dst);
 871 }
 872
 873 bool
 874 vec4_visitor::is_high_sampler(src_reg sampler)
 875 {
 876    if (devinfo->gen < 8 && !devinfo->is_haswell)
 877       return false;
 878
 879    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 880 }
 881
/**
 * Emit a sampler message for texture operation \p op and write the
 * (swizzled) result to \p dest.
 *
 * The payload is assembled in MRFs starting at MRF 2: an optional header
 * followed by one or more parameter registers whose layout depends on
 * \p op and on the hardware generation.  inst->mlen is bumped each time a
 * new payload register is started.
 *
 * \param op                texture operation; selects the message opcode.
 * \param dest              final destination, written by swizzle_result().
 * \param dest_type         type used to allocate the raw message result.
 * \param coordinate        texture coordinate (unused for txs,
 *                          query_levels and texture_samples).
 * \param coord_components  number of coordinate channels to copy; the
 *                          remaining channels of that MRF are zeroed.
 * \param shadow_comparitor shadow reference value, BAD_FILE when absent.
 * \param lod               LOD for tex/txl/txf/txs/query_levels; first
 *                          operand of txd.
 * \param lod2              second operand of txd.
 * \param sample_index      sample index for txf_ms.
 * \param constant_offset   stored in inst->offset; non-zero forces a header.
 * \param offset_value      offset register for tg4, BAD_FILE when absent.
 * \param mcs               MCS data for txf_ms (used on Gen7+).
 * \param is_cube_array     enables the txs layer-count fixup.
 * \param sampler           index into the key_tex per-sampler arrays.
 * \param sampler_reg       stored as src[1] of the message.
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparitor,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           bool is_cube_array,
                           uint32_t sampler,
                           src_reg sampler_reg)
{
   /* Map the IR texture op onto a backend opcode.  ir_tex shares TXL with
    * ir_txl, and ir_query_levels shares TXS with ir_txs.
    */
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   default:
      unreachable("Unrecognized tex op");
   }

   /* The instruction is created now but only emitted once its payload
    * MOVs have been emitted.
    */
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
      opcode, dst_reg(this, dest_type));

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;

   inst->src[1] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      /* Size queries send only the LOD: in .w on Gen4, in .x otherwise. */
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      /* No parameters are sent; only the .x channel of the result is kept. */
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      /* Zero the channels of the first parameter MRF that the coordinate
       * does not cover.
       */
      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor */
      /* txd and tg4-with-offset are excluded here; each places the
       * comparitor itself in its own branch below.
       */
      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            /* Gen5+: LOD goes in the second parameter MRF, after the
             * shadow comparitor when there is one.
             */
            mrf = param_base + 1;
            if (shadow_comparitor.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            /* Gen4: LOD shares the coordinate MRF, in .w. */
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         /* txf: LOD goes in .w of the coordinate MRF; mlen is unchanged. */
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            /* Interleave the x/y components of lod and lod2 into one MRF:
             * lod.x, lod2.x, lod.y, lod2.y.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            /* A further MRF carries lod.z, lod2.z and, if present, the
             * shadow comparitor in .z.
             *
             * NOTE(review): this tests the width of dest_type rather than
             * coord_components -- confirm that is the intended trigger for
             * sending the z components.
             */
            if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparitor.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparitor.type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* devinfo->gen == 4 */ {
            /* Gen4: lod and lod2 each get their own MRF (xyz channels). */
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         /* Gather with an offset register: the comparitor (if any) goes in
          * .w of the coordinate MRF and the offset in .xy of the next MRF.
          */
         if (shadow_comparitor.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   /* The payload is complete; emit the sampler message itself. */
   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && is_cube_array) {
      emit_math(SHADER_OPCODE_INT_QUOTIENT,
                writemask(inst->dst, WRITEMASK_Z),
                src_reg(inst->dst), src_reg(6));
   }

   /* Gen6 gather results may need the per-sampler workaround from the key. */
   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
   }

   /* Copy the raw result to the real destination, applying the sampler's
    * swizzle from the key where applicable.
    */
   swizzle_result(op, dest,
                  src_reg(inst->dst), sampler, dest_type);
}
1064
1065 /**
1066  * Apply workarounds for Gen6 gather with UINT/SINT
1067  */
1068 void
1069 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1070 {
1071    if (!wa)
1072       return;
1073
1074    int width = (wa & WA_8BIT) ? 8 : 16;
1075    dst_reg dst_f = dst;
1076    dst_f.type = BRW_REGISTER_TYPE_F;
1077
1078    /* Convert from UNORM to UINT */
1079    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1080    emit(MOV(dst, src_reg(dst_f)));
1081
1082    if (wa & WA_SIGN) {
1083       /* Reinterpret the UINT value as a signed INT value by
1084        * shifting the sign bit into place, then shifting back
1085        * preserving sign.
1086        */
1087       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1088       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1089    }
1090 }
1091
1092 /**
1093  * Set up the gather channel based on the swizzle, for gather4.
1094  */
1095 uint32_t
1096 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1097 {
1098    int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1099    switch (swiz) {
1100       case SWIZZLE_X: return 0;
1101       case SWIZZLE_Y:
1102          /* gather4 sampler is broken for green channel on RG32F --
1103           * we must ask for blue instead.
1104           */
1105          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1106             return 2;
1107          return 1;
1108       case SWIZZLE_Z: return 2;
1109       case SWIZZLE_W: return 3;
1110       default:
1111          unreachable("Not reached"); /* zero, one swizzles handled already */
1112    }
1113 }
1114
1115 void
1116 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1117                              src_reg orig_val, uint32_t sampler,
1118                              const glsl_type *dest_type)
1119 {
1120    int s = key_tex->swizzles[sampler];
1121
1122    dst_reg swizzled_result = dest;
1123
1124    if (op == ir_query_levels) {
1125       /* # levels is in .w */
1126       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1127       emit(MOV(swizzled_result, orig_val));
1128       return;
1129    }
1130
1131    if (op == ir_txs || dest_type == glsl_type::float_type
1132                         || s == SWIZZLE_NOOP || op == ir_tg4) {
1133       emit(MOV(swizzled_result, orig_val));
1134       return;
1135    }
1136
1137
1138    int zero_mask = 0, one_mask = 0, copy_mask = 0;
1139    int swizzle[4] = {0};
1140
1141    for (int i = 0; i < 4; i++) {
1142       switch (GET_SWZ(s, i)) {
1143       case SWIZZLE_ZERO:
1144          zero_mask |= (1 << i);
1145          break;
1146       case SWIZZLE_ONE:
1147          one_mask |= (1 << i);
1148          break;
1149       default:
1150          copy_mask |= (1 << i);
1151          swizzle[i] = GET_SWZ(s, i);
1152          break;
1153       }
1154    }
1155
1156    if (copy_mask) {
1157       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1158       swizzled_result.writemask = copy_mask;
1159       emit(MOV(swizzled_result, orig_val));
1160    }
1161
1162    if (zero_mask) {
1163       swizzled_result.writemask = zero_mask;
1164       emit(MOV(swizzled_result, src_reg(0.0f)));
1165    }
1166
1167    if (one_mask) {
1168       swizzled_result.writemask = one_mask;
1169       emit(MOV(swizzled_result, src_reg(1.0f)));
1170    }
1171 }
1172
/**
 * Geometry-shader vertex emission hook.
 *
 * This implementation is marked unreachable and ignores \p stream_id;
 * presumably a geometry-shader visitor overrides it -- TODO confirm.
 */
void
vec4_visitor::gs_emit_vertex(int stream_id)
{
   unreachable("not reached");
}
1178
/**
 * Geometry-shader end-of-primitive hook.
 *
 * This implementation is marked unreachable; presumably a geometry-shader
 * visitor overrides it -- TODO confirm.
 */
void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}
1184
1185 void
1186 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1187                                   dst_reg dst, src_reg offset,
1188                                   src_reg src0, src_reg src1)
1189 {
1190    unsigned mlen = 0;
1191
1192    /* Set the atomic operation offset. */
1193    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1194    mlen++;
1195
1196    /* Set the atomic operation arguments. */
1197    if (src0.file != BAD_FILE) {
1198       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1199       mlen++;
1200    }
1201
1202    if (src1.file != BAD_FILE) {
1203       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1204       mlen++;
1205    }
1206
1207    /* Emit the instruction.  Note that this maps to the normal SIMD8
1208     * untyped atomic message on Ivy Bridge, but that's OK because
1209     * unused channels will be masked out.
1210     */
1211    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1212                                  brw_message_reg(0),
1213                                  src_reg(surf_index), src_reg(atomic_op));
1214    inst->mlen = mlen;
1215 }
1216
1217 void
1218 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1219                                         src_reg offset)
1220 {
1221    /* Set the surface read offset. */
1222    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1223
1224    /* Emit the instruction.  Note that this maps to the normal SIMD8
1225     * untyped surface read message, but that's OK because unused
1226     * channels will be masked out.
1227     */
1228    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1229                                  brw_message_reg(0),
1230                                  src_reg(surf_index), src_reg(1));
1231    inst->mlen = 1;
1232 }
1233
1234 void
1235 vec4_visitor::emit_ndc_computation()
1236 {
1237    /* Get the position */
1238    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1239
1240    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1241    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1242    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1243
1244    current_annotation = "NDC";
1245    dst_reg ndc_w = ndc;
1246    ndc_w.writemask = WRITEMASK_W;
1247    src_reg pos_w = pos;
1248    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1249    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1250
1251    dst_reg ndc_xyz = ndc;
1252    ndc_xyz.writemask = WRITEMASK_XYZ;
1253
1254    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1255 }
1256
1257 void
1258 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1259 {
1260    if (devinfo->gen < 6 &&
1261        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1262         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1263         devinfo->has_negative_rhw_bug)) {
1264       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1265       dst_reg header1_w = header1;
1266       header1_w.writemask = WRITEMASK_W;
1267
1268       emit(MOV(header1, 0u));
1269
1270       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1271          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1272
1273          current_annotation = "Point size";
1274          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1275          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1276       }
1277
1278       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1279          current_annotation = "Clipping flags";
1280          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1281          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1282
1283          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1284          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1285          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1286
1287          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1288          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1289          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1290          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1291       }
1292
1293       /* i965 clipping workaround:
1294        * 1) Test for -ve rhw
1295        * 2) If set,
1296        *      set ndc = (0,0,0,0)
1297        *      set ucp[6] = 1
1298        *
1299        * Later, clipping will detect ucp[6] and ensure the primitive is
1300        * clipped against all fixed planes.
1301        */
1302       if (devinfo->has_negative_rhw_bug) {
1303          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1304          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1305          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1306          vec4_instruction *inst;
1307          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1308          inst->predicate = BRW_PREDICATE_NORMAL;
1309          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1310          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1311          inst->predicate = BRW_PREDICATE_NORMAL;
1312       }
1313
1314       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1315    } else if (devinfo->gen < 6) {
1316       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1317    } else {
1318       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1319       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1320          dst_reg reg_w = reg;
1321          reg_w.writemask = WRITEMASK_W;
1322          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1323          reg_as_src.type = reg_w.type;
1324          reg_as_src.swizzle = brw_swizzle_for_size(1);
1325          emit(MOV(reg_w, reg_as_src));
1326       }
1327       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1328          dst_reg reg_y = reg;
1329          reg_y.writemask = WRITEMASK_Y;
1330          reg_y.type = BRW_REGISTER_TYPE_D;
1331          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1332          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1333       }
1334       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1335          dst_reg reg_z = reg;
1336          reg_z.writemask = WRITEMASK_Z;
1337          reg_z.type = BRW_REGISTER_TYPE_D;
1338          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1339          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1340       }
1341    }
1342 }
1343
1344 vec4_instruction *
1345 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1346 {
1347    assert(varying < VARYING_SLOT_MAX);
1348    assert(output_reg[varying].type == reg.type);
1349    current_annotation = output_reg_annotation[varying];
1350    /* Copy the register, saturating if necessary */
1351    return emit(MOV(reg, src_reg(output_reg[varying])));
1352 }
1353
1354 void
1355 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1356 {
1357    reg.type = BRW_REGISTER_TYPE_F;
1358    output_reg[varying].type = reg.type;
1359
1360    switch (varying) {
1361    case VARYING_SLOT_PSIZ:
1362    {
1363       /* PSIZ is always in slot 0, and is coupled with other flags. */
1364       current_annotation = "indices, point width, clip flags";
1365       emit_psiz_and_flags(reg);
1366       break;
1367    }
1368    case BRW_VARYING_SLOT_NDC:
1369       current_annotation = "NDC";
1370       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1371       break;
1372    case VARYING_SLOT_POS:
1373       current_annotation = "gl_Position";
1374       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1375       break;
1376    case VARYING_SLOT_EDGE:
1377       /* This is present when doing unfilled polygons.  We're supposed to copy
1378        * the edge flag from the user-provided vertex array
1379        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1380        * of that attribute (starts as 1.0f).  This is then used in clipping to
1381        * determine which edges should be drawn as wireframe.
1382        */
1383       current_annotation = "edge flag";
1384       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1385                                     glsl_type::float_type, WRITEMASK_XYZW))));
1386       break;
1387    case BRW_VARYING_SLOT_PAD:
1388       /* No need to write to this slot */
1389       break;
1390    default:
1391       emit_generic_urb_slot(reg, varying);
1392       break;
1393    }
1394 }
1395
1396 static int
1397 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1398 {
1399    if (devinfo->gen >= 6) {
1400       /* URB data written (does not include the message header reg) must
1401        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1402        * section 5.4.3.2.2: URB_INTERLEAVED.
1403        *
1404        * URB entries are allocated on a multiple of 1024 bits, so an
1405        * extra 128 bits written here to make the end align to 256 is
1406        * no problem.
1407        */
1408       if ((mlen % 2) != 1)
1409          mlen++;
1410    }
1411
1412    return mlen;
1413 }
1414
1415
1416 /**
1417  * Generates the VUE payload plus the necessary URB write instructions to
1418  * output it.
1419  *
1420  * The VUE layout is documented in Volume 2a.
1421  */
1422 void
1423 vec4_visitor::emit_vertex()
1424 {
1425    /* MRF 0 is reserved for the debugger, so start with message header
1426     * in MRF 1.
1427     */
1428    int base_mrf = 1;
1429    int mrf = base_mrf;
1430    /* In the process of generating our URB write message contents, we
1431     * may need to unspill a register or load from an array.  Those
1432     * reads would use MRFs 14-15.
1433     */
1434    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1435
1436    /* The following assertion verifies that max_usable_mrf causes an
1437     * even-numbered amount of URB write data, which will meet gen6's
1438     * requirements for length alignment.
1439     */
1440    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1441
1442    /* First mrf is the g0-based message header containing URB handles and
1443     * such.
1444     */
1445    emit_urb_write_header(mrf++);
1446
1447    if (devinfo->gen < 6) {
1448       emit_ndc_computation();
1449    }
1450
1451    /* We may need to split this up into several URB writes, so do them in a
1452     * loop.
1453     */
1454    int slot = 0;
1455    bool complete = false;
1456    do {
1457       /* URB offset is in URB row increments, and each of our MRFs is half of
1458        * one of those, since we're doing interleaved writes.
1459        */
1460       int offset = slot / 2;
1461
1462       mrf = base_mrf + 1;
1463       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1464          emit_urb_slot(dst_reg(MRF, mrf++),
1465                        prog_data->vue_map.slot_to_varying[slot]);
1466
1467          /* If this was max_usable_mrf, we can't fit anything more into this
1468           * URB WRITE. Same thing if we reached the maximum length available.
1469           */
1470          if (mrf > max_usable_mrf ||
1471              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1472             slot++;
1473             break;
1474          }
1475       }
1476
1477       complete = slot >= prog_data->vue_map.num_slots;
1478       current_annotation = "URB write";
1479       vec4_instruction *inst = emit_urb_write_opcode(complete);
1480       inst->base_mrf = base_mrf;
1481       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1482       inst->offset += offset;
1483    } while(!complete);
1484 }
1485
1486
1487 src_reg
1488 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1489                                  src_reg *reladdr, int reg_offset)
1490 {
1491    /* Because we store the values to scratch interleaved like our
1492     * vertex data, we need to scale the vec4 index by 2.
1493     */
1494    int message_header_scale = 2;
1495
1496    /* Pre-gen6, the message header uses byte offsets instead of vec4
1497     * (16-byte) offset units.
1498     */
1499    if (devinfo->gen < 6)
1500       message_header_scale *= 16;
1501
1502    if (reladdr) {
1503       src_reg index = src_reg(this, glsl_type::int_type);
1504
1505       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1506                                    src_reg(reg_offset)));
1507       emit_before(block, inst, MUL(dst_reg(index), index,
1508                                    src_reg(message_header_scale)));
1509
1510       return index;
1511    } else {
1512       return src_reg(reg_offset * message_header_scale);
1513    }
1514 }
1515
1516 src_reg
1517 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1518                                        src_reg *reladdr, int reg_offset)
1519 {
1520    if (reladdr) {
1521       src_reg index = src_reg(this, glsl_type::int_type);
1522
1523       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1524                                    src_reg(reg_offset)));
1525
1526       /* Pre-gen6, the message header uses byte offsets instead of vec4
1527        * (16-byte) offset units.
1528        */
1529       if (devinfo->gen < 6) {
1530          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1531       }
1532
1533       return index;
1534    } else if (devinfo->gen >= 8) {
1535       /* Store the offset in a GRF so we can send-from-GRF. */
1536       src_reg offset = src_reg(this, glsl_type::int_type);
1537       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1538       return offset;
1539    } else {
1540       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1541       return src_reg(reg_offset * message_header_scale);
1542    }
1543 }
1544
1545 /**
1546  * Emits an instruction before @inst to load the value named by @orig_src
1547  * from scratch space at @base_offset to @temp.
1548  *
1549  * @base_offset is measured in 32-byte units (the size of a register).
1550  */
1551 void
1552 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1553                                 dst_reg temp, src_reg orig_src,
1554                                 int base_offset)
1555 {
1556    int reg_offset = base_offset + orig_src.reg_offset;
1557    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1558                                       reg_offset);
1559
1560    emit_before(block, inst, SCRATCH_READ(temp, index));
1561 }
1562
1563 /**
1564  * Emits an instruction after @inst to store the value to be written
1565  * to @orig_dst to scratch space at @base_offset, from @temp.
1566  *
1567  * @base_offset is measured in 32-byte units (the size of a register).
1568  */
1569 void
1570 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1571                                  int base_offset)
1572 {
1573    int reg_offset = base_offset + inst->dst.reg_offset;
1574    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1575                                       reg_offset);
1576
1577    /* Create a temporary register to store *inst's result in.
1578     *
1579     * We have to be careful in MOVing from our temporary result register in
1580     * the scratch write.  If we swizzle from channels of the temporary that
1581     * weren't initialized, it will confuse live interval analysis, which will
1582     * make spilling fail to make progress.
1583     */
1584    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1585                                        inst->dst.type),
1586                                 brw_swizzle_for_mask(inst->dst.writemask));
1587    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1588                                        inst->dst.writemask));
1589    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1590    if (inst->opcode != BRW_OPCODE_SEL)
1591       write->predicate = inst->predicate;
1592    write->ir = inst->ir;
1593    write->annotation = inst->annotation;
1594    inst->insert_after(block, write);
1595
1596    inst->dst.file = temp.file;
1597    inst->dst.reg = temp.reg;
1598    inst->dst.reg_offset = temp.reg_offset;
1599    inst->dst.reladdr = NULL;
1600 }
1601
1602 /**
1603  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1604  * adds the scratch read(s) before \p inst. The function also checks for
1605  * recursive reladdr scratch accesses, issuing the corresponding scratch
1606  * loads and rewriting reladdr references accordingly.
1607  *
1608  * \return \p src if it did not require a scratch load, otherwise, the
1609  * register holding the result of the scratch load that the caller should
1610  * use to rewrite src.
1611  */
1612 src_reg
1613 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1614                                    vec4_instruction *inst, src_reg src)
1615 {
1616    /* Resolve recursive reladdr scratch access by calling ourselves
1617     * with src.reladdr
1618     */
1619    if (src.reladdr)
1620       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1621                                           *src.reladdr);
1622
1623    /* Now handle scratch access on src */
1624    if (src.file == GRF && scratch_loc[src.reg] != -1) {
1625       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1626       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1627       src.reg = temp.reg;
1628       src.reg_offset = temp.reg_offset;
1629       src.reladdr = NULL;
1630    }
1631
1632    return src;
1633 }
1634
1635 /**
1636  * We can't generally support array access in GRF space, because a
1637  * single instruction's destination can only span 2 contiguous
1638  * registers.  So, we send all GRF arrays that get variable index
1639  * access to scratch space.
1640  */
1641 void
1642 vec4_visitor::move_grf_array_access_to_scratch()
1643 {
1644    int scratch_loc[this->alloc.count];
1645    memset(scratch_loc, -1, sizeof(scratch_loc));
1646
1647    /* First, calculate the set of virtual GRFs that need to be punted
1648     * to scratch due to having any array access on them, and where in
1649     * scratch.
1650     */
1651    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1652       if (inst->dst.file == GRF && inst->dst.reladdr) {
1653          if (scratch_loc[inst->dst.reg] == -1) {
1654             scratch_loc[inst->dst.reg] = last_scratch;
1655             last_scratch += this->alloc.sizes[inst->dst.reg];
1656          }
1657
1658          for (src_reg *iter = inst->dst.reladdr;
1659               iter->reladdr;
1660               iter = iter->reladdr) {
1661             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1662                scratch_loc[iter->reg] = last_scratch;
1663                last_scratch += this->alloc.sizes[iter->reg];
1664             }
1665          }
1666       }
1667
1668       for (int i = 0 ; i < 3; i++) {
1669          for (src_reg *iter = &inst->src[i];
1670               iter->reladdr;
1671               iter = iter->reladdr) {
1672             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1673                scratch_loc[iter->reg] = last_scratch;
1674                last_scratch += this->alloc.sizes[iter->reg];
1675             }
1676          }
1677       }
1678    }
1679
1680    /* Now, for anything that will be accessed through scratch, rewrite
1681     * it to load/store.  Note that this is a _safe list walk, because
1682     * we may generate a new scratch_write instruction after the one
1683     * we're processing.
1684     */
1685    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1686       /* Set up the annotation tracking for new generated instructions. */
1687       base_ir = inst->ir;
1688       current_annotation = inst->annotation;
1689
1690       /* First handle scratch access on the dst. Notice we have to handle
1691        * the case where the dst's reladdr also points to scratch space.
1692        */
1693       if (inst->dst.reladdr)
1694          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1695                                                    *inst->dst.reladdr);
1696
1697       /* Now that we have handled any (possibly recursive) reladdr scratch
1698        * accesses for dst we can safely do the scratch write for dst itself
1699        */
1700       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1701          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1702
1703       /* Now handle scratch access on any src. In this case, since inst->src[i]
1704        * already is a src_reg, we can just call emit_resolve_reladdr with
1705        * inst->src[i] and it will take care of handling scratch loads for
1706        * both src and src.reladdr (recursively).
1707        */
1708       for (int i = 0 ; i < 3; i++) {
1709          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1710                                              inst->src[i]);
1711       }
1712    }
1713 }
1714
1715 /**
1716  * Emits an instruction before @inst to load the value named by @orig_src
1717  * from the pull constant buffer (surface) at @base_offset to @temp.
1718  */
1719 void
1720 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1721                                       dst_reg temp, src_reg orig_src,
1722                                       int base_offset)
1723 {
1724    int reg_offset = base_offset + orig_src.reg_offset;
1725    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1726    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1727                                              reg_offset);
1728
1729    emit_pull_constant_load_reg(temp,
1730                                index,
1731                                offset,
1732                                block, inst);
1733 }
1734
1735 /**
1736  * Implements array access of uniforms by inserting a
1737  * PULL_CONSTANT_LOAD instruction.
1738  *
1739  * Unlike temporary GRF array access (where we don't support it due to
1740  * the difficulty of doing relative addressing on instruction
1741  * destinations), we could potentially do array access of uniforms
1742  * that were loaded in GRF space as push constants.  In real-world
1743  * usage we've seen, though, the arrays being used are always larger
1744  * than we could load as push constants, so just always move all
1745  * uniform array access out to a pull constant buffer.
1746  */
1747 void
1748 vec4_visitor::move_uniform_array_access_to_pull_constants()
1749 {
1750    int pull_constant_loc[this->uniforms];
1751    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1752    bool nested_reladdr;
1753
1754    /* Walk through and find array access of uniforms.  Put a copy of that
1755     * uniform in the pull constant buffer.
1756     *
1757     * Note that we don't move constant-indexed accesses to arrays.  No
1758     * testing has been done of the performance impact of this choice.
1759     */
1760    do {
1761       nested_reladdr = false;
1762
1763       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1764          for (int i = 0 ; i < 3; i++) {
1765             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1766                continue;
1767
1768             int uniform = inst->src[i].reg;
1769
1770             if (inst->src[i].reladdr->reladdr)
1771                nested_reladdr = true;  /* will need another pass */
1772
1773             /* If this array isn't already present in the pull constant buffer,
1774              * add it.
1775              */
1776             if (pull_constant_loc[uniform] == -1) {
1777                const gl_constant_value **values =
1778                   &stage_prog_data->param[uniform * 4];
1779
1780                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1781
1782                assert(uniform < uniform_array_size);
1783                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1784                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1785                      = values[j];
1786                }
1787             }
1788
1789             /* Set up the annotation tracking for new generated instructions. */
1790             base_ir = inst->ir;
1791             current_annotation = inst->annotation;
1792
1793             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1794
1795             emit_pull_constant_load(block, inst, temp, inst->src[i],
1796                                     pull_constant_loc[uniform]);
1797
1798             inst->src[i].file = temp.file;
1799             inst->src[i].reg = temp.reg;
1800             inst->src[i].reg_offset = temp.reg_offset;
1801             inst->src[i].reladdr = NULL;
1802          }
1803       }
1804    } while (nested_reladdr);
1805
1806    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1807     * no need to track them as larger-than-vec4 objects.  This will be
1808     * relied on in cutting out unused uniform vectors from push
1809     * constants.
1810     */
1811    split_uniform_registers();
1812 }
1813
1814 void
1815 vec4_visitor::resolve_ud_negate(src_reg *reg)
1816 {
1817    if (reg->type != BRW_REGISTER_TYPE_UD ||
1818        !reg->negate)
1819       return;
1820
1821    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1822    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1823    *reg = temp;
1824 }
1825
1826 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1827                            void *log_data,
1828                            const struct brw_sampler_prog_key_data *key_tex,
1829                            struct brw_vue_prog_data *prog_data,
1830                            nir_shader *shader,
1831                            void *mem_ctx,
1832                            bool no_spills,
1833                            int shader_time_index)
1834    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1835      key_tex(key_tex),
1836      prog_data(prog_data),
1837      fail_msg(NULL),
1838      first_non_payload_grf(0),
1839      need_all_constants_in_pull_buffer(false),
1840      no_spills(no_spills),
1841      shader_time_index(shader_time_index),
1842      last_scratch(0)
1843 {
1844    this->failed = false;
1845
1846    this->base_ir = NULL;
1847    this->current_annotation = NULL;
1848    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1849
1850    this->virtual_grf_start = NULL;
1851    this->virtual_grf_end = NULL;
1852    this->live_intervals = NULL;
1853
1854    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1855
1856    this->uniforms = 0;
1857
1858    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1859     * at least one. See setup_uniforms() in brw_vec4.cpp.
1860     */
1861    this->uniform_array_size = 1;
1862    if (prog_data) {
1863       this->uniform_array_size =
1864          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1865    }
1866
1867    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1868 }
1869
1870 vec4_visitor::~vec4_visitor()
1871 {
1872 }
1873
1874
1875 void
1876 vec4_visitor::fail(const char *format, ...)
1877 {
1878    va_list va;
1879    char *msg;
1880
1881    if (failed)
1882       return;
1883
1884    failed = true;
1885
1886    va_start(va, format);
1887    msg = ralloc_vasprintf(mem_ctx, format, va);
1888    va_end(va);
1889    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1890
1891    this->fail_msg = msg;
1892
1893    if (debug_enabled) {
1894       fprintf(stderr, "%s",  msg);
1895    }
1896 }
1897
1898 } /* namespace brw */