src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 src_reg
 280 vec4_visitor::fix_3src_operand(const src_reg &src)
 281 {
 282    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 283     * able to use vertical stride of zero to replicate the vec4 uniform, like
 284     *
 285     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 286     *
 287     * But you can't, since vertical stride is always four in three-source
 288     * instructions. Instead, insert a MOV instruction to do the replication so
 289     * that the three-source instruction can consume it.
 290     */
 291
 292    /* The MOV is only needed if the source is a uniform or immediate. */
 293    if (src.file != UNIFORM && src.file != IMM)
 294       return src;
 295
 296    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 297       return src;
 298
 299    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 300    expanded.type = src.type;
 301    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 302    return src_reg(expanded);
 303 }
 304
 305 src_reg
 306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 307 {
 308    if (!src.abs && !src.negate)
 309       return src;
 310
 311    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 312    resolved.type = src.type;
 313    emit(MOV(resolved, src));
 314
 315    return src_reg(resolved);
 316 }
 317
 318 src_reg
 319 vec4_visitor::fix_math_operand(const src_reg &src)
 320 {
 321    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 322       return src;
 323
 324    /* The gen6 math instruction ignores the source modifiers --
 325     * swizzle, abs, negate, and at least some parts of the register
 326     * region description.
 327     *
 328     * Rather than trying to enumerate all these cases, *always* expand the
 329     * operand to a temp GRF for gen6.
 330     *
 331     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 332     * can't use.
 333     */
 334
 335    if (devinfo->gen == 7 && src.file != IMM)
 336       return src;
 337
 338    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 339    expanded.type = src.type;
 340    emit(MOV(expanded, src));
 341    return src_reg(expanded);
 342 }
 343
 344 vec4_instruction *
 345 vec4_visitor::emit_math(enum opcode opcode,
 346                         const dst_reg &dst,
 347                         const src_reg &src0, const src_reg &src1)
 348 {
 349    vec4_instruction *math =
 350       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 351
 352    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 353       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 354       math->dst = dst_reg(this, glsl_type::vec4_type);
 355       math->dst.type = dst.type;
 356       math = emit(MOV(dst, src_reg(math->dst)));
 357    } else if (devinfo->gen < 6) {
 358       math->base_mrf = 1;
 359       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 360    }
 361
 362    return math;
 363 }
 364
 365 void
 366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 367 {
 368    if (devinfo->gen < 7) {
 369       unreachable("ir_unop_pack_half_2x16 should be lowered");
 370    }
 371
 372    assert(dst.type == BRW_REGISTER_TYPE_UD);
 373    assert(src0.type == BRW_REGISTER_TYPE_F);
 374
 375    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 376     *
 377     *   Because this instruction does not have a 16-bit floating-point type,
 378     *   the destination data type must be Word (W).
 379     *
 380     *   The destination must be DWord-aligned and specify a horizontal stride
 381     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 382     *   each destination channel and the upper word is not modified.
 383     *
 384     * The above restriction implies that the f32to16 instruction must use
 385     * align1 mode, because only in align1 mode is it possible to specify
 386     * horizontal stride.  We choose here to defy the hardware docs and emit
 387     * align16 instructions.
 388     *
 389     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 390     * instructions. I was partially successful in that the code passed all
 391     * tests.  However, the code was dubiously correct and fragile, and the
 392     * tests were not harsh enough to probe that frailty. Not trusting the
 393     * code, I chose instead to remain in align16 mode in defiance of the hw
 394     * docs).
 395     *
 396     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 397     * simulator, emitting a f32to16 in align16 mode with UD as destination
 398     * data type is safe. The behavior differs from that specified in the PRM
 399     * in that the upper word of each destination channel is cleared to 0.
 400     */
 401
 402    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 403    src_reg tmp_src(tmp_dst);
 404
 405 #if 0
 406    /* Verify the undocumented behavior on which the following instructions
 407     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 408     * then the result of the bit-or instruction below will be incorrect.
 409     *
 410     * You should inspect the disasm output in order to verify that the MOV is
 411     * not optimized away.
 412     */
 413    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 414 #endif
 415
 416    /* Give tmp the form below, where "." means untouched.
 417     *
 418     *     w z          y          x w z          y          x
 419     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 420     *
 421     * That the upper word of each write-channel be 0 is required for the
 422     * following bit-shift and bit-or instructions to work. Note that this
 423     * relies on the undocumented hardware behavior mentioned above.
 424     */
 425    tmp_dst.writemask = WRITEMASK_XY;
 426    emit(F32TO16(tmp_dst, src0));
 427
 428    /* Give the write-channels of dst the form:
 429     *   0xhhhh0000
 430     */
 431    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 432    emit(SHL(dst, tmp_src, src_reg(16u)));
 433
 434    /* Finally, give the write-channels of dst the form of packHalf2x16's
 435     * output:
 436     *   0xhhhhllll
 437     */
 438    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 439    emit(OR(dst, src_reg(dst), tmp_src));
 440 }
 441
 442 void
 443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 444 {
 445    if (devinfo->gen < 7) {
 446       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 447    }
 448
 449    assert(dst.type == BRW_REGISTER_TYPE_F);
 450    assert(src0.type == BRW_REGISTER_TYPE_UD);
 451
 452    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 453     *
 454     *   Because this instruction does not have a 16-bit floating-point type,
 455     *   the source data type must be Word (W). The destination type must be
 456     *   F (Float).
 457     *
 458     * To use W as the source data type, we must adjust horizontal strides,
 459     * which is only possible in align1 mode. All my [chadv] attempts at
 460     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 461     * Piglit tests, so I gave up.
 462     *
 463     * I've verified that, on gen7 hardware and the simulator, it is safe to
 464     * emit f16to32 in align16 mode with UD as source data type.
 465     */
 466
 467    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 468    src_reg tmp_src(tmp_dst);
 469
 470    tmp_dst.writemask = WRITEMASK_X;
 471    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 472
 473    tmp_dst.writemask = WRITEMASK_Y;
 474    emit(SHR(tmp_dst, src0, src_reg(16u)));
 475
 476    dst.writemask = WRITEMASK_XY;
 477    emit(F16TO32(dst, tmp_src));
 478 }
 479
 480 void
 481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 482 {
 483    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 484     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 485     * is not suitable to generate the shift values, but we can use the packed
 486     * vector float and a type-converting MOV.
 487     */
 488    dst_reg shift(this, glsl_type::uvec4_type);
 489    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 490
 491    dst_reg shifted(this, glsl_type::uvec4_type);
 492    src0.swizzle = BRW_SWIZZLE_XXXX;
 493    emit(SHR(shifted, src0, src_reg(shift)));
 494
 495    shifted.type = BRW_REGISTER_TYPE_UB;
 496    dst_reg f(this, glsl_type::vec4_type);
 497    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 498
 499    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 500 }
 501
 502 void
 503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 504 {
 505    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 506     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 507     * is not suitable to generate the shift values, but we can use the packed
 508     * vector float and a type-converting MOV.
 509     */
 510    dst_reg shift(this, glsl_type::uvec4_type);
 511    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 512
 513    dst_reg shifted(this, glsl_type::uvec4_type);
 514    src0.swizzle = BRW_SWIZZLE_XXXX;
 515    emit(SHR(shifted, src0, src_reg(shift)));
 516
 517    shifted.type = BRW_REGISTER_TYPE_B;
 518    dst_reg f(this, glsl_type::vec4_type);
 519    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 520
 521    dst_reg scaled(this, glsl_type::vec4_type);
 522    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 523
 524    dst_reg max(this, glsl_type::vec4_type);
 525    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 526    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 527 }
 528
 529 void
 530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 531 {
 532    dst_reg saturated(this, glsl_type::vec4_type);
 533    vec4_instruction *inst = emit(MOV(saturated, src0));
 534    inst->saturate = true;
 535
 536    dst_reg scaled(this, glsl_type::vec4_type);
 537    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 538
 539    dst_reg rounded(this, glsl_type::vec4_type);
 540    emit(RNDE(rounded, src_reg(scaled)));
 541
 542    dst_reg u(this, glsl_type::uvec4_type);
 543    emit(MOV(u, src_reg(rounded)));
 544
 545    src_reg bytes(u);
 546    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 547 }
 548
 549 void
 550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 551 {
 552    dst_reg max(this, glsl_type::vec4_type);
 553    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 554
 555    dst_reg min(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 557
 558    dst_reg scaled(this, glsl_type::vec4_type);
 559    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 560
 561    dst_reg rounded(this, glsl_type::vec4_type);
 562    emit(RNDE(rounded, src_reg(scaled)));
 563
 564    dst_reg i(this, glsl_type::ivec4_type);
 565    emit(MOV(i, src_reg(rounded)));
 566
 567    src_reg bytes(i);
 568    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 569 }
 570
 571 /**
 572  * Returns the minimum number of vec4 elements needed to pack a type.
 573  *
 574  * For simple types, it will return 1 (a single vec4); for matrices, the
 575  * number of columns; for array and struct, the sum of the vec4_size of
 576  * each of its elements; and for sampler and atomic, zero.
 577  *
 578  * This method is useful to calculate how much register space is needed to
 579  * store a particular type.
 580  */
 581 extern "C" int
 582 type_size_vec4(const struct glsl_type *type)
 583 {
 584    unsigned int i;
 585    int size;
 586
 587    switch (type->base_type) {
 588    case GLSL_TYPE_UINT:
 589    case GLSL_TYPE_INT:
 590    case GLSL_TYPE_FLOAT:
 591    case GLSL_TYPE_BOOL:
 592       if (type->is_matrix()) {
 593          return type->matrix_columns;
 594       } else {
 595          /* Regardless of size of vector, it gets a vec4. This is bad
 596           * packing for things like floats, but otherwise arrays become a
 597           * mess.  Hopefully a later pass over the code can pack scalars
 598           * down if appropriate.
 599           */
 600          return 1;
 601       }
 602    case GLSL_TYPE_ARRAY:
 603       assert(type->length > 0);
 604       return type_size_vec4(type->fields.array) * type->length;
 605    case GLSL_TYPE_STRUCT:
 606       size = 0;
 607       for (i = 0; i < type->length; i++) {
 608          size += type_size_vec4(type->fields.structure[i].type);
 609       }
 610       return size;
 611    case GLSL_TYPE_SUBROUTINE:
 612       return 1;
 613
 614    case GLSL_TYPE_SAMPLER:
 615       /* Samplers take up no register space, since they're baked in at
 616        * link time.
 617        */
 618       return 0;
 619    case GLSL_TYPE_ATOMIC_UINT:
 620       return 0;
 621    case GLSL_TYPE_IMAGE:
 622       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 623    case GLSL_TYPE_VOID:
 624    case GLSL_TYPE_DOUBLE:
 625    case GLSL_TYPE_ERROR:
 626    case GLSL_TYPE_INTERFACE:
 627       unreachable("not reached");
 628    }
 629
 630    return 0;
 631 }
 632
 633 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 634 {
 635    init();
 636
 637    this->file = GRF;
 638    this->reg = v->alloc.allocate(type_size_vec4(type));
 639
 640    if (type->is_array() || type->is_record()) {
 641       this->swizzle = BRW_SWIZZLE_NOOP;
 642    } else {
 643       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 644    }
 645
 646    this->type = brw_type_for_base_type(type);
 647 }
 648
 649 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 650 {
 651    assert(size > 0);
 652
 653    init();
 654
 655    this->file = GRF;
 656    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 657
 658    this->swizzle = BRW_SWIZZLE_NOOP;
 659
 660    this->type = brw_type_for_base_type(type);
 661 }
 662
 663 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 664 {
 665    init();
 666
 667    this->file = GRF;
 668    this->reg = v->alloc.allocate(type_size_vec4(type));
 669
 670    if (type->is_array() || type->is_record()) {
 671       this->writemask = WRITEMASK_XYZW;
 672    } else {
 673       this->writemask = (1 << type->vector_elements) - 1;
 674    }
 675
 676    this->type = brw_type_for_base_type(type);
 677 }
 678
 679 vec4_instruction *
 680 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 681                           src_reg src0, src_reg src1)
 682 {
 683    vec4_instruction *inst;
 684
 685    if (devinfo->gen >= 6) {
 686       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 687       inst->conditional_mod = conditionalmod;
 688    } else {
 689       emit(CMP(dst, src0, src1, conditionalmod));
 690
 691       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 692       inst->predicate = BRW_PREDICATE_NORMAL;
 693    }
 694
 695    return inst;
 696 }
 697
 698 vec4_instruction *
 699 vec4_visitor::emit_lrp(const dst_reg &dst,
 700                        const src_reg &x, const src_reg &y, const src_reg &a)
 701 {
 702    if (devinfo->gen >= 6) {
 703       /* Note that the instruction's argument order is reversed from GLSL
 704        * and the IR.
 705        */
 706      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 707                      fix_3src_operand(x)));
 708    } else {
 709       /* Earlier generations don't support three source operations, so we
 710        * need to emit x*(1-a) + y*a.
 711        */
 712       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 713       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 714       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 715       y_times_a.writemask           = dst.writemask;
 716       one_minus_a.writemask         = dst.writemask;
 717       x_times_one_minus_a.writemask = dst.writemask;
 718
 719       emit(MUL(y_times_a, y, a));
 720       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
 721       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 722       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 723    }
 724 }
 725
 726 /**
 727  * Emits the instructions needed to perform a pull constant load. before_block
 728  * and before_inst can be NULL in which case the instruction will be appended
 729  * to the end of the instruction list.
 730  */
 731 void
 732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 733                                           src_reg surf_index,
 734                                           src_reg offset_reg,
 735                                           bblock_t *before_block,
 736                                           vec4_instruction *before_inst)
 737 {
 738    assert((before_inst == NULL && before_block == NULL) ||
 739           (before_inst && before_block));
 740
 741    vec4_instruction *pull;
 742
 743    if (devinfo->gen >= 9) {
 744       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 745       src_reg header(this, glsl_type::uvec4_type, 2);
 746
 747       pull = new(mem_ctx)
 748          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 749                           dst_reg(header));
 750
 751       if (before_inst)
 752          emit_before(before_block, before_inst, pull);
 753       else
 754          emit(pull);
 755
 756       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 757                                  offset_reg.type);
 758       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 759
 760       if (before_inst)
 761          emit_before(before_block, before_inst, pull);
 762       else
 763          emit(pull);
 764
 765       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 766                                            dst,
 767                                            surf_index,
 768                                            header);
 769       pull->mlen = 2;
 770       pull->header_size = 1;
 771    } else if (devinfo->gen >= 7) {
 772       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 773
 774       grf_offset.type = offset_reg.type;
 775
 776       pull = MOV(grf_offset, offset_reg);
 777
 778       if (before_inst)
 779          emit_before(before_block, before_inst, pull);
 780       else
 781          emit(pull);
 782
 783       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 784                                            dst,
 785                                            surf_index,
 786                                            src_reg(grf_offset));
 787       pull->mlen = 1;
 788    } else {
 789       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 790                                            dst,
 791                                            surf_index,
 792                                            offset_reg);
 793       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 794       pull->mlen = 1;
 795    }
 796
 797    if (before_inst)
 798       emit_before(before_block, before_inst, pull);
 799    else
 800       emit(pull);
 801 }
 802
 803 src_reg
 804 vec4_visitor::emit_uniformize(const src_reg &src)
 805 {
 806    const src_reg chan_index(this, glsl_type::uint_type);
 807    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 808                               src.type);
 809
 810    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 811       ->force_writemask_all = true;
 812    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 813       ->force_writemask_all = true;
 814
 815    return src_reg(dst);
 816 }
 817
 818 src_reg
 819 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 820                              src_reg coordinate, src_reg sampler)
 821 {
 822    vec4_instruction *inst =
 823       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 824                                     dst_reg(this, glsl_type::uvec4_type));
 825    inst->base_mrf = 2;
 826    inst->src[1] = sampler;
 827
 828    int param_base;
 829
 830    if (devinfo->gen >= 9) {
 831       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 832       vec4_instruction *header_inst = new(mem_ctx)
 833          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 834                           dst_reg(MRF, inst->base_mrf));
 835
 836       emit(header_inst);
 837
 838       inst->mlen = 2;
 839       inst->header_size = 1;
 840       param_base = inst->base_mrf + 1;
 841    } else {
 842       inst->mlen = 1;
 843       param_base = inst->base_mrf;
 844    }
 845
 846    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 847    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 848    int zero_mask = 0xf & ~coord_mask;
 849
 850    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 851             coordinate));
 852
 853    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 854             src_reg(0)));
 855
 856    emit(inst);
 857    return src_reg(inst->dst);
 858 }
 859
 860 bool
 861 vec4_visitor::is_high_sampler(src_reg sampler)
 862 {
 863    if (devinfo->gen < 8 && !devinfo->is_haswell)
 864       return false;
 865
 866    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 867 }
 868
 869 void
 870 vec4_visitor::emit_texture(ir_texture_opcode op,
 871                            dst_reg dest,
 872                            const glsl_type *dest_type,
 873                            src_reg coordinate,
 874                            int coord_components,
 875                            src_reg shadow_comparitor,
 876                            src_reg lod, src_reg lod2,
 877                            src_reg sample_index,
 878                            uint32_t constant_offset,
 879                            src_reg offset_value,
 880                            src_reg mcs,
 881                            bool is_cube_array,
 882                            uint32_t sampler,
 883                            src_reg sampler_reg)
 884 {
 885    /* The sampler can only meaningfully compute LOD for fragment shader
 886     * messages. For all other stages, we change the opcode to TXL and hardcode
 887     * the LOD to 0.
 888     *
 889     * textureQueryLevels() is implemented in terms of TXS so we need to pass a
 890     * valid LOD argument.
 891     */
 892    if (op == ir_tex || op == ir_query_levels) {
 893       assert(lod.file == BAD_FILE);
 894       lod = src_reg(0.0f);
 895    }
 896
 897    enum opcode opcode;
 898    switch (op) {
 899    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 900    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 901    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 902    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 903    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
 904                              SHADER_OPCODE_TXF_CMS); break;
 905    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 906    case ir_tg4: opcode = offset_value.file != BAD_FILE
 907                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 908    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 909    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 910    case ir_txb:
 911       unreachable("TXB is not valid for vertex shaders.");
 912    case ir_lod:
 913       unreachable("LOD is not valid for vertex shaders.");
 914    default:
 915       unreachable("Unrecognized tex op");
 916    }
 917
 918    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
 919       opcode, dst_reg(this, dest_type));
 920
 921    inst->offset = constant_offset;
 922
 923    /* The message header is necessary for:
 924     * - Gen4 (always)
 925     * - Gen9+ for selecting SIMD4x2
 926     * - Texel offsets
 927     * - Gather channel selection
 928     * - Sampler indices too large to fit in a 4-bit value.
 929     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 930     */
 931    inst->header_size =
 932       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 933        inst->offset != 0 || op == ir_tg4 ||
 934        op == ir_texture_samples ||
 935        is_high_sampler(sampler_reg)) ? 1 : 0;
 936    inst->base_mrf = 2;
 937    inst->mlen = inst->header_size;
 938    inst->dst.writemask = WRITEMASK_XYZW;
 939    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 940
 941    inst->src[1] = sampler_reg;
 942
 943    /* MRF for the first parameter */
 944    int param_base = inst->base_mrf + inst->header_size;
 945
 946    if (op == ir_txs || op == ir_query_levels) {
 947       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 948       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 949       inst->mlen++;
 950    } else if (op == ir_texture_samples) {
 951       inst->dst.writemask = WRITEMASK_X;
 952    } else {
 953       /* Load the coordinate */
 954       /* FINISHME: gl_clamp_mask and saturate */
 955       int coord_mask = (1 << coord_components) - 1;
 956       int zero_mask = 0xf & ~coord_mask;
 957
 958       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 959                coordinate));
 960       inst->mlen++;
 961
 962       if (zero_mask != 0) {
 963          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
 964                   src_reg(0)));
 965       }
 966       /* Load the shadow comparitor */
 967       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
 968          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
 969                           WRITEMASK_X),
 970                   shadow_comparitor));
 971          inst->mlen++;
 972       }
 973
 974       /* Load the LOD info */
 975       if (op == ir_tex || op == ir_txl) {
 976          int mrf, writemask;
 977          if (devinfo->gen >= 5) {
 978             mrf = param_base + 1;
 979             if (shadow_comparitor.file != BAD_FILE) {
 980                writemask = WRITEMASK_Y;
 981                /* mlen already incremented */
 982             } else {
 983                writemask = WRITEMASK_X;
 984                inst->mlen++;
 985             }
 986          } else /* devinfo->gen == 4 */ {
 987             mrf = param_base;
 988             writemask = WRITEMASK_W;
 989          }
 990          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
 991       } else if (op == ir_txf) {
 992          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
 993       } else if (op == ir_txf_ms) {
 994          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
 995                   sample_index));
 996          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
 997             /* MCS data is stored in the first two channels of ‘mcs’, but we
 998              * need to get it into the .y and .z channels of the second vec4
 999              * of params.
1000              */
1001             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1002             emit(MOV(dst_reg(MRF, param_base + 1,
1003                              glsl_type::uint_type, WRITEMASK_YZ),
1004                      mcs));
1005          } else if (devinfo->gen >= 7) {
1006             /* MCS data is in the first channel of `mcs`, but we need to get it into
1007              * the .y channel of the second vec4 of params, so replicate .x across
1008              * the whole vec4 and then mask off everything except .y
1009              */
1010             mcs.swizzle = BRW_SWIZZLE_XXXX;
1011             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1012                      mcs));
1013          }
1014          inst->mlen++;
1015       } else if (op == ir_txd) {
1016          const brw_reg_type type = lod.type;
1017
1018          if (devinfo->gen >= 5) {
1019             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1020             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1021             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1022             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1023             inst->mlen++;
1024
1025             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1026                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1027                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1028                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1029                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1030                inst->mlen++;
1031
1032                if (shadow_comparitor.file != BAD_FILE) {
1033                   emit(MOV(dst_reg(MRF, param_base + 2,
1034                                    shadow_comparitor.type, WRITEMASK_Z),
1035                            shadow_comparitor));
1036                }
1037             }
1038          } else /* devinfo->gen == 4 */ {
1039             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1040             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1041             inst->mlen += 2;
1042          }
1043       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1044          if (shadow_comparitor.file != BAD_FILE) {
1045             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1046                      shadow_comparitor));
1047          }
1048
1049          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1050                   offset_value));
1051          inst->mlen++;
1052       }
1053    }
1054
1055    emit(inst);
1056
1057    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1058     * spec requires layers.
1059     */
1060    if (op == ir_txs && is_cube_array) {
1061       emit_math(SHADER_OPCODE_INT_QUOTIENT,
1062                 writemask(inst->dst, WRITEMASK_Z),
1063                 src_reg(inst->dst), src_reg(6));
1064    }
1065
1066    if (devinfo->gen == 6 && op == ir_tg4) {
1067       emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1068    }
1069
1070    swizzle_result(op, dest,
1071                   src_reg(inst->dst), sampler, dest_type);
1072 }
1073
1074 /**
1075  * Apply workarounds for Gen6 gather with UINT/SINT
1076  */
1077 void
1078 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1079 {
1080    if (!wa)
1081       return;
1082
1083    int width = (wa & WA_8BIT) ? 8 : 16;
1084    dst_reg dst_f = dst;
1085    dst_f.type = BRW_REGISTER_TYPE_F;
1086
1087    /* Convert from UNORM to UINT */
1088    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1089    emit(MOV(dst, src_reg(dst_f)));
1090
1091    if (wa & WA_SIGN) {
1092       /* Reinterpret the UINT value as a signed INT value by
1093        * shifting the sign bit into place, then shifting back
1094        * preserving sign.
1095        */
1096       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1097       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1098    }
1099 }
1100
1101 /**
1102  * Set up the gather channel based on the swizzle, for gather4.
1103  */
1104 uint32_t
1105 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1106 {
1107    int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1108    switch (swiz) {
1109       case SWIZZLE_X: return 0;
1110       case SWIZZLE_Y:
1111          /* gather4 sampler is broken for green channel on RG32F --
1112           * we must ask for blue instead.
1113           */
1114          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1115             return 2;
1116          return 1;
1117       case SWIZZLE_Z: return 2;
1118       case SWIZZLE_W: return 3;
1119       default:
1120          unreachable("Not reached"); /* zero, one swizzles handled already */
1121    }
1122 }
1123
1124 void
1125 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1126                              src_reg orig_val, uint32_t sampler,
1127                              const glsl_type *dest_type)
1128 {
1129    int s = key_tex->swizzles[sampler];
1130
1131    dst_reg swizzled_result = dest;
1132
1133    if (op == ir_query_levels) {
1134       /* # levels is in .w */
1135       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1136       emit(MOV(swizzled_result, orig_val));
1137       return;
1138    }
1139
1140    if (op == ir_txs || dest_type == glsl_type::float_type
1141                         || s == SWIZZLE_NOOP || op == ir_tg4) {
1142       emit(MOV(swizzled_result, orig_val));
1143       return;
1144    }
1145
1146
1147    int zero_mask = 0, one_mask = 0, copy_mask = 0;
1148    int swizzle[4] = {0};
1149
1150    for (int i = 0; i < 4; i++) {
1151       switch (GET_SWZ(s, i)) {
1152       case SWIZZLE_ZERO:
1153          zero_mask |= (1 << i);
1154          break;
1155       case SWIZZLE_ONE:
1156          one_mask |= (1 << i);
1157          break;
1158       default:
1159          copy_mask |= (1 << i);
1160          swizzle[i] = GET_SWZ(s, i);
1161          break;
1162       }
1163    }
1164
1165    if (copy_mask) {
1166       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1167       swizzled_result.writemask = copy_mask;
1168       emit(MOV(swizzled_result, orig_val));
1169    }
1170
1171    if (zero_mask) {
1172       swizzled_result.writemask = zero_mask;
1173       emit(MOV(swizzled_result, src_reg(0.0f)));
1174    }
1175
1176    if (one_mask) {
1177       swizzled_result.writemask = one_mask;
1178       emit(MOV(swizzled_result, src_reg(1.0f)));
1179    }
1180 }
1181
1182 void
1183 vec4_visitor::gs_emit_vertex(int stream_id)
1184 {
1185    unreachable("not reached");
1186 }
1187
1188 void
1189 vec4_visitor::gs_end_primitive()
1190 {
1191    unreachable("not reached");
1192 }
1193
1194 void
1195 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1196                                   dst_reg dst, src_reg surf_offset,
1197                                   src_reg src0, src_reg src1)
1198 {
1199    unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1200    src_reg src_payload(this, glsl_type::uint_type, mlen);
1201    dst_reg payload(src_payload);
1202    payload.writemask = WRITEMASK_X;
1203
1204    /* Set the atomic operation offset. */
1205    emit(MOV(offset(payload, 0), surf_offset));
1206    unsigned i = 1;
1207
1208    /* Set the atomic operation arguments. */
1209    if (src0.file != BAD_FILE) {
1210       emit(MOV(offset(payload, i), src0));
1211       i++;
1212    }
1213
1214    if (src1.file != BAD_FILE) {
1215       emit(MOV(offset(payload, i), src1));
1216       i++;
1217    }
1218
1219    /* Emit the instruction.  Note that this maps to the normal SIMD8
1220     * untyped atomic message on Ivy Bridge, but that's OK because
1221     * unused channels will be masked out.
1222     */
1223    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1224                                  src_payload,
1225                                  src_reg(surf_index), src_reg(atomic_op));
1226    inst->mlen = mlen;
1227 }
1228
1229 void
1230 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1231                                         src_reg surf_offset)
1232 {
1233    dst_reg offset(this, glsl_type::uint_type);
1234    offset.writemask = WRITEMASK_X;
1235
1236    /* Set the surface read offset. */
1237    emit(MOV(offset, surf_offset));
1238
1239    /* Emit the instruction.  Note that this maps to the normal SIMD8
1240     * untyped surface read message, but that's OK because unused
1241     * channels will be masked out.
1242     */
1243    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1244                                  src_reg(offset),
1245                                  src_reg(surf_index), src_reg(1));
1246    inst->mlen = 1;
1247 }
1248
1249 void
1250 vec4_visitor::emit_ndc_computation()
1251 {
1252    if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1253       return;
1254
1255    /* Get the position */
1256    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1257
1258    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1259    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1260    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1261
1262    current_annotation = "NDC";
1263    dst_reg ndc_w = ndc;
1264    ndc_w.writemask = WRITEMASK_W;
1265    src_reg pos_w = pos;
1266    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1267    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1268
1269    dst_reg ndc_xyz = ndc;
1270    ndc_xyz.writemask = WRITEMASK_XYZ;
1271
1272    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1273 }
1274
1275 void
1276 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1277 {
1278    if (devinfo->gen < 6 &&
1279        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1280         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1281         devinfo->has_negative_rhw_bug)) {
1282       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1283       dst_reg header1_w = header1;
1284       header1_w.writemask = WRITEMASK_W;
1285
1286       emit(MOV(header1, 0u));
1287
1288       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1289          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1290
1291          current_annotation = "Point size";
1292          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1293          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1294       }
1295
1296       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1297          current_annotation = "Clipping flags";
1298          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1299          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1300
1301          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1302          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1303          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1304
1305          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1306          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1307          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1308          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1309       }
1310
1311       /* i965 clipping workaround:
1312        * 1) Test for -ve rhw
1313        * 2) If set,
1314        *      set ndc = (0,0,0,0)
1315        *      set ucp[6] = 1
1316        *
1317        * Later, clipping will detect ucp[6] and ensure the primitive is
1318        * clipped against all fixed planes.
1319        */
1320       if (devinfo->has_negative_rhw_bug &&
1321           output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1322          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1323          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1324          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1325          vec4_instruction *inst;
1326          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1327          inst->predicate = BRW_PREDICATE_NORMAL;
1328          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1329          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1330          inst->predicate = BRW_PREDICATE_NORMAL;
1331       }
1332
1333       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1334    } else if (devinfo->gen < 6) {
1335       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1336    } else {
1337       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1338       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1339          dst_reg reg_w = reg;
1340          reg_w.writemask = WRITEMASK_W;
1341          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1342          reg_as_src.type = reg_w.type;
1343          reg_as_src.swizzle = brw_swizzle_for_size(1);
1344          emit(MOV(reg_w, reg_as_src));
1345       }
1346       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1347          dst_reg reg_y = reg;
1348          reg_y.writemask = WRITEMASK_Y;
1349          reg_y.type = BRW_REGISTER_TYPE_D;
1350          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1351          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1352       }
1353       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1354          dst_reg reg_z = reg;
1355          reg_z.writemask = WRITEMASK_Z;
1356          reg_z.type = BRW_REGISTER_TYPE_D;
1357          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1358          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1359       }
1360    }
1361 }
1362
1363 vec4_instruction *
1364 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1365 {
1366    assert(varying < VARYING_SLOT_MAX);
1367    assert(output_reg[varying].type == reg.type);
1368    current_annotation = output_reg_annotation[varying];
1369    if (output_reg[varying].file != BAD_FILE)
1370       return emit(MOV(reg, src_reg(output_reg[varying])));
1371    else
1372       return NULL;
1373 }
1374
1375 void
1376 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1377 {
1378    reg.type = BRW_REGISTER_TYPE_F;
1379    output_reg[varying].type = reg.type;
1380
1381    switch (varying) {
1382    case VARYING_SLOT_PSIZ:
1383    {
1384       /* PSIZ is always in slot 0, and is coupled with other flags. */
1385       current_annotation = "indices, point width, clip flags";
1386       emit_psiz_and_flags(reg);
1387       break;
1388    }
1389    case BRW_VARYING_SLOT_NDC:
1390       current_annotation = "NDC";
1391       if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1392          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1393       break;
1394    case VARYING_SLOT_POS:
1395       current_annotation = "gl_Position";
1396       if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1397          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1398       break;
1399    case VARYING_SLOT_EDGE:
1400       /* This is present when doing unfilled polygons.  We're supposed to copy
1401        * the edge flag from the user-provided vertex array
1402        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1403        * of that attribute (starts as 1.0f).  This is then used in clipping to
1404        * determine which edges should be drawn as wireframe.
1405        */
1406       current_annotation = "edge flag";
1407       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1408                                     glsl_type::float_type, WRITEMASK_XYZW))));
1409       break;
1410    case BRW_VARYING_SLOT_PAD:
1411       /* No need to write to this slot */
1412       break;
1413    default:
1414       emit_generic_urb_slot(reg, varying);
1415       break;
1416    }
1417 }
1418
1419 static int
1420 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1421 {
1422    if (devinfo->gen >= 6) {
1423       /* URB data written (does not include the message header reg) must
1424        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1425        * section 5.4.3.2.2: URB_INTERLEAVED.
1426        *
1427        * URB entries are allocated on a multiple of 1024 bits, so an
1428        * extra 128 bits written here to make the end align to 256 is
1429        * no problem.
1430        */
1431       if ((mlen % 2) != 1)
1432          mlen++;
1433    }
1434
1435    return mlen;
1436 }
1437
1438
1439 /**
1440  * Generates the VUE payload plus the necessary URB write instructions to
1441  * output it.
1442  *
1443  * The VUE layout is documented in Volume 2a.
1444  */
1445 void
1446 vec4_visitor::emit_vertex()
1447 {
1448    /* MRF 0 is reserved for the debugger, so start with message header
1449     * in MRF 1.
1450     */
1451    int base_mrf = 1;
1452    int mrf = base_mrf;
1453    /* In the process of generating our URB write message contents, we
1454     * may need to unspill a register or load from an array.  Those
1455     * reads would use MRFs 14-15.
1456     */
1457    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1458
1459    /* The following assertion verifies that max_usable_mrf causes an
1460     * even-numbered amount of URB write data, which will meet gen6's
1461     * requirements for length alignment.
1462     */
1463    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1464
1465    /* First mrf is the g0-based message header containing URB handles and
1466     * such.
1467     */
1468    emit_urb_write_header(mrf++);
1469
1470    if (devinfo->gen < 6) {
1471       emit_ndc_computation();
1472    }
1473
1474    /* We may need to split this up into several URB writes, so do them in a
1475     * loop.
1476     */
1477    int slot = 0;
1478    bool complete = false;
1479    do {
1480       /* URB offset is in URB row increments, and each of our MRFs is half of
1481        * one of those, since we're doing interleaved writes.
1482        */
1483       int offset = slot / 2;
1484
1485       mrf = base_mrf + 1;
1486       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1487          emit_urb_slot(dst_reg(MRF, mrf++),
1488                        prog_data->vue_map.slot_to_varying[slot]);
1489
1490          /* If this was max_usable_mrf, we can't fit anything more into this
1491           * URB WRITE. Same thing if we reached the maximum length available.
1492           */
1493          if (mrf > max_usable_mrf ||
1494              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1495             slot++;
1496             break;
1497          }
1498       }
1499
1500       complete = slot >= prog_data->vue_map.num_slots;
1501       current_annotation = "URB write";
1502       vec4_instruction *inst = emit_urb_write_opcode(complete);
1503       inst->base_mrf = base_mrf;
1504       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1505       inst->offset += offset;
1506    } while(!complete);
1507 }
1508
1509
1510 src_reg
1511 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1512                                  src_reg *reladdr, int reg_offset)
1513 {
1514    /* Because we store the values to scratch interleaved like our
1515     * vertex data, we need to scale the vec4 index by 2.
1516     */
1517    int message_header_scale = 2;
1518
1519    /* Pre-gen6, the message header uses byte offsets instead of vec4
1520     * (16-byte) offset units.
1521     */
1522    if (devinfo->gen < 6)
1523       message_header_scale *= 16;
1524
1525    if (reladdr) {
1526       src_reg index = src_reg(this, glsl_type::int_type);
1527
1528       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1529                                    src_reg(reg_offset)));
1530       emit_before(block, inst, MUL(dst_reg(index), index,
1531                                    src_reg(message_header_scale)));
1532
1533       return index;
1534    } else {
1535       return src_reg(reg_offset * message_header_scale);
1536    }
1537 }
1538
1539 src_reg
1540 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1541                                        src_reg *reladdr, int reg_offset)
1542 {
1543    if (reladdr) {
1544       src_reg index = src_reg(this, glsl_type::int_type);
1545
1546       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1547                                    src_reg(reg_offset)));
1548
1549       /* Pre-gen6, the message header uses byte offsets instead of vec4
1550        * (16-byte) offset units.
1551        */
1552       if (devinfo->gen < 6) {
1553          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1554       }
1555
1556       return index;
1557    } else if (devinfo->gen >= 8) {
1558       /* Store the offset in a GRF so we can send-from-GRF. */
1559       src_reg offset = src_reg(this, glsl_type::int_type);
1560       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1561       return offset;
1562    } else {
1563       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1564       return src_reg(reg_offset * message_header_scale);
1565    }
1566 }
1567
1568 /**
1569  * Emits an instruction before @inst to load the value named by @orig_src
1570  * from scratch space at @base_offset to @temp.
1571  *
1572  * @base_offset is measured in 32-byte units (the size of a register).
1573  */
1574 void
1575 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1576                                 dst_reg temp, src_reg orig_src,
1577                                 int base_offset)
1578 {
1579    int reg_offset = base_offset + orig_src.reg_offset;
1580    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1581                                       reg_offset);
1582
1583    emit_before(block, inst, SCRATCH_READ(temp, index));
1584 }
1585
1586 /**
1587  * Emits an instruction after @inst to store the value to be written
1588  * to @orig_dst to scratch space at @base_offset, from @temp.
1589  *
1590  * @base_offset is measured in 32-byte units (the size of a register).
1591  */
1592 void
1593 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1594                                  int base_offset)
1595 {
1596    int reg_offset = base_offset + inst->dst.reg_offset;
1597    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1598                                       reg_offset);
1599
1600    /* Create a temporary register to store *inst's result in.
1601     *
1602     * We have to be careful in MOVing from our temporary result register in
1603     * the scratch write.  If we swizzle from channels of the temporary that
1604     * weren't initialized, it will confuse live interval analysis, which will
1605     * make spilling fail to make progress.
1606     */
1607    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1608                                        inst->dst.type),
1609                                 brw_swizzle_for_mask(inst->dst.writemask));
1610    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1611                                        inst->dst.writemask));
1612    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1613    if (inst->opcode != BRW_OPCODE_SEL)
1614       write->predicate = inst->predicate;
1615    write->ir = inst->ir;
1616    write->annotation = inst->annotation;
1617    inst->insert_after(block, write);
1618
1619    inst->dst.file = temp.file;
1620    inst->dst.reg = temp.reg;
1621    inst->dst.reg_offset = temp.reg_offset;
1622    inst->dst.reladdr = NULL;
1623 }
1624
1625 /**
1626  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1627  * adds the scratch read(s) before \p inst. The function also checks for
1628  * recursive reladdr scratch accesses, issuing the corresponding scratch
1629  * loads and rewriting reladdr references accordingly.
1630  *
1631  * \return \p src if it did not require a scratch load, otherwise, the
1632  * register holding the result of the scratch load that the caller should
1633  * use to rewrite src.
1634  */
1635 src_reg
1636 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1637                                    vec4_instruction *inst, src_reg src)
1638 {
1639    /* Resolve recursive reladdr scratch access by calling ourselves
1640     * with src.reladdr
1641     */
1642    if (src.reladdr)
1643       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1644                                           *src.reladdr);
1645
1646    /* Now handle scratch access on src */
1647    if (src.file == GRF && scratch_loc[src.reg] != -1) {
1648       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1649       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1650       src.reg = temp.reg;
1651       src.reg_offset = temp.reg_offset;
1652       src.reladdr = NULL;
1653    }
1654
1655    return src;
1656 }
1657
1658 /**
1659  * We can't generally support array access in GRF space, because a
1660  * single instruction's destination can only span 2 contiguous
1661  * registers.  So, we send all GRF arrays that get variable index
1662  * access to scratch space.
1663  */
1664 void
1665 vec4_visitor::move_grf_array_access_to_scratch()
1666 {
1667    int scratch_loc[this->alloc.count];
1668    memset(scratch_loc, -1, sizeof(scratch_loc));
1669
1670    /* First, calculate the set of virtual GRFs that need to be punted
1671     * to scratch due to having any array access on them, and where in
1672     * scratch.
1673     */
1674    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1675       if (inst->dst.file == GRF && inst->dst.reladdr) {
1676          if (scratch_loc[inst->dst.reg] == -1) {
1677             scratch_loc[inst->dst.reg] = last_scratch;
1678             last_scratch += this->alloc.sizes[inst->dst.reg];
1679          }
1680
1681          for (src_reg *iter = inst->dst.reladdr;
1682               iter->reladdr;
1683               iter = iter->reladdr) {
1684             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1685                scratch_loc[iter->reg] = last_scratch;
1686                last_scratch += this->alloc.sizes[iter->reg];
1687             }
1688          }
1689       }
1690
1691       for (int i = 0 ; i < 3; i++) {
1692          for (src_reg *iter = &inst->src[i];
1693               iter->reladdr;
1694               iter = iter->reladdr) {
1695             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1696                scratch_loc[iter->reg] = last_scratch;
1697                last_scratch += this->alloc.sizes[iter->reg];
1698             }
1699          }
1700       }
1701    }
1702
1703    /* Now, for anything that will be accessed through scratch, rewrite
1704     * it to load/store.  Note that this is a _safe list walk, because
1705     * we may generate a new scratch_write instruction after the one
1706     * we're processing.
1707     */
1708    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1709       /* Set up the annotation tracking for new generated instructions. */
1710       base_ir = inst->ir;
1711       current_annotation = inst->annotation;
1712
1713       /* First handle scratch access on the dst. Notice we have to handle
1714        * the case where the dst's reladdr also points to scratch space.
1715        */
1716       if (inst->dst.reladdr)
1717          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1718                                                    *inst->dst.reladdr);
1719
1720       /* Now that we have handled any (possibly recursive) reladdr scratch
1721        * accesses for dst we can safely do the scratch write for dst itself
1722        */
1723       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1724          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1725
1726       /* Now handle scratch access on any src. In this case, since inst->src[i]
1727        * already is a src_reg, we can just call emit_resolve_reladdr with
1728        * inst->src[i] and it will take care of handling scratch loads for
1729        * both src and src.reladdr (recursively).
1730        */
1731       for (int i = 0 ; i < 3; i++) {
1732          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1733                                              inst->src[i]);
1734       }
1735    }
1736 }
1737
1738 /**
1739  * Emits an instruction before @inst to load the value named by @orig_src
1740  * from the pull constant buffer (surface) at @base_offset to @temp.
1741  */
1742 void
1743 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1744                                       dst_reg temp, src_reg orig_src,
1745                                       int base_offset)
1746 {
1747    int reg_offset = base_offset + orig_src.reg_offset;
1748    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1749    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1750                                              reg_offset);
1751
1752    emit_pull_constant_load_reg(temp,
1753                                src_reg(index),
1754                                offset,
1755                                block, inst);
1756
1757    brw_mark_surface_used(&prog_data->base, index);
1758 }
1759
1760 /**
1761  * Implements array access of uniforms by inserting a
1762  * PULL_CONSTANT_LOAD instruction.
1763  *
1764  * Unlike temporary GRF array access (where we don't support it due to
1765  * the difficulty of doing relative addressing on instruction
1766  * destinations), we could potentially do array access of uniforms
1767  * that were loaded in GRF space as push constants.  In real-world
1768  * usage we've seen, though, the arrays being used are always larger
1769  * than we could load as push constants, so just always move all
1770  * uniform array access out to a pull constant buffer.
1771  */
1772 void
1773 vec4_visitor::move_uniform_array_access_to_pull_constants()
1774 {
1775    int pull_constant_loc[this->uniforms];
1776    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1777    bool nested_reladdr;
1778
1779    /* Walk through and find array access of uniforms.  Put a copy of that
1780     * uniform in the pull constant buffer.
1781     *
1782     * Note that we don't move constant-indexed accesses to arrays.  No
1783     * testing has been done of the performance impact of this choice.
1784     */
1785    do {
1786       nested_reladdr = false;
1787
1788       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1789          for (int i = 0 ; i < 3; i++) {
1790             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1791                continue;
1792
1793             int uniform = inst->src[i].reg;
1794
1795             if (inst->src[i].reladdr->reladdr)
1796                nested_reladdr = true;  /* will need another pass */
1797
1798             /* If this array isn't already present in the pull constant buffer,
1799              * add it.
1800              */
1801             if (pull_constant_loc[uniform] == -1) {
1802                const gl_constant_value **values =
1803                   &stage_prog_data->param[uniform * 4];
1804
1805                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1806
1807                assert(uniform < uniform_array_size);
1808                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1809                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1810                      = values[j];
1811                }
1812             }
1813
1814             /* Set up the annotation tracking for new generated instructions. */
1815             base_ir = inst->ir;
1816             current_annotation = inst->annotation;
1817
1818             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1819
1820             emit_pull_constant_load(block, inst, temp, inst->src[i],
1821                                     pull_constant_loc[uniform]);
1822
1823             inst->src[i].file = temp.file;
1824             inst->src[i].reg = temp.reg;
1825             inst->src[i].reg_offset = temp.reg_offset;
1826             inst->src[i].reladdr = NULL;
1827          }
1828       }
1829    } while (nested_reladdr);
1830
1831    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1832     * no need to track them as larger-than-vec4 objects.  This will be
1833     * relied on in cutting out unused uniform vectors from push
1834     * constants.
1835     */
1836    split_uniform_registers();
1837 }
1838
1839 void
1840 vec4_visitor::resolve_ud_negate(src_reg *reg)
1841 {
1842    if (reg->type != BRW_REGISTER_TYPE_UD ||
1843        !reg->negate)
1844       return;
1845
1846    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1847    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1848    *reg = temp;
1849 }
1850
1851 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1852                            void *log_data,
1853                            const struct brw_sampler_prog_key_data *key_tex,
1854                            struct brw_vue_prog_data *prog_data,
1855                            const nir_shader *shader,
1856                            void *mem_ctx,
1857                            bool no_spills,
1858                            int shader_time_index)
1859    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1860      key_tex(key_tex),
1861      prog_data(prog_data),
1862      fail_msg(NULL),
1863      first_non_payload_grf(0),
1864      need_all_constants_in_pull_buffer(false),
1865      no_spills(no_spills),
1866      shader_time_index(shader_time_index),
1867      last_scratch(0)
1868 {
1869    this->failed = false;
1870
1871    this->base_ir = NULL;
1872    this->current_annotation = NULL;
1873    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1874
1875    this->virtual_grf_start = NULL;
1876    this->virtual_grf_end = NULL;
1877    this->live_intervals = NULL;
1878
1879    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1880
1881    this->uniforms = 0;
1882
1883    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1884     * at least one. See setup_uniforms() in brw_vec4.cpp.
1885     */
1886    this->uniform_array_size = 1;
1887    if (prog_data) {
1888       this->uniform_array_size =
1889          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1890    }
1891
1892    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1893 }
1894
1895 vec4_visitor::~vec4_visitor()
1896 {
1897 }
1898
1899
1900 void
1901 vec4_visitor::fail(const char *format, ...)
1902 {
1903    va_list va;
1904    char *msg;
1905
1906    if (failed)
1907       return;
1908
1909    failed = true;
1910
1911    va_start(va, format);
1912    msg = ralloc_vasprintf(mem_ctx, format, va);
1913    va_end(va);
1914    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1915
1916    this->fail_msg = msg;
1917
1918    if (debug_enabled) {
1919       fprintf(stderr, "%s",  msg);
1920    }
1921 }
1922
1923 } /* namespace brw */