src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 src_reg
 280 vec4_visitor::fix_3src_operand(const src_reg &src)
 281 {
 282    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 283     * able to use vertical stride of zero to replicate the vec4 uniform, like
 284     *
 285     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 286     *
 287     * But you can't, since vertical stride is always four in three-source
 288     * instructions. Instead, insert a MOV instruction to do the replication so
 289     * that the three-source instruction can consume it.
 290     */
 291
 292    /* The MOV is only needed if the source is a uniform or immediate. */
 293    if (src.file != UNIFORM && src.file != IMM)
 294       return src;
 295
 296    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 297       return src;
 298
 299    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 300    expanded.type = src.type;
 301    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 302    return src_reg(expanded);
 303 }
 304
 305 src_reg
 306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 307 {
 308    if (!src.abs && !src.negate)
 309       return src;
 310
 311    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 312    resolved.type = src.type;
 313    emit(MOV(resolved, src));
 314
 315    return src_reg(resolved);
 316 }
 317
 318 src_reg
 319 vec4_visitor::fix_math_operand(const src_reg &src)
 320 {
 321    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 322       return src;
 323
 324    /* The gen6 math instruction ignores the source modifiers --
 325     * swizzle, abs, negate, and at least some parts of the register
 326     * region description.
 327     *
 328     * Rather than trying to enumerate all these cases, *always* expand the
 329     * operand to a temp GRF for gen6.
 330     *
 331     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 332     * can't use.
 333     */
 334
 335    if (devinfo->gen == 7 && src.file != IMM)
 336       return src;
 337
 338    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 339    expanded.type = src.type;
 340    emit(MOV(expanded, src));
 341    return src_reg(expanded);
 342 }
 343
 344 vec4_instruction *
 345 vec4_visitor::emit_math(enum opcode opcode,
 346                         const dst_reg &dst,
 347                         const src_reg &src0, const src_reg &src1)
 348 {
 349    vec4_instruction *math =
 350       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 351
 352    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 353       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 354       math->dst = dst_reg(this, glsl_type::vec4_type);
 355       math->dst.type = dst.type;
 356       math = emit(MOV(dst, src_reg(math->dst)));
 357    } else if (devinfo->gen < 6) {
 358       math->base_mrf = 1;
 359       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 360    }
 361
 362    return math;
 363 }
 364
 365 void
 366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 367 {
 368    if (devinfo->gen < 7) {
 369       unreachable("ir_unop_pack_half_2x16 should be lowered");
 370    }
 371
 372    assert(dst.type == BRW_REGISTER_TYPE_UD);
 373    assert(src0.type == BRW_REGISTER_TYPE_F);
 374
 375    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 376     *
 377     *   Because this instruction does not have a 16-bit floating-point type,
 378     *   the destination data type must be Word (W).
 379     *
 380     *   The destination must be DWord-aligned and specify a horizontal stride
 381     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 382     *   each destination channel and the upper word is not modified.
 383     *
 384     * The above restriction implies that the f32to16 instruction must use
 385     * align1 mode, because only in align1 mode is it possible to specify
 386     * horizontal stride.  We choose here to defy the hardware docs and emit
 387     * align16 instructions.
 388     *
 389     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 390     * instructions. I was partially successful in that the code passed all
 391     * tests.  However, the code was dubiously correct and fragile, and the
 392     * tests were not harsh enough to probe that frailty. Not trusting the
 393     * code, I chose instead to remain in align16 mode in defiance of the hw
 394     * docs).
 395     *
 396     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 397     * simulator, emitting a f32to16 in align16 mode with UD as destination
 398     * data type is safe. The behavior differs from that specified in the PRM
 399     * in that the upper word of each destination channel is cleared to 0.
 400     */
 401
 402    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 403    src_reg tmp_src(tmp_dst);
 404
 405 #if 0
 406    /* Verify the undocumented behavior on which the following instructions
 407     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 408     * then the result of the bit-or instruction below will be incorrect.
 409     *
 410     * You should inspect the disasm output in order to verify that the MOV is
 411     * not optimized away.
 412     */
 413    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 414 #endif
 415
 416    /* Give tmp the form below, where "." means untouched.
 417     *
 418     *     w z          y          x w z          y          x
 419     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 420     *
 421     * That the upper word of each write-channel be 0 is required for the
 422     * following bit-shift and bit-or instructions to work. Note that this
 423     * relies on the undocumented hardware behavior mentioned above.
 424     */
 425    tmp_dst.writemask = WRITEMASK_XY;
 426    emit(F32TO16(tmp_dst, src0));
 427
 428    /* Give the write-channels of dst the form:
 429     *   0xhhhh0000
 430     */
 431    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 432    emit(SHL(dst, tmp_src, src_reg(16u)));
 433
 434    /* Finally, give the write-channels of dst the form of packHalf2x16's
 435     * output:
 436     *   0xhhhhllll
 437     */
 438    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 439    emit(OR(dst, src_reg(dst), tmp_src));
 440 }
 441
 442 void
 443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 444 {
 445    if (devinfo->gen < 7) {
 446       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 447    }
 448
 449    assert(dst.type == BRW_REGISTER_TYPE_F);
 450    assert(src0.type == BRW_REGISTER_TYPE_UD);
 451
 452    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 453     *
 454     *   Because this instruction does not have a 16-bit floating-point type,
 455     *   the source data type must be Word (W). The destination type must be
 456     *   F (Float).
 457     *
 458     * To use W as the source data type, we must adjust horizontal strides,
 459     * which is only possible in align1 mode. All my [chadv] attempts at
 460     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 461     * Piglit tests, so I gave up.
 462     *
 463     * I've verified that, on gen7 hardware and the simulator, it is safe to
 464     * emit f16to32 in align16 mode with UD as source data type.
 465     */
 466
 467    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 468    src_reg tmp_src(tmp_dst);
 469
 470    tmp_dst.writemask = WRITEMASK_X;
 471    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 472
 473    tmp_dst.writemask = WRITEMASK_Y;
 474    emit(SHR(tmp_dst, src0, src_reg(16u)));
 475
 476    dst.writemask = WRITEMASK_XY;
 477    emit(F16TO32(dst, tmp_src));
 478 }
 479
 480 void
 481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 482 {
 483    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 484     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 485     * is not suitable to generate the shift values, but we can use the packed
 486     * vector float and a type-converting MOV.
 487     */
 488    dst_reg shift(this, glsl_type::uvec4_type);
 489    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 490
 491    dst_reg shifted(this, glsl_type::uvec4_type);
 492    src0.swizzle = BRW_SWIZZLE_XXXX;
 493    emit(SHR(shifted, src0, src_reg(shift)));
 494
 495    shifted.type = BRW_REGISTER_TYPE_UB;
 496    dst_reg f(this, glsl_type::vec4_type);
 497    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 498
 499    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 500 }
 501
 502 void
 503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 504 {
 505    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 506     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 507     * is not suitable to generate the shift values, but we can use the packed
 508     * vector float and a type-converting MOV.
 509     */
 510    dst_reg shift(this, glsl_type::uvec4_type);
 511    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 512
 513    dst_reg shifted(this, glsl_type::uvec4_type);
 514    src0.swizzle = BRW_SWIZZLE_XXXX;
 515    emit(SHR(shifted, src0, src_reg(shift)));
 516
 517    shifted.type = BRW_REGISTER_TYPE_B;
 518    dst_reg f(this, glsl_type::vec4_type);
 519    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 520
 521    dst_reg scaled(this, glsl_type::vec4_type);
 522    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 523
 524    dst_reg max(this, glsl_type::vec4_type);
 525    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 526    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 527 }
 528
 529 void
 530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 531 {
 532    dst_reg saturated(this, glsl_type::vec4_type);
 533    vec4_instruction *inst = emit(MOV(saturated, src0));
 534    inst->saturate = true;
 535
 536    dst_reg scaled(this, glsl_type::vec4_type);
 537    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 538
 539    dst_reg rounded(this, glsl_type::vec4_type);
 540    emit(RNDE(rounded, src_reg(scaled)));
 541
 542    dst_reg u(this, glsl_type::uvec4_type);
 543    emit(MOV(u, src_reg(rounded)));
 544
 545    src_reg bytes(u);
 546    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 547 }
 548
 549 void
 550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 551 {
 552    dst_reg max(this, glsl_type::vec4_type);
 553    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 554
 555    dst_reg min(this, glsl_type::vec4_type);
 556    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 557
 558    dst_reg scaled(this, glsl_type::vec4_type);
 559    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 560
 561    dst_reg rounded(this, glsl_type::vec4_type);
 562    emit(RNDE(rounded, src_reg(scaled)));
 563
 564    dst_reg i(this, glsl_type::ivec4_type);
 565    emit(MOV(i, src_reg(rounded)));
 566
 567    src_reg bytes(i);
 568    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 569 }
 570
 571 /**
 572  * Returns the minimum number of vec4 elements needed to pack a type.
 573  *
 574  * For simple types, it will return 1 (a single vec4); for matrices, the
 575  * number of columns; for array and struct, the sum of the vec4_size of
 576  * each of its elements; and for sampler and atomic, zero.
 577  *
 578  * This method is useful to calculate how much register space is needed to
 579  * store a particular type.
 580  */
 581 extern "C" int
 582 type_size_vec4(const struct glsl_type *type)
 583 {
 584    unsigned int i;
 585    int size;
 586
 587    switch (type->base_type) {
 588    case GLSL_TYPE_UINT:
 589    case GLSL_TYPE_INT:
 590    case GLSL_TYPE_FLOAT:
 591    case GLSL_TYPE_BOOL:
 592       if (type->is_matrix()) {
 593          return type->matrix_columns;
 594       } else {
 595          /* Regardless of size of vector, it gets a vec4. This is bad
 596           * packing for things like floats, but otherwise arrays become a
 597           * mess.  Hopefully a later pass over the code can pack scalars
 598           * down if appropriate.
 599           */
 600          return 1;
 601       }
 602    case GLSL_TYPE_ARRAY:
 603       assert(type->length > 0);
 604       return type_size_vec4(type->fields.array) * type->length;
 605    case GLSL_TYPE_STRUCT:
 606       size = 0;
 607       for (i = 0; i < type->length; i++) {
 608          size += type_size_vec4(type->fields.structure[i].type);
 609       }
 610       return size;
 611    case GLSL_TYPE_SUBROUTINE:
 612       return 1;
 613
 614    case GLSL_TYPE_SAMPLER:
 615       /* Samplers take up no register space, since they're baked in at
 616        * link time.
 617        */
 618       return 0;
 619    case GLSL_TYPE_ATOMIC_UINT:
 620       return 0;
 621    case GLSL_TYPE_IMAGE:
 622       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 623    case GLSL_TYPE_VOID:
 624    case GLSL_TYPE_DOUBLE:
 625    case GLSL_TYPE_ERROR:
 626    case GLSL_TYPE_INTERFACE:
 627       unreachable("not reached");
 628    }
 629
 630    return 0;
 631 }
 632
 633 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 634 {
 635    init();
 636
 637    this->file = GRF;
 638    this->reg = v->alloc.allocate(type_size_vec4(type));
 639
 640    if (type->is_array() || type->is_record()) {
 641       this->swizzle = BRW_SWIZZLE_NOOP;
 642    } else {
 643       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 644    }
 645
 646    this->type = brw_type_for_base_type(type);
 647 }
 648
 649 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 650 {
 651    assert(size > 0);
 652
 653    init();
 654
 655    this->file = GRF;
 656    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 657
 658    this->swizzle = BRW_SWIZZLE_NOOP;
 659
 660    this->type = brw_type_for_base_type(type);
 661 }
 662
 663 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 664 {
 665    init();
 666
 667    this->file = GRF;
 668    this->reg = v->alloc.allocate(type_size_vec4(type));
 669
 670    if (type->is_array() || type->is_record()) {
 671       this->writemask = WRITEMASK_XYZW;
 672    } else {
 673       this->writemask = (1 << type->vector_elements) - 1;
 674    }
 675
 676    this->type = brw_type_for_base_type(type);
 677 }
 678
 679 vec4_instruction *
 680 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
 681                           src_reg src0, src_reg src1)
 682 {
 683    vec4_instruction *inst;
 684
 685    if (devinfo->gen >= 6) {
 686       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 687       inst->conditional_mod = conditionalmod;
 688    } else {
 689       emit(CMP(dst, src0, src1, conditionalmod));
 690
 691       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 692       inst->predicate = BRW_PREDICATE_NORMAL;
 693    }
 694
 695    return inst;
 696 }
 697
 698 vec4_instruction *
 699 vec4_visitor::emit_lrp(const dst_reg &dst,
 700                        const src_reg &x, const src_reg &y, const src_reg &a)
 701 {
 702    if (devinfo->gen >= 6) {
 703       /* Note that the instruction's argument order is reversed from GLSL
 704        * and the IR.
 705        */
 706      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
 707                      fix_3src_operand(x)));
 708    } else {
 709       /* Earlier generations don't support three source operations, so we
 710        * need to emit x*(1-a) + y*a.
 711        */
 712       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
 713       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
 714       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
 715       y_times_a.writemask           = dst.writemask;
 716       one_minus_a.writemask         = dst.writemask;
 717       x_times_one_minus_a.writemask = dst.writemask;
 718
 719       emit(MUL(y_times_a, y, a));
 720       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
 721       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
 722       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
 723    }
 724 }
 725
 726 /**
 727  * Emits the instructions needed to perform a pull constant load. before_block
 728  * and before_inst can be NULL in which case the instruction will be appended
 729  * to the end of the instruction list.
 730  */
 731 void
 732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 733                                           src_reg surf_index,
 734                                           src_reg offset_reg,
 735                                           bblock_t *before_block,
 736                                           vec4_instruction *before_inst)
 737 {
 738    assert((before_inst == NULL && before_block == NULL) ||
 739           (before_inst && before_block));
 740
 741    vec4_instruction *pull;
 742
 743    if (devinfo->gen >= 9) {
 744       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 745       src_reg header(this, glsl_type::uvec4_type, 2);
 746
 747       pull = new(mem_ctx)
 748          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 749                           dst_reg(header));
 750
 751       if (before_inst)
 752          emit_before(before_block, before_inst, pull);
 753       else
 754          emit(pull);
 755
 756       dst_reg index_reg = retype(offset(dst_reg(header), 1),
 757                                  offset_reg.type);
 758       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
 759
 760       if (before_inst)
 761          emit_before(before_block, before_inst, pull);
 762       else
 763          emit(pull);
 764
 765       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 766                                            dst,
 767                                            surf_index,
 768                                            header);
 769       pull->mlen = 2;
 770       pull->header_size = 1;
 771    } else if (devinfo->gen >= 7) {
 772       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 773
 774       grf_offset.type = offset_reg.type;
 775
 776       pull = MOV(grf_offset, offset_reg);
 777
 778       if (before_inst)
 779          emit_before(before_block, before_inst, pull);
 780       else
 781          emit(pull);
 782
 783       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
 784                                            dst,
 785                                            surf_index,
 786                                            src_reg(grf_offset));
 787       pull->mlen = 1;
 788    } else {
 789       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
 790                                            dst,
 791                                            surf_index,
 792                                            offset_reg);
 793       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
 794       pull->mlen = 1;
 795    }
 796
 797    if (before_inst)
 798       emit_before(before_block, before_inst, pull);
 799    else
 800       emit(pull);
 801 }
 802
 803 src_reg
 804 vec4_visitor::emit_uniformize(const src_reg &src)
 805 {
 806    const src_reg chan_index(this, glsl_type::uint_type);
 807    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
 808                               src.type);
 809
 810    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
 811       ->force_writemask_all = true;
 812    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
 813       ->force_writemask_all = true;
 814
 815    return src_reg(dst);
 816 }
 817
 818 src_reg
 819 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
 820                              src_reg coordinate, src_reg sampler)
 821 {
 822    vec4_instruction *inst =
 823       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
 824                                     dst_reg(this, glsl_type::uvec4_type));
 825    inst->base_mrf = 2;
 826    inst->src[1] = sampler;
 827
 828    int param_base;
 829
 830    if (devinfo->gen >= 9) {
 831       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
 832       vec4_instruction *header_inst = new(mem_ctx)
 833          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
 834                           dst_reg(MRF, inst->base_mrf));
 835
 836       emit(header_inst);
 837
 838       inst->mlen = 2;
 839       inst->header_size = 1;
 840       param_base = inst->base_mrf + 1;
 841    } else {
 842       inst->mlen = 1;
 843       param_base = inst->base_mrf;
 844    }
 845
 846    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
 847    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
 848    int zero_mask = 0xf & ~coord_mask;
 849
 850    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
 851             coordinate));
 852
 853    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
 854             src_reg(0)));
 855
 856    emit(inst);
 857    return src_reg(inst->dst);
 858 }
 859
 860 bool
 861 vec4_visitor::is_high_sampler(src_reg sampler)
 862 {
 863    if (devinfo->gen < 8 && !devinfo->is_haswell)
 864       return false;
 865
 866    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 867 }
 868
 869 void
 870 vec4_visitor::emit_texture(ir_texture_opcode op,
 871                            dst_reg dest,
 872                            const glsl_type *dest_type,
 873                            src_reg coordinate,
 874                            int coord_components,
 875                            src_reg shadow_comparitor,
 876                            src_reg lod, src_reg lod2,
 877                            src_reg sample_index,
 878                            uint32_t constant_offset,
 879                            src_reg offset_value,
 880                            src_reg mcs,
 881                            bool is_cube_array,
 882                            uint32_t sampler,
 883                            src_reg sampler_reg)
 884 {
 885    enum opcode opcode;
 886    switch (op) {
 887    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
 888    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 889    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 890    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 891    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
 892    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 893    case ir_tg4: opcode = offset_value.file != BAD_FILE
 894                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
 895    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 896    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
 897    case ir_txb:
 898       unreachable("TXB is not valid for vertex shaders.");
 899    case ir_lod:
 900       unreachable("LOD is not valid for vertex shaders.");
 901    default:
 902       unreachable("Unrecognized tex op");
 903    }
 904
 905    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
 906       opcode, dst_reg(this, dest_type));
 907
 908    inst->offset = constant_offset;
 909
 910    /* The message header is necessary for:
 911     * - Gen4 (always)
 912     * - Gen9+ for selecting SIMD4x2
 913     * - Texel offsets
 914     * - Gather channel selection
 915     * - Sampler indices too large to fit in a 4-bit value.
 916     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
 917     */
 918    inst->header_size =
 919       (devinfo->gen < 5 || devinfo->gen >= 9 ||
 920        inst->offset != 0 || op == ir_tg4 ||
 921        op == ir_texture_samples ||
 922        is_high_sampler(sampler_reg)) ? 1 : 0;
 923    inst->base_mrf = 2;
 924    inst->mlen = inst->header_size;
 925    inst->dst.writemask = WRITEMASK_XYZW;
 926    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 927
 928    inst->src[1] = sampler_reg;
 929
 930    /* MRF for the first parameter */
 931    int param_base = inst->base_mrf + inst->header_size;
 932
 933    if (op == ir_txs || op == ir_query_levels) {
 934       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
 935       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
 936       inst->mlen++;
 937    } else if (op == ir_texture_samples) {
 938       inst->dst.writemask = WRITEMASK_X;
 939    } else {
 940       /* Load the coordinate */
 941       /* FINISHME: gl_clamp_mask and saturate */
 942       int coord_mask = (1 << coord_components) - 1;
 943       int zero_mask = 0xf & ~coord_mask;
 944
 945       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
 946                coordinate));
 947       inst->mlen++;
 948
 949       if (zero_mask != 0) {
 950          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
 951                   src_reg(0)));
 952       }
 953       /* Load the shadow comparitor */
 954       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
 955          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
 956                           WRITEMASK_X),
 957                   shadow_comparitor));
 958          inst->mlen++;
 959       }
 960
 961       /* Load the LOD info */
 962       if (op == ir_tex || op == ir_txl) {
 963          int mrf, writemask;
 964          if (devinfo->gen >= 5) {
 965             mrf = param_base + 1;
 966             if (shadow_comparitor.file != BAD_FILE) {
 967                writemask = WRITEMASK_Y;
 968                /* mlen already incremented */
 969             } else {
 970                writemask = WRITEMASK_X;
 971                inst->mlen++;
 972             }
 973          } else /* devinfo->gen == 4 */ {
 974             mrf = param_base;
 975             writemask = WRITEMASK_W;
 976          }
 977          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
 978       } else if (op == ir_txf) {
 979          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
 980       } else if (op == ir_txf_ms) {
 981          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
 982                   sample_index));
 983          if (devinfo->gen >= 7) {
 984             /* MCS data is in the first channel of `mcs`, but we need to get it into
 985              * the .y channel of the second vec4 of params, so replicate .x across
 986              * the whole vec4 and then mask off everything except .y
 987              */
 988             mcs.swizzle = BRW_SWIZZLE_XXXX;
 989             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
 990                      mcs));
 991          }
 992          inst->mlen++;
 993       } else if (op == ir_txd) {
 994          const brw_reg_type type = lod.type;
 995
 996          if (devinfo->gen >= 5) {
 997             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
 998             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
 999             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1000             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1001             inst->mlen++;
1002
1003             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1004                lod.swizzle = BRW_SWIZZLE_ZZZZ;
1005                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1006                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1007                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1008                inst->mlen++;
1009
1010                if (shadow_comparitor.file != BAD_FILE) {
1011                   emit(MOV(dst_reg(MRF, param_base + 2,
1012                                    shadow_comparitor.type, WRITEMASK_Z),
1013                            shadow_comparitor));
1014                }
1015             }
1016          } else /* devinfo->gen == 4 */ {
1017             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1018             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1019             inst->mlen += 2;
1020          }
1021       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1022          if (shadow_comparitor.file != BAD_FILE) {
1023             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1024                      shadow_comparitor));
1025          }
1026
1027          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1028                   offset_value));
1029          inst->mlen++;
1030       }
1031    }
1032
1033    emit(inst);
1034
1035    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1036     * spec requires layers.
1037     */
1038    if (op == ir_txs && is_cube_array) {
1039       emit_math(SHADER_OPCODE_INT_QUOTIENT,
1040                 writemask(inst->dst, WRITEMASK_Z),
1041                 src_reg(inst->dst), src_reg(6));
1042    }
1043
1044    if (devinfo->gen == 6 && op == ir_tg4) {
1045       emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1046    }
1047
1048    swizzle_result(op, dest,
1049                   src_reg(inst->dst), sampler, dest_type);
1050 }
1051
1052 /**
1053  * Apply workarounds for Gen6 gather with UINT/SINT
1054  */
1055 void
1056 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1057 {
1058    if (!wa)
1059       return;
1060
1061    int width = (wa & WA_8BIT) ? 8 : 16;
1062    dst_reg dst_f = dst;
1063    dst_f.type = BRW_REGISTER_TYPE_F;
1064
1065    /* Convert from UNORM to UINT */
1066    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1067    emit(MOV(dst, src_reg(dst_f)));
1068
1069    if (wa & WA_SIGN) {
1070       /* Reinterpret the UINT value as a signed INT value by
1071        * shifting the sign bit into place, then shifting back
1072        * preserving sign.
1073        */
1074       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1075       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1076    }
1077 }
1078
1079 /**
1080  * Set up the gather channel based on the swizzle, for gather4.
1081  */
1082 uint32_t
1083 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1084 {
1085    int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1086    switch (swiz) {
1087       case SWIZZLE_X: return 0;
1088       case SWIZZLE_Y:
1089          /* gather4 sampler is broken for green channel on RG32F --
1090           * we must ask for blue instead.
1091           */
1092          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1093             return 2;
1094          return 1;
1095       case SWIZZLE_Z: return 2;
1096       case SWIZZLE_W: return 3;
1097       default:
1098          unreachable("Not reached"); /* zero, one swizzles handled already */
1099    }
1100 }
1101
1102 void
1103 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1104                              src_reg orig_val, uint32_t sampler,
1105                              const glsl_type *dest_type)
1106 {
1107    int s = key_tex->swizzles[sampler];
1108
1109    dst_reg swizzled_result = dest;
1110
1111    if (op == ir_query_levels) {
1112       /* # levels is in .w */
1113       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1114       emit(MOV(swizzled_result, orig_val));
1115       return;
1116    }
1117
1118    if (op == ir_txs || dest_type == glsl_type::float_type
1119                         || s == SWIZZLE_NOOP || op == ir_tg4) {
1120       emit(MOV(swizzled_result, orig_val));
1121       return;
1122    }
1123
1124
1125    int zero_mask = 0, one_mask = 0, copy_mask = 0;
1126    int swizzle[4] = {0};
1127
1128    for (int i = 0; i < 4; i++) {
1129       switch (GET_SWZ(s, i)) {
1130       case SWIZZLE_ZERO:
1131          zero_mask |= (1 << i);
1132          break;
1133       case SWIZZLE_ONE:
1134          one_mask |= (1 << i);
1135          break;
1136       default:
1137          copy_mask |= (1 << i);
1138          swizzle[i] = GET_SWZ(s, i);
1139          break;
1140       }
1141    }
1142
1143    if (copy_mask) {
1144       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1145       swizzled_result.writemask = copy_mask;
1146       emit(MOV(swizzled_result, orig_val));
1147    }
1148
1149    if (zero_mask) {
1150       swizzled_result.writemask = zero_mask;
1151       emit(MOV(swizzled_result, src_reg(0.0f)));
1152    }
1153
1154    if (one_mask) {
1155       swizzled_result.writemask = one_mask;
1156       emit(MOV(swizzled_result, src_reg(1.0f)));
1157    }
1158 }
1159
1160 void
1161 vec4_visitor::gs_emit_vertex(int stream_id)
1162 {
1163    unreachable("not reached");
1164 }
1165
1166 void
1167 vec4_visitor::gs_end_primitive()
1168 {
1169    unreachable("not reached");
1170 }
1171
1172 void
1173 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1174                                   dst_reg dst, src_reg offset,
1175                                   src_reg src0, src_reg src1)
1176 {
1177    unsigned mlen = 0;
1178
1179    /* Set the atomic operation offset. */
1180    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1181    mlen++;
1182
1183    /* Set the atomic operation arguments. */
1184    if (src0.file != BAD_FILE) {
1185       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1186       mlen++;
1187    }
1188
1189    if (src1.file != BAD_FILE) {
1190       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1191       mlen++;
1192    }
1193
1194    /* Emit the instruction.  Note that this maps to the normal SIMD8
1195     * untyped atomic message on Ivy Bridge, but that's OK because
1196     * unused channels will be masked out.
1197     */
1198    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1199                                  brw_message_reg(0),
1200                                  src_reg(surf_index), src_reg(atomic_op));
1201    inst->mlen = mlen;
1202 }
1203
1204 void
1205 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1206                                         src_reg offset)
1207 {
1208    /* Set the surface read offset. */
1209    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1210
1211    /* Emit the instruction.  Note that this maps to the normal SIMD8
1212     * untyped surface read message, but that's OK because unused
1213     * channels will be masked out.
1214     */
1215    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1216                                  brw_message_reg(0),
1217                                  src_reg(surf_index), src_reg(1));
1218    inst->mlen = 1;
1219 }
1220
1221 void
1222 vec4_visitor::emit_ndc_computation()
1223 {
1224    if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1225       return;
1226
1227    /* Get the position */
1228    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1229
1230    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1231    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1232    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1233
1234    current_annotation = "NDC";
1235    dst_reg ndc_w = ndc;
1236    ndc_w.writemask = WRITEMASK_W;
1237    src_reg pos_w = pos;
1238    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1239    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1240
1241    dst_reg ndc_xyz = ndc;
1242    ndc_xyz.writemask = WRITEMASK_XYZ;
1243
1244    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1245 }
1246
1247 void
1248 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1249 {
1250    if (devinfo->gen < 6 &&
1251        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1252         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1253         devinfo->has_negative_rhw_bug)) {
1254       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1255       dst_reg header1_w = header1;
1256       header1_w.writemask = WRITEMASK_W;
1257
1258       emit(MOV(header1, 0u));
1259
1260       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1261          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1262
1263          current_annotation = "Point size";
1264          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1265          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1266       }
1267
1268       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1269          current_annotation = "Clipping flags";
1270          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1271          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1272
1273          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1274          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1275          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1276
1277          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1278          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1279          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1280          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1281       }
1282
1283       /* i965 clipping workaround:
1284        * 1) Test for -ve rhw
1285        * 2) If set,
1286        *      set ndc = (0,0,0,0)
1287        *      set ucp[6] = 1
1288        *
1289        * Later, clipping will detect ucp[6] and ensure the primitive is
1290        * clipped against all fixed planes.
1291        */
1292       if (devinfo->has_negative_rhw_bug &&
1293           output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1294          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1295          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1296          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1297          vec4_instruction *inst;
1298          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1299          inst->predicate = BRW_PREDICATE_NORMAL;
1300          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1301          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1302          inst->predicate = BRW_PREDICATE_NORMAL;
1303       }
1304
1305       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1306    } else if (devinfo->gen < 6) {
1307       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1308    } else {
1309       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1310       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1311          dst_reg reg_w = reg;
1312          reg_w.writemask = WRITEMASK_W;
1313          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1314          reg_as_src.type = reg_w.type;
1315          reg_as_src.swizzle = brw_swizzle_for_size(1);
1316          emit(MOV(reg_w, reg_as_src));
1317       }
1318       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1319          dst_reg reg_y = reg;
1320          reg_y.writemask = WRITEMASK_Y;
1321          reg_y.type = BRW_REGISTER_TYPE_D;
1322          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1323          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1324       }
1325       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1326          dst_reg reg_z = reg;
1327          reg_z.writemask = WRITEMASK_Z;
1328          reg_z.type = BRW_REGISTER_TYPE_D;
1329          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1330          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1331       }
1332    }
1333 }
1334
1335 vec4_instruction *
1336 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1337 {
1338    assert(varying < VARYING_SLOT_MAX);
1339    assert(output_reg[varying].type == reg.type);
1340    current_annotation = output_reg_annotation[varying];
1341    if (output_reg[varying].file != BAD_FILE)
1342       return emit(MOV(reg, src_reg(output_reg[varying])));
1343    else
1344       return NULL;
1345 }
1346
1347 void
1348 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1349 {
1350    reg.type = BRW_REGISTER_TYPE_F;
1351    output_reg[varying].type = reg.type;
1352
1353    switch (varying) {
1354    case VARYING_SLOT_PSIZ:
1355    {
1356       /* PSIZ is always in slot 0, and is coupled with other flags. */
1357       current_annotation = "indices, point width, clip flags";
1358       emit_psiz_and_flags(reg);
1359       break;
1360    }
1361    case BRW_VARYING_SLOT_NDC:
1362       current_annotation = "NDC";
1363       if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1364          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1365       break;
1366    case VARYING_SLOT_POS:
1367       current_annotation = "gl_Position";
1368       if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1369          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1370       break;
1371    case VARYING_SLOT_EDGE:
1372       /* This is present when doing unfilled polygons.  We're supposed to copy
1373        * the edge flag from the user-provided vertex array
1374        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1375        * of that attribute (starts as 1.0f).  This is then used in clipping to
1376        * determine which edges should be drawn as wireframe.
1377        */
1378       current_annotation = "edge flag";
1379       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1380                                     glsl_type::float_type, WRITEMASK_XYZW))));
1381       break;
1382    case BRW_VARYING_SLOT_PAD:
1383       /* No need to write to this slot */
1384       break;
1385    default:
1386       emit_generic_urb_slot(reg, varying);
1387       break;
1388    }
1389 }
1390
1391 static int
1392 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1393 {
1394    if (devinfo->gen >= 6) {
1395       /* URB data written (does not include the message header reg) must
1396        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1397        * section 5.4.3.2.2: URB_INTERLEAVED.
1398        *
1399        * URB entries are allocated on a multiple of 1024 bits, so an
1400        * extra 128 bits written here to make the end align to 256 is
1401        * no problem.
1402        */
1403       if ((mlen % 2) != 1)
1404          mlen++;
1405    }
1406
1407    return mlen;
1408 }
1409
1410
1411 /**
1412  * Generates the VUE payload plus the necessary URB write instructions to
1413  * output it.
1414  *
1415  * The VUE layout is documented in Volume 2a.
1416  */
1417 void
1418 vec4_visitor::emit_vertex()
1419 {
1420    /* MRF 0 is reserved for the debugger, so start with message header
1421     * in MRF 1.
1422     */
1423    int base_mrf = 1;
1424    int mrf = base_mrf;
1425    /* In the process of generating our URB write message contents, we
1426     * may need to unspill a register or load from an array.  Those
1427     * reads would use MRFs 14-15.
1428     */
1429    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1430
1431    /* The following assertion verifies that max_usable_mrf causes an
1432     * even-numbered amount of URB write data, which will meet gen6's
1433     * requirements for length alignment.
1434     */
1435    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1436
1437    /* First mrf is the g0-based message header containing URB handles and
1438     * such.
1439     */
1440    emit_urb_write_header(mrf++);
1441
1442    if (devinfo->gen < 6) {
1443       emit_ndc_computation();
1444    }
1445
1446    /* We may need to split this up into several URB writes, so do them in a
1447     * loop.
1448     */
1449    int slot = 0;
1450    bool complete = false;
1451    do {
1452       /* URB offset is in URB row increments, and each of our MRFs is half of
1453        * one of those, since we're doing interleaved writes.
1454        */
1455       int offset = slot / 2;
1456
1457       mrf = base_mrf + 1;
1458       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1459          emit_urb_slot(dst_reg(MRF, mrf++),
1460                        prog_data->vue_map.slot_to_varying[slot]);
1461
1462          /* If this was max_usable_mrf, we can't fit anything more into this
1463           * URB WRITE. Same thing if we reached the maximum length available.
1464           */
1465          if (mrf > max_usable_mrf ||
1466              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1467             slot++;
1468             break;
1469          }
1470       }
1471
1472       complete = slot >= prog_data->vue_map.num_slots;
1473       current_annotation = "URB write";
1474       vec4_instruction *inst = emit_urb_write_opcode(complete);
1475       inst->base_mrf = base_mrf;
1476       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1477       inst->offset += offset;
1478    } while(!complete);
1479 }
1480
1481
1482 src_reg
1483 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1484                                  src_reg *reladdr, int reg_offset)
1485 {
1486    /* Because we store the values to scratch interleaved like our
1487     * vertex data, we need to scale the vec4 index by 2.
1488     */
1489    int message_header_scale = 2;
1490
1491    /* Pre-gen6, the message header uses byte offsets instead of vec4
1492     * (16-byte) offset units.
1493     */
1494    if (devinfo->gen < 6)
1495       message_header_scale *= 16;
1496
1497    if (reladdr) {
1498       src_reg index = src_reg(this, glsl_type::int_type);
1499
1500       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1501                                    src_reg(reg_offset)));
1502       emit_before(block, inst, MUL(dst_reg(index), index,
1503                                    src_reg(message_header_scale)));
1504
1505       return index;
1506    } else {
1507       return src_reg(reg_offset * message_header_scale);
1508    }
1509 }
1510
1511 src_reg
1512 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1513                                        src_reg *reladdr, int reg_offset)
1514 {
1515    if (reladdr) {
1516       src_reg index = src_reg(this, glsl_type::int_type);
1517
1518       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1519                                    src_reg(reg_offset)));
1520
1521       /* Pre-gen6, the message header uses byte offsets instead of vec4
1522        * (16-byte) offset units.
1523        */
1524       if (devinfo->gen < 6) {
1525          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1526       }
1527
1528       return index;
1529    } else if (devinfo->gen >= 8) {
1530       /* Store the offset in a GRF so we can send-from-GRF. */
1531       src_reg offset = src_reg(this, glsl_type::int_type);
1532       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1533       return offset;
1534    } else {
1535       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1536       return src_reg(reg_offset * message_header_scale);
1537    }
1538 }
1539
1540 /**
1541  * Emits an instruction before @inst to load the value named by @orig_src
1542  * from scratch space at @base_offset to @temp.
1543  *
1544  * @base_offset is measured in 32-byte units (the size of a register).
1545  */
1546 void
1547 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1548                                 dst_reg temp, src_reg orig_src,
1549                                 int base_offset)
1550 {
1551    int reg_offset = base_offset + orig_src.reg_offset;
1552    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1553                                       reg_offset);
1554
1555    emit_before(block, inst, SCRATCH_READ(temp, index));
1556 }
1557
1558 /**
1559  * Emits an instruction after @inst to store the value to be written
1560  * to @orig_dst to scratch space at @base_offset, from @temp.
1561  *
1562  * @base_offset is measured in 32-byte units (the size of a register).
1563  */
1564 void
1565 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1566                                  int base_offset)
1567 {
1568    int reg_offset = base_offset + inst->dst.reg_offset;
1569    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1570                                       reg_offset);
1571
1572    /* Create a temporary register to store *inst's result in.
1573     *
1574     * We have to be careful in MOVing from our temporary result register in
1575     * the scratch write.  If we swizzle from channels of the temporary that
1576     * weren't initialized, it will confuse live interval analysis, which will
1577     * make spilling fail to make progress.
1578     */
1579    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1580                                        inst->dst.type),
1581                                 brw_swizzle_for_mask(inst->dst.writemask));
1582    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1583                                        inst->dst.writemask));
1584    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1585    if (inst->opcode != BRW_OPCODE_SEL)
1586       write->predicate = inst->predicate;
1587    write->ir = inst->ir;
1588    write->annotation = inst->annotation;
1589    inst->insert_after(block, write);
1590
1591    inst->dst.file = temp.file;
1592    inst->dst.reg = temp.reg;
1593    inst->dst.reg_offset = temp.reg_offset;
1594    inst->dst.reladdr = NULL;
1595 }
1596
1597 /**
1598  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1599  * adds the scratch read(s) before \p inst. The function also checks for
1600  * recursive reladdr scratch accesses, issuing the corresponding scratch
1601  * loads and rewriting reladdr references accordingly.
1602  *
1603  * \return \p src if it did not require a scratch load, otherwise, the
1604  * register holding the result of the scratch load that the caller should
1605  * use to rewrite src.
1606  */
1607 src_reg
1608 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1609                                    vec4_instruction *inst, src_reg src)
1610 {
1611    /* Resolve recursive reladdr scratch access by calling ourselves
1612     * with src.reladdr
1613     */
1614    if (src.reladdr)
1615       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1616                                           *src.reladdr);
1617
1618    /* Now handle scratch access on src */
1619    if (src.file == GRF && scratch_loc[src.reg] != -1) {
1620       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1621       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1622       src.reg = temp.reg;
1623       src.reg_offset = temp.reg_offset;
1624       src.reladdr = NULL;
1625    }
1626
1627    return src;
1628 }
1629
1630 /**
1631  * We can't generally support array access in GRF space, because a
1632  * single instruction's destination can only span 2 contiguous
1633  * registers.  So, we send all GRF arrays that get variable index
1634  * access to scratch space.
1635  */
1636 void
1637 vec4_visitor::move_grf_array_access_to_scratch()
1638 {
1639    int scratch_loc[this->alloc.count];
1640    memset(scratch_loc, -1, sizeof(scratch_loc));
1641
1642    /* First, calculate the set of virtual GRFs that need to be punted
1643     * to scratch due to having any array access on them, and where in
1644     * scratch.
1645     */
1646    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1647       if (inst->dst.file == GRF && inst->dst.reladdr) {
1648          if (scratch_loc[inst->dst.reg] == -1) {
1649             scratch_loc[inst->dst.reg] = last_scratch;
1650             last_scratch += this->alloc.sizes[inst->dst.reg];
1651          }
1652
1653          for (src_reg *iter = inst->dst.reladdr;
1654               iter->reladdr;
1655               iter = iter->reladdr) {
1656             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1657                scratch_loc[iter->reg] = last_scratch;
1658                last_scratch += this->alloc.sizes[iter->reg];
1659             }
1660          }
1661       }
1662
1663       for (int i = 0 ; i < 3; i++) {
1664          for (src_reg *iter = &inst->src[i];
1665               iter->reladdr;
1666               iter = iter->reladdr) {
1667             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1668                scratch_loc[iter->reg] = last_scratch;
1669                last_scratch += this->alloc.sizes[iter->reg];
1670             }
1671          }
1672       }
1673    }
1674
1675    /* Now, for anything that will be accessed through scratch, rewrite
1676     * it to load/store.  Note that this is a _safe list walk, because
1677     * we may generate a new scratch_write instruction after the one
1678     * we're processing.
1679     */
1680    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1681       /* Set up the annotation tracking for new generated instructions. */
1682       base_ir = inst->ir;
1683       current_annotation = inst->annotation;
1684
1685       /* First handle scratch access on the dst. Notice we have to handle
1686        * the case where the dst's reladdr also points to scratch space.
1687        */
1688       if (inst->dst.reladdr)
1689          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1690                                                    *inst->dst.reladdr);
1691
1692       /* Now that we have handled any (possibly recursive) reladdr scratch
1693        * accesses for dst we can safely do the scratch write for dst itself
1694        */
1695       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1696          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1697
1698       /* Now handle scratch access on any src. In this case, since inst->src[i]
1699        * already is a src_reg, we can just call emit_resolve_reladdr with
1700        * inst->src[i] and it will take care of handling scratch loads for
1701        * both src and src.reladdr (recursively).
1702        */
1703       for (int i = 0 ; i < 3; i++) {
1704          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1705                                              inst->src[i]);
1706       }
1707    }
1708 }
1709
1710 /**
1711  * Emits an instruction before @inst to load the value named by @orig_src
1712  * from the pull constant buffer (surface) at @base_offset to @temp.
1713  */
1714 void
1715 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1716                                       dst_reg temp, src_reg orig_src,
1717                                       int base_offset)
1718 {
1719    int reg_offset = base_offset + orig_src.reg_offset;
1720    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1721    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1722                                              reg_offset);
1723
1724    emit_pull_constant_load_reg(temp,
1725                                index,
1726                                offset,
1727                                block, inst);
1728 }
1729
1730 /**
1731  * Implements array access of uniforms by inserting a
1732  * PULL_CONSTANT_LOAD instruction.
1733  *
1734  * Unlike temporary GRF array access (where we don't support it due to
1735  * the difficulty of doing relative addressing on instruction
1736  * destinations), we could potentially do array access of uniforms
1737  * that were loaded in GRF space as push constants.  In real-world
1738  * usage we've seen, though, the arrays being used are always larger
1739  * than we could load as push constants, so just always move all
1740  * uniform array access out to a pull constant buffer.
1741  */
1742 void
1743 vec4_visitor::move_uniform_array_access_to_pull_constants()
1744 {
1745    int pull_constant_loc[this->uniforms];
1746    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1747    bool nested_reladdr;
1748
1749    /* Walk through and find array access of uniforms.  Put a copy of that
1750     * uniform in the pull constant buffer.
1751     *
1752     * Note that we don't move constant-indexed accesses to arrays.  No
1753     * testing has been done of the performance impact of this choice.
1754     */
1755    do {
1756       nested_reladdr = false;
1757
1758       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1759          for (int i = 0 ; i < 3; i++) {
1760             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1761                continue;
1762
1763             int uniform = inst->src[i].reg;
1764
1765             if (inst->src[i].reladdr->reladdr)
1766                nested_reladdr = true;  /* will need another pass */
1767
1768             /* If this array isn't already present in the pull constant buffer,
1769              * add it.
1770              */
1771             if (pull_constant_loc[uniform] == -1) {
1772                const gl_constant_value **values =
1773                   &stage_prog_data->param[uniform * 4];
1774
1775                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1776
1777                assert(uniform < uniform_array_size);
1778                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1779                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1780                      = values[j];
1781                }
1782             }
1783
1784             /* Set up the annotation tracking for new generated instructions. */
1785             base_ir = inst->ir;
1786             current_annotation = inst->annotation;
1787
1788             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1789
1790             emit_pull_constant_load(block, inst, temp, inst->src[i],
1791                                     pull_constant_loc[uniform]);
1792
1793             inst->src[i].file = temp.file;
1794             inst->src[i].reg = temp.reg;
1795             inst->src[i].reg_offset = temp.reg_offset;
1796             inst->src[i].reladdr = NULL;
1797          }
1798       }
1799    } while (nested_reladdr);
1800
1801    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1802     * no need to track them as larger-than-vec4 objects.  This will be
1803     * relied on in cutting out unused uniform vectors from push
1804     * constants.
1805     */
1806    split_uniform_registers();
1807 }
1808
1809 void
1810 vec4_visitor::resolve_ud_negate(src_reg *reg)
1811 {
1812    if (reg->type != BRW_REGISTER_TYPE_UD ||
1813        !reg->negate)
1814       return;
1815
1816    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1817    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1818    *reg = temp;
1819 }
1820
1821 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1822                            void *log_data,
1823                            const struct brw_sampler_prog_key_data *key_tex,
1824                            struct brw_vue_prog_data *prog_data,
1825                            const nir_shader *shader,
1826                            void *mem_ctx,
1827                            bool no_spills,
1828                            int shader_time_index)
1829    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1830      key_tex(key_tex),
1831      prog_data(prog_data),
1832      fail_msg(NULL),
1833      first_non_payload_grf(0),
1834      need_all_constants_in_pull_buffer(false),
1835      no_spills(no_spills),
1836      shader_time_index(shader_time_index),
1837      last_scratch(0)
1838 {
1839    this->failed = false;
1840
1841    this->base_ir = NULL;
1842    this->current_annotation = NULL;
1843    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1844
1845    this->virtual_grf_start = NULL;
1846    this->virtual_grf_end = NULL;
1847    this->live_intervals = NULL;
1848
1849    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1850
1851    this->uniforms = 0;
1852
1853    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1854     * at least one. See setup_uniforms() in brw_vec4.cpp.
1855     */
1856    this->uniform_array_size = 1;
1857    if (prog_data) {
1858       this->uniform_array_size =
1859          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1860    }
1861
1862    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1863 }
1864
/**
 * Destructor.  The body is empty: no cleanup is performed here.
 *
 * NOTE(review): the constructor allocates uniform_size out of mem_ctx, so
 * that storage is presumably released together with the context by its
 * owner -- confirm.
 */
vec4_visitor::~vec4_visitor()
{
}
1868
1869
1870 void
1871 vec4_visitor::fail(const char *format, ...)
1872 {
1873    va_list va;
1874    char *msg;
1875
1876    if (failed)
1877       return;
1878
1879    failed = true;
1880
1881    va_start(va, format);
1882    msg = ralloc_vasprintf(mem_ctx, format, va);
1883    va_end(va);
1884    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1885
1886    this->fail_msg = msg;
1887
1888    if (debug_enabled) {
1889       fprintf(stderr, "%s",  msg);
1890    }
1891 }
1892
1893 } /* namespace brw */