i965/vec4: Use MRF registers 21-23 for spilling in gen6
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
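/* The scratch read/write messages below are at most three registers long
 * (header plus payload), so this puts the spill MRFs at the top of the
 * message register file: m21-m23 on gen6, which exposes 24 MRFs, and
 * m13-m15 elsewhere (16 MRFs on gen4/5; gen7+ only emulates MRFs).
 */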
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
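/* Copy a source through a MOV when it carries abs/negate modifiers, so that
 * instructions which can't honor source modifiers see a plain register.
 */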
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * The upper word of each write-channel must be 0 for the following
434 * bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
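/* 0x00, 0x60, 0x70 and 0x78 are the packed vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the uvec4 "shift"
 * register turns them into the integer shift counts <0, 8, 16, 24>.
 */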
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for array and struct, the sum of the vec4_size of
597 * each of its elements; and for sampler and atomic, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of the size of the vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 unreachable("not reached");
649 }
650
651 return 0;
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size_vec4(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->swizzle = BRW_SWIZZLE_NOOP;
663 } else {
664 this->swizzle = brw_swizzle_for_size(type->vector_elements);
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
671 {
672 assert(size > 0);
673
674 init();
675
676 this->file = GRF;
677 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
678
679 this->swizzle = BRW_SWIZZLE_NOOP;
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = GRF;
689 this->reg = v->alloc.allocate(type_size_vec4(type));
690
691 if (type->is_array() || type->is_record()) {
692 this->writemask = WRITEMASK_XYZW;
693 } else {
694 this->writemask = (1 << type->vector_elements) - 1;
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
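/* Store the first n components of a vec4 uniform slot starting at
 * param_offset (counted in scalar components), zero-fill the remaining
 * components so the slot is always a full vec4, and record the slot's real
 * vector width in uniform_vector_size[].
 */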
700 void
701 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
702 const gl_constant_value *values,
703 unsigned n)
704 {
705 static const gl_constant_value zero = { 0 };
706
707 assert(param_offset % 4 == 0);
708
709 for (unsigned i = 0; i < n; ++i)
710 stage_prog_data->param[param_offset + i] = &values[i];
711
712 for (unsigned i = n; i < 4; ++i)
713 stage_prog_data->param[param_offset + i] = &zero;
714
715 uniform_vector_size[param_offset / 4] = n;
716 }
717
718 /* Our support for uniforms is piggy-backed on the struct
719 * gl_fragment_program, because that's where the values actually
720 * get stored, rather than in some global gl_shader_program uniform
721 * store.
722 */
723 void
724 vec4_visitor::setup_uniform_values(ir_variable *ir)
725 {
726 int namelen = strlen(ir->name);
727
728 /* The data for our (non-builtin) uniforms is stored in a series of
729 * gl_uniform_driver_storage structs for each subcomponent that
730 * glGetUniformLocation() could name. We know it's been set up in the same
731 * order we'd walk the type, so walk the list of storage and find anything
732 * with our name, or the prefix of a component that starts with our name.
733 */
734 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
735 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
736
737 if (storage->builtin)
738 continue;
739
740 if (strncmp(ir->name, storage->name, namelen) != 0 ||
741 (storage->name[namelen] != 0 &&
742 storage->name[namelen] != '.' &&
743 storage->name[namelen] != '[')) {
744 continue;
745 }
746
747 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
748 storage->type->matrix_columns);
749 const unsigned vector_size = storage->type->vector_elements;
750
751 for (unsigned s = 0; s < vector_count; s++) {
752 setup_vec4_uniform_value(uniforms * 4,
753 &storage->storage[s * vector_size],
754 vector_size);
755 uniforms++;
756 }
757 }
758 }
759
760 /* Our support for builtin uniforms is even scarier than non-builtin.
761 * It sits on top of the PROG_STATE_VAR parameters that are
762 * automatically updated from GL context state.
763 */
764 void
765 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
766 {
767 const ir_state_slot *const slots = ir->get_state_slots();
768 assert(slots != NULL);
769
770 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
771 /* This state reference has already been set up by ir_to_mesa,
772 * but we'll get the same index back here. We can reference
773 * ParameterValues directly, since unlike brw_fs.cpp, we never
774 * add new state references during compile.
775 */
776 int index = _mesa_add_state_reference(this->prog->Parameters,
777 (gl_state_index *)slots[i].tokens);
778 gl_constant_value *values =
779 &this->prog->Parameters->ParameterValues[index][0];
780
781 assert(this->uniforms < uniform_array_size);
782
783 for (unsigned j = 0; j < 4; j++)
784 stage_prog_data->param[this->uniforms * 4 + j] =
785 &values[GET_SWZ(slots[i].swizzle, j)];
786
787 this->uniform_vector_size[this->uniforms] =
788 (ir->type->is_scalar() || ir->type->is_vector() ||
789 ir->type->is_matrix() ? ir->type->vector_elements : 4);
790
791 this->uniforms++;
792 }
793 }
794
795 dst_reg *
796 vec4_visitor::variable_storage(ir_variable *var)
797 {
798 return (dst_reg *)hash_table_find(this->variable_ht, var);
799 }
800
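/* Evaluate a boolean rvalue and load its value into the flag register,
 * returning in *predicate the predication mode later instructions should use
 * to test it.  Where possible the expression is folded directly into a
 * conditional-mod instruction instead of materializing a 0/~0 boolean first.
 */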
801 void
802 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
803 enum brw_predicate *predicate)
804 {
805 ir_expression *expr = ir->as_expression();
806
807 *predicate = BRW_PREDICATE_NORMAL;
808
809 if (expr && expr->operation != ir_binop_ubo_load) {
810 src_reg op[3];
811 vec4_instruction *inst;
812
813 assert(expr->get_num_operands() <= 3);
814 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
815 expr->operands[i]->accept(this);
816 op[i] = this->result;
817
818 resolve_ud_negate(&op[i]);
819 }
820
821 switch (expr->operation) {
822 case ir_unop_logic_not:
823 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
824 inst->conditional_mod = BRW_CONDITIONAL_Z;
825 break;
826
827 case ir_binop_logic_xor:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(XOR(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(XOR(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_binop_logic_or:
839 if (devinfo->gen <= 5) {
840 src_reg temp = src_reg(this, ir->type);
841 emit(OR(dst_reg(temp), op[0], op[1]));
842 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
843 } else {
844 inst = emit(OR(dst_null_d(), op[0], op[1]));
845 }
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 break;
848
849 case ir_binop_logic_and:
850 if (devinfo->gen <= 5) {
851 src_reg temp = src_reg(this, ir->type);
852 emit(AND(dst_reg(temp), op[0], op[1]));
853 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
854 } else {
855 inst = emit(AND(dst_null_d(), op[0], op[1]));
856 }
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 break;
859
860 case ir_unop_f2b:
861 if (devinfo->gen >= 6) {
862 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
863 } else {
864 inst = emit(MOV(dst_null_f(), op[0]));
865 inst->conditional_mod = BRW_CONDITIONAL_NZ;
866 }
867 break;
868
869 case ir_unop_i2b:
870 if (devinfo->gen >= 6) {
871 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
872 } else {
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875 }
876 break;
877
878 case ir_binop_all_equal:
879 if (devinfo->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 resolve_bool_comparison(expr->operands[1], &op[1]);
882 }
883 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
884 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
885 break;
886
887 case ir_binop_any_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_unop_any:
897 if (devinfo->gen <= 5) {
898 resolve_bool_comparison(expr->operands[0], &op[0]);
899 }
900 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
901 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
902 break;
903
904 case ir_binop_greater:
905 case ir_binop_gequal:
906 case ir_binop_less:
907 case ir_binop_lequal:
908 case ir_binop_equal:
909 case ir_binop_nequal:
910 if (devinfo->gen <= 5) {
911 resolve_bool_comparison(expr->operands[0], &op[0]);
912 resolve_bool_comparison(expr->operands[1], &op[1]);
913 }
914 emit(CMP(dst_null_d(), op[0], op[1],
915 brw_conditional_for_comparison(expr->operation)));
916 break;
917
918 case ir_triop_csel: {
919 /* Expand the boolean condition into the flag register. */
920 inst = emit(MOV(dst_null_d(), op[0]));
921 inst->conditional_mod = BRW_CONDITIONAL_NZ;
922
923 /* Select which boolean to return. */
924 dst_reg temp(this, expr->operands[1]->type);
925 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
926 inst->predicate = BRW_PREDICATE_NORMAL;
927
928 /* Expand the result to a condition code. */
929 inst = emit(MOV(dst_null_d(), src_reg(temp)));
930 inst->conditional_mod = BRW_CONDITIONAL_NZ;
931 break;
932 }
933
934 default:
935 unreachable("not reached");
936 }
937 return;
938 }
939
940 ir->accept(this);
941
942 resolve_ud_negate(&this->result);
943
944 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
945 inst->conditional_mod = BRW_CONDITIONAL_NZ;
946 }
947
948 /**
949 * Emit a gen6 IF statement with the comparison folded into the IF
950 * instruction.
951 */
952 void
953 vec4_visitor::emit_if_gen6(ir_if *ir)
954 {
955 ir_expression *expr = ir->condition->as_expression();
956
957 if (expr && expr->operation != ir_binop_ubo_load) {
958 src_reg op[3];
959 dst_reg temp;
960
961 assert(expr->get_num_operands() <= 3);
962 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
963 expr->operands[i]->accept(this);
964 op[i] = this->result;
965 }
966
967 switch (expr->operation) {
968 case ir_unop_logic_not:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
970 return;
971
972 case ir_binop_logic_xor:
973 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_or:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(OR(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_binop_logic_and:
983 temp = dst_reg(this, glsl_type::bool_type);
984 emit(AND(temp, op[0], op[1]));
985 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_f2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_unop_i2b:
993 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
994 return;
995
996 case ir_binop_greater:
997 case ir_binop_gequal:
998 case ir_binop_less:
999 case ir_binop_lequal:
1000 case ir_binop_equal:
1001 case ir_binop_nequal:
1002 emit(IF(op[0], op[1],
1003 brw_conditional_for_comparison(expr->operation)));
1004 return;
1005
1006 case ir_binop_all_equal:
1007 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1008 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1009 return;
1010
1011 case ir_binop_any_nequal:
1012 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1013 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1014 return;
1015
1016 case ir_unop_any:
1017 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1018 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1019 return;
1020
1021 case ir_triop_csel: {
1022 /* Expand the boolean condition into the flag register. */
1023 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1024 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1025
1026 /* Select which boolean to return. */
1027 dst_reg temp(this, expr->operands[1]->type);
1028 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1029 inst->predicate = BRW_PREDICATE_NORMAL;
1030
1031 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1032 return;
1033 }
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038 return;
1039 }
1040
1041 ir->condition->accept(this);
1042
1043 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1044 }
1045
1046 void
1047 vec4_visitor::visit(ir_variable *ir)
1048 {
1049 dst_reg *reg = NULL;
1050
1051 if (variable_storage(ir))
1052 return;
1053
1054 switch (ir->data.mode) {
1055 case ir_var_shader_in:
1056 assert(ir->data.location != -1);
1057 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1058 break;
1059
1060 case ir_var_shader_out:
1061 assert(ir->data.location != -1);
1062 reg = new(mem_ctx) dst_reg(this, ir->type);
1063
1064 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1065 output_reg[ir->data.location + i] = *reg;
1066 output_reg[ir->data.location + i].reg_offset = i;
1067 output_reg_annotation[ir->data.location + i] = ir->name;
1068 }
1069 break;
1070
1071 case ir_var_auto:
1072 case ir_var_temporary:
1073 reg = new(mem_ctx) dst_reg(this, ir->type);
1074 break;
1075
1076 case ir_var_uniform:
1077 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1078
1079 /* Thanks to the lower_ubo_reference pass, we will see only
1080 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1081 * variables, so no need for them to be in variable_ht.
1082 *
1083 * Some uniforms, such as samplers and atomic counters, have no actual
1084 * storage, so we should ignore them.
1085 */
1086 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1087 return;
1088
1089 /* Track how big the whole uniform variable is, in case we need to put a
1090 * copy of its data into pull constants for array access.
1091 */
1092 assert(this->uniforms < uniform_array_size);
1093 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1094
1095 if (!strncmp(ir->name, "gl_", 3)) {
1096 setup_builtin_uniform_values(ir);
1097 } else {
1098 setup_uniform_values(ir);
1099 }
1100 break;
1101
1102 case ir_var_system_value:
1103 reg = make_reg_for_system_value(ir->data.location, ir->type);
1104 break;
1105
1106 default:
1107 unreachable("not reached");
1108 }
1109
1110 reg->type = brw_type_for_base_type(ir->type);
1111 hash_table_insert(this->variable_ht, reg, ir);
1112 }
1113
1114 void
1115 vec4_visitor::visit(ir_loop *ir)
1116 {
1117 /* We don't want debugging output to print the whole body of the
1118 * loop as the annotation.
1119 */
1120 this->base_ir = NULL;
1121
1122 emit(BRW_OPCODE_DO);
1123
1124 visit_instructions(&ir->body_instructions);
1125
1126 emit(BRW_OPCODE_WHILE);
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_loop_jump *ir)
1131 {
1132 switch (ir->mode) {
1133 case ir_loop_jump::jump_break:
1134 emit(BRW_OPCODE_BREAK);
1135 break;
1136 case ir_loop_jump::jump_continue:
1137 emit(BRW_OPCODE_CONTINUE);
1138 break;
1139 }
1140 }
1141
1142
1143 void
1144 vec4_visitor::visit(ir_function_signature *)
1145 {
1146 unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::visit(ir_function *ir)
1151 {
1152 /* Ignore function bodies other than main() -- we shouldn't see calls to
1153 * them since they should all be inlined.
1154 */
1155 if (strcmp(ir->name, "main") == 0) {
1156 const ir_function_signature *sig;
1157 exec_list empty;
1158
1159 sig = ir->matching_signature(NULL, &empty, false);
1160
1161 assert(sig);
1162
1163 visit_instructions(&sig->body);
1164 }
1165 }
1166
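/* Try to fold an ir_binop_add of a multiply into a single MAD.  Either add
 * operand may be the multiply, optionally wrapped in a negate or abs, which
 * get folded into the MAD's source modifiers.  Returns false without
 * emitting anything if the pattern doesn't apply.
 */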
1167 bool
1168 vec4_visitor::try_emit_mad(ir_expression *ir)
1169 {
1170 /* 3-src instructions were introduced in gen6. */
1171 if (devinfo->gen < 6)
1172 return false;
1173
1174 /* MAD can only handle floating-point data. */
1175 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1176 return false;
1177
1178 ir_rvalue *nonmul;
1179 ir_expression *mul;
1180 bool mul_negate, mul_abs;
1181
1182 for (int i = 0; i < 2; i++) {
1183 mul_negate = false;
1184 mul_abs = false;
1185
1186 mul = ir->operands[i]->as_expression();
1187 nonmul = ir->operands[1 - i];
1188
1189 if (mul && mul->operation == ir_unop_abs) {
1190 mul = mul->operands[0]->as_expression();
1191 mul_abs = true;
1192 } else if (mul && mul->operation == ir_unop_neg) {
1193 mul = mul->operands[0]->as_expression();
1194 mul_negate = true;
1195 }
1196
1197 if (mul && mul->operation == ir_binop_mul)
1198 break;
1199 }
1200
1201 if (!mul || mul->operation != ir_binop_mul)
1202 return false;
1203
1204 nonmul->accept(this);
1205 src_reg src0 = fix_3src_operand(this->result);
1206
1207 mul->operands[0]->accept(this);
1208 src_reg src1 = fix_3src_operand(this->result);
1209 src1.negate ^= mul_negate;
1210 src1.abs = mul_abs;
1211 if (mul_abs)
1212 src1.negate = false;
1213
1214 mul->operands[1]->accept(this);
1215 src_reg src2 = fix_3src_operand(this->result);
1216 src2.abs = mul_abs;
1217 if (mul_abs)
1218 src2.negate = false;
1219
1220 this->result = src_reg(this, ir->type);
1221 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1222
1223 return true;
1224 }
1225
1226 bool
1227 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1228 {
1229 /* This optimization relies on CMP setting the destination to 0 when
1230 * false. Early hardware only sets the least significant bit, and
1231 * leaves the other bits undefined. So we can't use it.
1232 */
1233 if (devinfo->gen < 6)
1234 return false;
1235
1236 ir_expression *const cmp = ir->operands[0]->as_expression();
1237
1238 if (cmp == NULL)
1239 return false;
1240
1241 switch (cmp->operation) {
1242 case ir_binop_less:
1243 case ir_binop_greater:
1244 case ir_binop_lequal:
1245 case ir_binop_gequal:
1246 case ir_binop_equal:
1247 case ir_binop_nequal:
1248 break;
1249
1250 default:
1251 return false;
1252 }
1253
1254 cmp->operands[0]->accept(this);
1255 const src_reg cmp_src0 = this->result;
1256
1257 cmp->operands[1]->accept(this);
1258 const src_reg cmp_src1 = this->result;
1259
1260 this->result = src_reg(this, ir->type);
1261
1262 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1263 brw_conditional_for_comparison(cmp->operation)));
1264
1265 /* If the comparison is false, this->result will just happen to be zero.
1266 */
1267 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1268 this->result, src_reg(1.0f));
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270 inst->predicate_inverse = true;
1271
1272 return true;
1273 }
1274
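/* Emit a MIN/MAX-style select: gen6+ can use a single SEL with a conditional
 * mod, while earlier hardware needs an explicit CMP followed by a predicated
 * SEL.
 */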
1275 vec4_instruction *
1276 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1277 src_reg src0, src_reg src1)
1278 {
1279 vec4_instruction *inst;
1280
1281 if (devinfo->gen >= 6) {
1282 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1283 inst->conditional_mod = conditionalmod;
1284 } else {
1285 emit(CMP(dst, src0, src1, conditionalmod));
1286
1287 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1288 inst->predicate = BRW_PREDICATE_NORMAL;
1289 }
1290
1291 return inst;
1292 }
1293
1294 vec4_instruction *
1295 vec4_visitor::emit_lrp(const dst_reg &dst,
1296 const src_reg &x, const src_reg &y, const src_reg &a)
1297 {
1298 if (devinfo->gen >= 6) {
1299 /* Note that the instruction's argument order is reversed from GLSL
1300 * and the IR.
1301 */
1302 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1303 fix_3src_operand(x)));
1304 } else {
1305 /* Earlier generations don't support three source operations, so we
1306 * need to emit x*(1-a) + y*a.
1307 */
1308 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1309 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1310 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1311 y_times_a.writemask = dst.writemask;
1312 one_minus_a.writemask = dst.writemask;
1313 x_times_one_minus_a.writemask = dst.writemask;
1314
1315 emit(MUL(y_times_a, y, a));
1316 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1317 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1318 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1319 }
1320 }
1321
1322 /**
1323 * Emits the instructions needed to perform a pull constant load. before_block
1324 * and before_inst can be NULL, in which case the instructions will be appended
1325 * to the end of the instruction list.
1326 */
1327 void
1328 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1329 src_reg surf_index,
1330 src_reg offset_reg,
1331 bblock_t *before_block,
1332 vec4_instruction *before_inst)
1333 {
1334 assert((before_inst == NULL && before_block == NULL) ||
1335 (before_inst && before_block));
1336
1337 vec4_instruction *pull;
1338
1339 if (devinfo->gen >= 9) {
1340 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1341 src_reg header(this, glsl_type::uvec4_type, 2);
1342
1343 pull = new(mem_ctx)
1344 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1345 dst_reg(header));
1346
1347 if (before_inst)
1348 emit_before(before_block, before_inst, pull);
1349 else
1350 emit(pull);
1351
1352 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1353 offset_reg.type);
1354 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1355
1356 if (before_inst)
1357 emit_before(before_block, before_inst, pull);
1358 else
1359 emit(pull);
1360
1361 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1362 dst,
1363 surf_index,
1364 header);
1365 pull->mlen = 2;
1366 pull->header_size = 1;
1367 } else if (devinfo->gen >= 7) {
1368 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1369
1370 grf_offset.type = offset_reg.type;
1371
1372 pull = MOV(grf_offset, offset_reg);
1373
1374 if (before_inst)
1375 emit_before(before_block, before_inst, pull);
1376 else
1377 emit(pull);
1378
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1380 dst,
1381 surf_index,
1382 src_reg(grf_offset));
1383 pull->mlen = 1;
1384 } else {
1385 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1386 dst,
1387 surf_index,
1388 offset_reg);
1389 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1390 pull->mlen = 1;
1391 }
1392
1393 if (before_inst)
1394 emit_before(before_block, before_inst, pull);
1395 else
1396 emit(pull);
1397 }
1398
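/* Copy the value of the first live channel of src into every channel of the
 * result so it can be used where a single uniform value is required, such as
 * the dynamically computed surface index in the UBO load path below.  Both
 * instructions use force_writemask_all since they must execute regardless of
 * the current channel enables.
 */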
1399 src_reg
1400 vec4_visitor::emit_uniformize(const src_reg &src)
1401 {
1402 const src_reg chan_index(this, glsl_type::uint_type);
1403 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1404 src.type);
1405
1406 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1407 ->force_writemask_all = true;
1408 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1409 ->force_writemask_all = true;
1410
1411 return src_reg(dst);
1412 }
1413
1414 void
1415 vec4_visitor::visit(ir_expression *ir)
1416 {
1417 unsigned int operand;
1418 src_reg op[ARRAY_SIZE(ir->operands)];
1419 vec4_instruction *inst;
1420
1421 if (ir->operation == ir_binop_add) {
1422 if (try_emit_mad(ir))
1423 return;
1424 }
1425
1426 if (ir->operation == ir_unop_b2f) {
1427 if (try_emit_b2f_of_compare(ir))
1428 return;
1429 }
1430
1431 /* Storage for our result. Ideally for an assignment we'd be using
1432 * the actual storage for the result here, instead.
1433 */
1434 dst_reg result_dst(this, ir->type);
1435 src_reg result_src(result_dst);
1436
1437 if (ir->operation == ir_triop_csel) {
1438 ir->operands[1]->accept(this);
1439 op[1] = this->result;
1440 ir->operands[2]->accept(this);
1441 op[2] = this->result;
1442
1443 enum brw_predicate predicate;
1444 emit_bool_to_cond_code(ir->operands[0], &predicate);
1445 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1446 inst->predicate = predicate;
1447 this->result = result_src;
1448 return;
1449 }
1450
1451 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1452 this->result.file = BAD_FILE;
1453 ir->operands[operand]->accept(this);
1454 if (this->result.file == BAD_FILE) {
1455 fprintf(stderr, "Failed to get tree for expression operand:\n");
1456 ir->operands[operand]->fprint(stderr);
1457 exit(1);
1458 }
1459 op[operand] = this->result;
1460
1461 /* Matrix expression operands should have been broken down to vector
1462 * operations already.
1463 */
1464 assert(!ir->operands[operand]->type->is_matrix());
1465 }
1466
1467 /* If nothing special happens, this is the result. */
1468 this->result = result_src;
1469
1470 switch (ir->operation) {
1471 case ir_unop_logic_not:
1472 emit(NOT(result_dst, op[0]));
1473 break;
1474 case ir_unop_neg:
1475 op[0].negate = !op[0].negate;
1476 emit(MOV(result_dst, op[0]));
1477 break;
1478 case ir_unop_abs:
1479 op[0].abs = true;
1480 op[0].negate = false;
1481 emit(MOV(result_dst, op[0]));
1482 break;
1483
1484 case ir_unop_sign:
1485 if (ir->type->is_float()) {
1486 /* AND(val, 0x80000000) gives the sign bit.
1487 *
1488 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1489 * zero.
1490 */
1491 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1492
1493 op[0].type = BRW_REGISTER_TYPE_UD;
1494 result_dst.type = BRW_REGISTER_TYPE_UD;
1495 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1496
1497 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1498 inst->predicate = BRW_PREDICATE_NORMAL;
1499
1500 this->result.type = BRW_REGISTER_TYPE_F;
1501 } else {
1502 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1503 * -> non-negative val generates 0x00000000.
1504 * Predicated OR sets 1 if val is positive.
1505 */
1506 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1507
1508 emit(ASR(result_dst, op[0], src_reg(31)));
1509
1510 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1511 inst->predicate = BRW_PREDICATE_NORMAL;
1512 }
1513 break;
1514
1515 case ir_unop_rcp:
1516 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1517 break;
1518
1519 case ir_unop_exp2:
1520 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1521 break;
1522 case ir_unop_log2:
1523 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1524 break;
1525 case ir_unop_exp:
1526 case ir_unop_log:
1527 unreachable("not reached: should be handled by ir_explog_to_explog2");
1528 case ir_unop_sin:
1529 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1530 break;
1531 case ir_unop_cos:
1532 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1533 break;
1534
1535 case ir_unop_dFdx:
1536 case ir_unop_dFdx_coarse:
1537 case ir_unop_dFdx_fine:
1538 case ir_unop_dFdy:
1539 case ir_unop_dFdy_coarse:
1540 case ir_unop_dFdy_fine:
1541 unreachable("derivatives not valid in vertex shader");
1542
1543 case ir_unop_bitfield_reverse:
1544 emit(BFREV(result_dst, op[0]));
1545 break;
1546 case ir_unop_bit_count:
1547 emit(CBIT(result_dst, op[0]));
1548 break;
1549 case ir_unop_find_msb: {
1550 src_reg temp = src_reg(this, glsl_type::uint_type);
1551
1552 inst = emit(FBH(dst_reg(temp), op[0]));
1553 inst->dst.writemask = WRITEMASK_XYZW;
1554
1555 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1556 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1557 * subtract the result from 31 to convert the MSB count into an LSB count.
1558 */
1559
1560 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1561 temp.swizzle = BRW_SWIZZLE_NOOP;
1562 emit(MOV(result_dst, temp));
1563
1564 src_reg src_tmp = src_reg(result_dst);
1565 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1566
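/* Compute 31 - count using a negated source, predicated so that channels
 * where FBH returned -1 (no bits set) keep that -1, which is what
 * findMSB() is specified to return there.
 */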
1567 src_tmp.negate = true;
1568 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1569 inst->predicate = BRW_PREDICATE_NORMAL;
1570 break;
1571 }
1572 case ir_unop_find_lsb:
1573 emit(FBL(result_dst, op[0]));
1574 break;
1575 case ir_unop_saturate:
1576 inst = emit(MOV(result_dst, op[0]));
1577 inst->saturate = true;
1578 break;
1579
1580 case ir_unop_noise:
1581 unreachable("not reached: should be handled by lower_noise");
1582
1583 case ir_unop_subroutine_to_int:
1584 emit(MOV(result_dst, op[0]));
1585 break;
1586
1587 case ir_binop_add:
1588 emit(ADD(result_dst, op[0], op[1]));
1589 break;
1590 case ir_binop_sub:
1591 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1592
1593 case ir_binop_mul:
1594 if (devinfo->gen < 8 && ir->type->is_integer()) {
1595 /* For integer multiplication, the MUL uses the low 16 bits of one of
1596 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1597 * accumulates the contribution of the upper 16 bits of that
1598 * operand. If we can determine that one of the args is in the low
1599 * 16 bits, though, we can just emit a single MUL.
1600 */
1601 if (ir->operands[0]->is_uint16_constant()) {
1602 if (devinfo->gen < 7)
1603 emit(MUL(result_dst, op[0], op[1]));
1604 else
1605 emit(MUL(result_dst, op[1], op[0]));
1606 } else if (ir->operands[1]->is_uint16_constant()) {
1607 if (devinfo->gen < 7)
1608 emit(MUL(result_dst, op[1], op[0]));
1609 else
1610 emit(MUL(result_dst, op[0], op[1]));
1611 } else {
1612 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1613
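/* The MUL writes a partial product to the accumulator and the MACH folds in
 * the contribution of the high 16 bits; afterwards the accumulator holds the
 * full low 32 bits of the product, which we copy to the destination.  The
 * MACH's own result (the high 32 bits) is discarded via dst_null_d().
 */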
1614 emit(MUL(acc, op[0], op[1]));
1615 emit(MACH(dst_null_d(), op[0], op[1]));
1616 emit(MOV(result_dst, src_reg(acc)));
1617 }
1618 } else {
1619 emit(MUL(result_dst, op[0], op[1]));
1620 }
1621 break;
1622 case ir_binop_imul_high: {
1623 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1624
1625 emit(MUL(acc, op[0], op[1]));
1626 emit(MACH(result_dst, op[0], op[1]));
1627 break;
1628 }
1629 case ir_binop_div:
1630 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1631 assert(ir->type->is_integer());
1632 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1633 break;
1634
1635 case ir_binop_carry:
1636 unreachable("Should have been lowered by carry_to_arith().");
1637
1638 case ir_binop_borrow:
1639 unreachable("Should have been lowered by borrow_to_arith().");
1640
1641 case ir_binop_mod:
1642 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1643 assert(ir->type->is_integer());
1644 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1645 break;
1646
1647 case ir_binop_less:
1648 case ir_binop_greater:
1649 case ir_binop_lequal:
1650 case ir_binop_gequal:
1651 case ir_binop_equal:
1652 case ir_binop_nequal: {
1653 if (devinfo->gen <= 5) {
1654 resolve_bool_comparison(ir->operands[0], &op[0]);
1655 resolve_bool_comparison(ir->operands[1], &op[1]);
1656 }
1657 emit(CMP(result_dst, op[0], op[1],
1658 brw_conditional_for_comparison(ir->operation)));
1659 break;
1660 }
1661
1662 case ir_binop_all_equal:
1663 if (devinfo->gen <= 5) {
1664 resolve_bool_comparison(ir->operands[0], &op[0]);
1665 resolve_bool_comparison(ir->operands[1], &op[1]);
1666 }
1667
1668 /* "==" operator producing a scalar boolean. */
1669 if (ir->operands[0]->type->is_vector() ||
1670 ir->operands[1]->type->is_vector()) {
1671 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1672 emit(MOV(result_dst, src_reg(0)));
1673 inst = emit(MOV(result_dst, src_reg(~0)));
1674 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1675 } else {
1676 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1677 }
1678 break;
1679 case ir_binop_any_nequal:
1680 if (devinfo->gen <= 5) {
1681 resolve_bool_comparison(ir->operands[0], &op[0]);
1682 resolve_bool_comparison(ir->operands[1], &op[1]);
1683 }
1684
1685 /* "!=" operator producing a scalar boolean. */
1686 if (ir->operands[0]->type->is_vector() ||
1687 ir->operands[1]->type->is_vector()) {
1688 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1689
1690 emit(MOV(result_dst, src_reg(0)));
1691 inst = emit(MOV(result_dst, src_reg(~0)));
1692 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1693 } else {
1694 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1695 }
1696 break;
1697
1698 case ir_unop_any:
1699 if (devinfo->gen <= 5) {
1700 resolve_bool_comparison(ir->operands[0], &op[0]);
1701 }
1702 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1703 emit(MOV(result_dst, src_reg(0)));
1704
1705 inst = emit(MOV(result_dst, src_reg(~0)));
1706 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1707 break;
1708
1709 case ir_binop_logic_xor:
1710 emit(XOR(result_dst, op[0], op[1]));
1711 break;
1712
1713 case ir_binop_logic_or:
1714 emit(OR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_logic_and:
1718 emit(AND(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_dot:
1722 assert(ir->operands[0]->type->is_vector());
1723 assert(ir->operands[0]->type == ir->operands[1]->type);
1724 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1725 break;
1726
1727 case ir_unop_sqrt:
1728 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1729 break;
1730 case ir_unop_rsq:
1731 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1732 break;
1733
1734 case ir_unop_bitcast_i2f:
1735 case ir_unop_bitcast_u2f:
1736 this->result = op[0];
1737 this->result.type = BRW_REGISTER_TYPE_F;
1738 break;
1739
1740 case ir_unop_bitcast_f2i:
1741 this->result = op[0];
1742 this->result.type = BRW_REGISTER_TYPE_D;
1743 break;
1744
1745 case ir_unop_bitcast_f2u:
1746 this->result = op[0];
1747 this->result.type = BRW_REGISTER_TYPE_UD;
1748 break;
1749
1750 case ir_unop_i2f:
1751 case ir_unop_i2u:
1752 case ir_unop_u2i:
1753 case ir_unop_u2f:
1754 case ir_unop_f2i:
1755 case ir_unop_f2u:
1756 emit(MOV(result_dst, op[0]));
1757 break;
1758 case ir_unop_b2i:
1759 case ir_unop_b2f:
1760 if (devinfo->gen <= 5) {
1761 resolve_bool_comparison(ir->operands[0], &op[0]);
1762 }
1763 emit(MOV(result_dst, negate(op[0])));
1764 break;
1765 case ir_unop_f2b:
1766 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1767 break;
1768 case ir_unop_i2b:
1769 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1770 break;
1771
1772 case ir_unop_trunc:
1773 emit(RNDZ(result_dst, op[0]));
1774 break;
1775 case ir_unop_ceil: {
1776 src_reg tmp = src_reg(this, ir->type);
1777 op[0].negate = !op[0].negate;
1778 emit(RNDD(dst_reg(tmp), op[0]));
1779 tmp.negate = true;
1780 emit(MOV(result_dst, tmp));
1781 }
1782 break;
1783 case ir_unop_floor:
1784 inst = emit(RNDD(result_dst, op[0]));
1785 break;
1786 case ir_unop_fract:
1787 inst = emit(FRC(result_dst, op[0]));
1788 break;
1789 case ir_unop_round_even:
1790 emit(RNDE(result_dst, op[0]));
1791 break;
1792
1793 case ir_binop_min:
1794 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1795 break;
1796 case ir_binop_max:
1797 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1798 break;
1799
1800 case ir_binop_pow:
1801 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1802 break;
1803
1804 case ir_unop_bit_not:
1805 inst = emit(NOT(result_dst, op[0]));
1806 break;
1807 case ir_binop_bit_and:
1808 inst = emit(AND(result_dst, op[0], op[1]));
1809 break;
1810 case ir_binop_bit_xor:
1811 inst = emit(XOR(result_dst, op[0], op[1]));
1812 break;
1813 case ir_binop_bit_or:
1814 inst = emit(OR(result_dst, op[0], op[1]));
1815 break;
1816
1817 case ir_binop_lshift:
1818 inst = emit(SHL(result_dst, op[0], op[1]));
1819 break;
1820
1821 case ir_binop_rshift:
1822 if (ir->type->base_type == GLSL_TYPE_INT)
1823 inst = emit(ASR(result_dst, op[0], op[1]));
1824 else
1825 inst = emit(SHR(result_dst, op[0], op[1]));
1826 break;
1827
1828 case ir_binop_bfm:
1829 emit(BFI1(result_dst, op[0], op[1]));
1830 break;
1831
1832 case ir_binop_ubo_load: {
1833 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1834 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1835 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1836 src_reg offset;
1837
1838 /* Now, load the vector from that offset. */
1839 assert(ir->type->is_vector() || ir->type->is_scalar());
1840
1841 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1842 packed_consts.type = result.type;
1843 src_reg surf_index;
1844
1845 if (const_uniform_block) {
1846 /* The block index is a constant, so just emit the binding table entry
1847 * as an immediate.
1848 */
1849 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1850 const_uniform_block->value.u[0]);
1851 } else {
1852 /* The block index is not a constant. Evaluate the index expression
1853 * per-channel and add the base UBO index; we have to select a value
1854 * from any live channel.
1855 */
1856 surf_index = src_reg(this, glsl_type::uint_type);
1857 emit(ADD(dst_reg(surf_index), op[0],
1858 src_reg(prog_data->base.binding_table.ubo_start)));
1859 surf_index = emit_uniformize(surf_index);
1860
1861 /* Assume this may touch any UBO. It would be nice to provide
1862 * a tighter bound, but the array information is already lowered away.
1863 */
1864 brw_mark_surface_used(&prog_data->base,
1865 prog_data->base.binding_table.ubo_start +
1866 shader_prog->NumUniformBlocks - 1);
1867 }
1868
1869 if (const_offset_ir) {
1870 if (devinfo->gen >= 8) {
1871 /* Store the offset in a GRF so we can send-from-GRF. */
1872 offset = src_reg(this, glsl_type::int_type);
1873 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1874 } else {
1875 /* Immediates are fine on older generations since they'll be moved
1876 * to a (potentially fake) MRF at the generator level.
1877 */
1878 offset = src_reg(const_offset / 16);
1879 }
1880 } else {
1881 offset = src_reg(this, glsl_type::uint_type);
1882 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1883 }
1884
1885 emit_pull_constant_load_reg(dst_reg(packed_consts),
1886 surf_index,
1887 offset,
1888 NULL, NULL /* before_block/inst */);
1889
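/* Select the right dwords out of the fetched vec4: brw_swizzle_for_size()
 * picks the first N components, and adding the dword offset of the data
 * within the 16-byte block shifts each of them, so e.g. a vec2 at byte
 * offset 8 ends up reading .zwww.
 */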
1890 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1891 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1892 const_offset % 16 / 4,
1893 const_offset % 16 / 4,
1894 const_offset % 16 / 4);
1895
1896 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1897 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1898 emit(CMP(result_dst, packed_consts, src_reg(0u),
1899 BRW_CONDITIONAL_NZ));
1900 } else {
1901 emit(MOV(result_dst, packed_consts));
1902 }
1903 break;
1904 }
1905
1906 case ir_binop_vector_extract:
1907 unreachable("should have been lowered by vec_index_to_cond_assign");
1908
1909 case ir_triop_fma:
1910 op[0] = fix_3src_operand(op[0]);
1911 op[1] = fix_3src_operand(op[1]);
1912 op[2] = fix_3src_operand(op[2]);
1913 /* Note that the instruction's argument order is reversed from GLSL
1914 * and the IR.
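* For example, fma(a, b, c) == a * b + c becomes MAD dst, c, b, a, since the
* hardware MAD computes src0 + src1 * src2.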
1915 */
1916 emit(MAD(result_dst, op[2], op[1], op[0]));
1917 break;
1918
1919 case ir_triop_lrp:
1920 emit_lrp(result_dst, op[0], op[1], op[2]);
1921 break;
1922
1923 case ir_triop_csel:
1924 unreachable("already handled above");
1925 break;
1926
1927 case ir_triop_bfi:
1928 op[0] = fix_3src_operand(op[0]);
1929 op[1] = fix_3src_operand(op[1]);
1930 op[2] = fix_3src_operand(op[2]);
1931 emit(BFI2(result_dst, op[0], op[1], op[2]));
1932 break;
1933
1934 case ir_triop_bitfield_extract:
1935 op[0] = fix_3src_operand(op[0]);
1936 op[1] = fix_3src_operand(op[1]);
1937 op[2] = fix_3src_operand(op[2]);
1938 /* Note that the instruction's argument order is reversed from GLSL
1939 * and the IR.
1940 */
1941 emit(BFE(result_dst, op[2], op[1], op[0]));
1942 break;
1943
1944 case ir_triop_vector_insert:
1945 unreachable("should have been lowered by lower_vector_insert");
1946
1947 case ir_quadop_bitfield_insert:
1948 unreachable("not reached: should be handled by "
1949 "bitfield_insert_to_bfm_bfi\n");
1950
1951 case ir_quadop_vector:
1952 unreachable("not reached: should be handled by lower_quadop_vector");
1953
1954 case ir_unop_pack_half_2x16:
1955 emit_pack_half_2x16(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_half_2x16:
1958 emit_unpack_half_2x16(result_dst, op[0]);
1959 break;
1960 case ir_unop_unpack_unorm_4x8:
1961 emit_unpack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_unpack_snorm_4x8:
1964 emit_unpack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_unorm_4x8:
1967 emit_pack_unorm_4x8(result_dst, op[0]);
1968 break;
1969 case ir_unop_pack_snorm_4x8:
1970 emit_pack_snorm_4x8(result_dst, op[0]);
1971 break;
1972 case ir_unop_pack_snorm_2x16:
1973 case ir_unop_pack_unorm_2x16:
1974 case ir_unop_unpack_snorm_2x16:
1975 case ir_unop_unpack_unorm_2x16:
1976 unreachable("not reached: should be handled by lower_packing_builtins");
1977 case ir_unop_unpack_half_2x16_split_x:
1978 case ir_unop_unpack_half_2x16_split_y:
1979 case ir_binop_pack_half_2x16_split:
1980 case ir_unop_interpolate_at_centroid:
1981 case ir_binop_interpolate_at_sample:
1982 case ir_binop_interpolate_at_offset:
1983 unreachable("not reached: should not occur in vertex shader");
1984 case ir_binop_ldexp:
1985 unreachable("not reached: should be handled by ldexp_to_arith()");
1986 case ir_unop_d2f:
1987 case ir_unop_f2d:
1988 case ir_unop_d2i:
1989 case ir_unop_i2d:
1990 case ir_unop_d2u:
1991 case ir_unop_u2d:
1992 case ir_unop_d2b:
1993 case ir_unop_pack_double_2x32:
1994 case ir_unop_unpack_double_2x32:
1995 case ir_unop_frexp_sig:
1996 case ir_unop_frexp_exp:
1997 unreachable("fp64 todo");
1998 }
1999 }
2000
2001
2002 void
2003 vec4_visitor::visit(ir_swizzle *ir)
2004 {
2005 /* Note that this is only swizzles in expressions, not those on the left
2006 * hand side of an assignment, which do write masking. See ir_assignment
2007 * for that.
2008 */
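/* Compose the IR swizzle with a size-based swizzle so that channels beyond
 * the result's width replicate the last valid component; e.g. v.zy on a vec4
 * produces a ZYYY swizzle.
 */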
2009 const unsigned swz = brw_compose_swizzle(
2010 brw_swizzle_for_size(ir->type->vector_elements),
2011 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2012
2013 ir->val->accept(this);
2014 this->result = swizzle(this->result, swz);
2015 }
2016
2017 void
2018 vec4_visitor::visit(ir_dereference_variable *ir)
2019 {
2020 const struct glsl_type *type = ir->type;
2021 dst_reg *reg = variable_storage(ir->var);
2022
2023 if (!reg) {
2024 fail("Failed to find variable storage for %s\n", ir->var->name);
2025 this->result = src_reg(brw_null_reg());
2026 return;
2027 }
2028
2029 this->result = src_reg(*reg);
2030
2031 /* System values get their swizzle from the dst_reg writemask */
2032 if (ir->var->data.mode == ir_var_system_value)
2033 return;
2034
2035 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2036 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2037 }
2038
2039
2040 int
2041 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2042 {
2043 /* Under normal circumstances array elements are stored consecutively, so
2044 * the stride is equal to the size of the array element.
2045 */
2046 return type_size_vec4(ir->type);
2047 }
2048
2049
2050 void
2051 vec4_visitor::visit(ir_dereference_array *ir)
2052 {
2053 ir_constant *constant_index;
2054 src_reg src;
2055 int array_stride = compute_array_stride(ir);
2056
2057 constant_index = ir->array_index->constant_expression_value();
2058
2059 ir->array->accept(this);
2060 src = this->result;
2061
2062 if (constant_index) {
2063 src.reg_offset += constant_index->value.i[0] * array_stride;
2064 } else {
2065 /* Variable index array dereference. The access takes the "vec4" at the
2066 * base of the array plus a dynamically computed index that offsets the
2067 * Mesa register index.
2068 */
2069 ir->array_index->accept(this);
2070
2071 src_reg index_reg;
2072
2073 if (array_stride == 1) {
2074 index_reg = this->result;
2075 } else {
2076 index_reg = src_reg(this, glsl_type::int_type);
2077
2078 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2079 }
2080
2081 if (src.reladdr) {
2082 src_reg temp = src_reg(this, glsl_type::int_type);
2083
2084 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2085
2086 index_reg = temp;
2087 }
2088
2089 src.reladdr = ralloc(mem_ctx, src_reg);
2090 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2091 }
2092
2093 /* If the type is smaller than a vec4, replicate the last channel out. */
2094 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2095 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2096 else
2097 src.swizzle = BRW_SWIZZLE_NOOP;
2098 src.type = brw_type_for_base_type(ir->type);
2099
2100 this->result = src;
2101 }
2102
2103 void
2104 vec4_visitor::visit(ir_dereference_record *ir)
2105 {
2106 unsigned int i;
2107 const glsl_type *struct_type = ir->record->type;
2108 int offset = 0;
2109
2110 ir->record->accept(this);
2111
2112 for (i = 0; i < struct_type->length; i++) {
2113 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2114 break;
2115 offset += type_size_vec4(struct_type->fields.structure[i].type);
2116 }
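/* At this point offset is the field's location in vec4 slots; e.g. for
 * struct { vec4 a; mat3 b; float c; }, a reference to .c lands at
 * offset 1 + 3 = 4.
 */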
2117
2118 /* If the type is smaller than a vec4, replicate the last channel out. */
2119 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2120 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2121 else
2122 this->result.swizzle = BRW_SWIZZLE_NOOP;
2123 this->result.type = brw_type_for_base_type(ir->type);
2124
2125 this->result.reg_offset += offset;
2126 }
2127
2128 /**
2129 * We want to be careful in assignment setup to hit the actual storage
2130 * instead of potentially using a temporary like we might with the
2131 * ir_dereference handler.
2132 */
2133 static dst_reg
2134 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2135 {
2136 /* The LHS must be a dereference. If the LHS is a variable indexed array
2137 * access of a vector, it must be separated into a series of conditional moves
2138 * before reaching this point (see ir_vec_index_to_cond_assign).
2139 */
2140 assert(ir->as_dereference());
2141 ir_dereference_array *deref_array = ir->as_dereference_array();
2142 if (deref_array) {
2143 assert(!deref_array->array->type->is_vector());
2144 }
2145
2146 /* Use the rvalue deref handler for the most part. We'll ignore
2147 * swizzles in it and write swizzles using writemask, though.
2148 */
2149 ir->accept(v);
2150 return dst_reg(v->result);
2151 }
2152
2153 void
2154 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2155 const struct glsl_type *type,
2156 enum brw_predicate predicate)
2157 {
2158 if (type->base_type == GLSL_TYPE_STRUCT) {
2159 for (unsigned int i = 0; i < type->length; i++) {
2160 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2161 }
2162 return;
2163 }
2164
2165 if (type->is_array()) {
2166 for (unsigned int i = 0; i < type->length; i++) {
2167 emit_block_move(dst, src, type->fields.array, predicate);
2168 }
2169 return;
2170 }
2171
2172 if (type->is_matrix()) {
2173 const struct glsl_type *vec_type;
2174
2175 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2176 type->vector_elements, 1);
2177
2178 for (int i = 0; i < type->matrix_columns; i++) {
2179 emit_block_move(dst, src, vec_type, predicate);
2180 }
2181 return;
2182 }
2183
2184 assert(type->is_scalar() || type->is_vector());
2185
2186 dst->type = brw_type_for_base_type(type);
2187 src->type = dst->type;
2188
2189 dst->writemask = (1 << type->vector_elements) - 1;
2190
2191 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2192
2193 vec4_instruction *inst = emit(MOV(*dst, *src));
2194 inst->predicate = predicate;
2195
2196 dst->reg_offset++;
2197 src->reg_offset++;
2198 }
2199
2200
2201 /* If the RHS processing resulted in an instruction generating a
2202 * temporary value, and it would be easy to rewrite the instruction to
2203 * generate its result right into the LHS instead, do so. This ends
2204 * up reliably removing instructions where it can be tricky to do so
2205 * later without real UD chain information.
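* For example, if the RHS was computed by "ADD tmp, a, b", the ADD is
* rewritten to "ADD dst, a, b" and no copy MOV needs to be emitted.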
2206 */
2207 bool
2208 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2209 dst_reg dst,
2210 src_reg src,
2211 vec4_instruction *pre_rhs_inst,
2212 vec4_instruction *last_rhs_inst)
2213 {
2214 /* This could be supported, but it would take more smarts. */
2215 if (ir->condition)
2216 return false;
2217
2218 if (pre_rhs_inst == last_rhs_inst)
2219 return false; /* No instructions generated to work with. */
2220
2221 /* Make sure the last instruction generated our source reg. */
2222 if (src.file != GRF ||
2223 src.file != last_rhs_inst->dst.file ||
2224 src.reg != last_rhs_inst->dst.reg ||
2225 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2226 src.reladdr ||
2227 src.abs ||
2228 src.negate ||
2229 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2230 return false;
2231
2232 /* Check that the last instruction fully initialized the channels
2233 * we want to use, in the order we want to use them. We could
2234 * potentially reswizzle the operands of many instructions so that
2235 * we could handle out of order channels, but don't yet.
2236 */
2237
2238 for (unsigned i = 0; i < 4; i++) {
2239 if (dst.writemask & (1 << i)) {
2240 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2241 return false;
2242
2243 if (BRW_GET_SWZ(src.swizzle, i) != i)
2244 return false;
2245 }
2246 }
2247
2248 /* Success! Rewrite the instruction. */
2249 last_rhs_inst->dst.file = dst.file;
2250 last_rhs_inst->dst.reg = dst.reg;
2251 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2252 last_rhs_inst->dst.reladdr = dst.reladdr;
2253 last_rhs_inst->dst.writemask &= dst.writemask;
2254
2255 return true;
2256 }
2257
2258 void
2259 vec4_visitor::visit(ir_assignment *ir)
2260 {
2261 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2262 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2263
2264 if (!ir->lhs->type->is_scalar() &&
2265 !ir->lhs->type->is_vector()) {
2266 ir->rhs->accept(this);
2267 src_reg src = this->result;
2268
2269 if (ir->condition) {
2270 emit_bool_to_cond_code(ir->condition, &predicate);
2271 }
2272
2273 /* emit_block_move doesn't account for swizzles in the source register.
2274 * This should be ok, since the source register is a structure or an
2275 * array, and those can't be swizzled. But double-check to be sure.
2276 */
2277 assert(src.swizzle ==
2278 (ir->rhs->type->is_matrix()
2279 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2280 : BRW_SWIZZLE_NOOP));
2281
2282 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2283 return;
2284 }
2285
2286 /* Now we're down to just a scalar/vector with writemasks. */
2287 int i;
2288
2289 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2290 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2291
2292 ir->rhs->accept(this);
2293
2294 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2295
2296 int swizzles[4];
2297 int src_chan = 0;
2298
2299 assert(ir->lhs->type->is_vector() ||
2300 ir->lhs->type->is_scalar());
2301 dst.writemask = ir->write_mask;
2302
2303 /* Swizzle a small RHS vector into the channels being written.
2304 *
2305 * GLSL IR treats write_mask as dictating how many channels are
2306 * present on the RHS, while in our instructions we need to make
2307 * those channels appear in the slots of the vec4 they're written to.
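* For example, a write mask of .xz maps the RHS channels with the swizzle
* (x, x, y, x), so dst.x reads rhs.x and dst.z reads rhs.y.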
2308 */
2309 for (int i = 0; i < 4; i++)
2310 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2311
2312 src_reg src = swizzle(this->result,
2313 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2314 swizzles[2], swizzles[3]));
2315
2316 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2317 return;
2318 }
2319
2320 if (ir->condition) {
2321 emit_bool_to_cond_code(ir->condition, &predicate);
2322 }
2323
2324 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2325 vec4_instruction *inst = emit(MOV(dst, src));
2326 inst->predicate = predicate;
2327
2328 dst.reg_offset++;
2329 src.reg_offset++;
2330 }
2331 }
2332
2333 void
2334 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2335 {
2336 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2337 foreach_in_list(ir_constant, field_value, &ir->components) {
2338 emit_constant_values(dst, field_value);
2339 }
2340 return;
2341 }
2342
2343 if (ir->type->is_array()) {
2344 for (unsigned int i = 0; i < ir->type->length; i++) {
2345 emit_constant_values(dst, ir->array_elements[i]);
2346 }
2347 return;
2348 }
2349
2350 if (ir->type->is_matrix()) {
2351 for (int i = 0; i < ir->type->matrix_columns; i++) {
2352 float *vec = &ir->value.f[i * ir->type->vector_elements];
2353
2354 for (int j = 0; j < ir->type->vector_elements; j++) {
2355 dst->writemask = 1 << j;
2356 dst->type = BRW_REGISTER_TYPE_F;
2357
2358 emit(MOV(*dst, src_reg(vec[j])));
2359 }
2360 dst->reg_offset++;
2361 }
2362 return;
2363 }
2364
2365 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2366
2367 for (int i = 0; i < ir->type->vector_elements; i++) {
2368 if (!(remaining_writemask & (1 << i)))
2369 continue;
2370
2371 dst->writemask = 1 << i;
2372 dst->type = brw_type_for_base_type(ir->type);
2373
2374 /* Find other components that match the one we're about to
2375 * write. Emits fewer instructions for things like vec4(0.5,
2376 * 1.5, 1.5, 1.5).
2377 */
2378 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2379 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2380 if (ir->value.b[i] == ir->value.b[j])
2381 dst->writemask |= (1 << j);
2382 } else {
2383 /* u, i, and f storage all line up, so no need for a
2384 * switch case for comparing each type.
2385 */
2386 if (ir->value.u[i] == ir->value.u[j])
2387 dst->writemask |= (1 << j);
2388 }
2389 }
2390
2391 switch (ir->type->base_type) {
2392 case GLSL_TYPE_FLOAT:
2393 emit(MOV(*dst, src_reg(ir->value.f[i])));
2394 break;
2395 case GLSL_TYPE_INT:
2396 emit(MOV(*dst, src_reg(ir->value.i[i])));
2397 break;
2398 case GLSL_TYPE_UINT:
2399 emit(MOV(*dst, src_reg(ir->value.u[i])));
2400 break;
2401 case GLSL_TYPE_BOOL:
2402 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2403 break;
2404 default:
2405 unreachable("Non-float/uint/int/bool constant");
2406 }
2407
2408 remaining_writemask &= ~dst->writemask;
2409 }
2410 dst->reg_offset++;
2411 }
2412
2413 void
2414 vec4_visitor::visit(ir_constant *ir)
2415 {
2416 dst_reg dst = dst_reg(this, ir->type);
2417 this->result = src_reg(dst);
2418
2419 emit_constant_values(&dst, ir);
2420 }
2421
2422 void
2423 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2424 {
2425 ir_dereference *deref = static_cast<ir_dereference *>(
2426 ir->actual_parameters.get_head());
2427 ir_variable *location = deref->variable_referenced();
2428 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2429 location->data.binding);
2430
2431 /* Calculate the surface offset */
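/* The offset is the counter's byte offset within its binding point, plus
 * array_index * ATOMIC_COUNTER_SIZE when the counter is an array element.
 */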
2432 src_reg offset(this, glsl_type::uint_type);
2433 ir_dereference_array *deref_array = deref->as_dereference_array();
2434 if (deref_array) {
2435 deref_array->array_index->accept(this);
2436
2437 src_reg tmp(this, glsl_type::uint_type);
2438 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2439 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2440 } else {
2441 offset = location->data.atomic.offset;
2442 }
2443
2444 /* Emit the appropriate machine instruction */
2445 const char *callee = ir->callee->function_name();
2446 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2447
2448 if (!strcmp("__intrinsic_atomic_read", callee)) {
2449 emit_untyped_surface_read(surf_index, dst, offset);
2450
2451 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2452 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2453 src_reg(), src_reg());
2454
2455 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2456 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2457 src_reg(), src_reg());
2458 }
2459
2460 brw_mark_surface_used(stage_prog_data, surf_index);
2461 }
2462
2463 void
2464 vec4_visitor::visit(ir_call *ir)
2465 {
2466 const char *callee = ir->callee->function_name();
2467
2468 if (!strcmp("__intrinsic_atomic_read", callee) ||
2469 !strcmp("__intrinsic_atomic_increment", callee) ||
2470 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2471 visit_atomic_counter_intrinsic(ir);
2472 } else {
2473 unreachable("Unsupported intrinsic.");
2474 }
2475 }
2476
2477 src_reg
2478 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2479 src_reg coordinate, src_reg sampler)
2480 {
2481 vec4_instruction *inst =
2482 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2483 dst_reg(this, glsl_type::uvec4_type));
2484 inst->base_mrf = 2;
2485 inst->src[1] = sampler;
2486
2487 int param_base;
2488
2489 if (devinfo->gen >= 9) {
2490 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2491 vec4_instruction *header_inst = new(mem_ctx)
2492 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2493 dst_reg(MRF, inst->base_mrf));
2494
2495 emit(header_inst);
2496
2497 inst->mlen = 2;
2498 inst->header_size = 1;
2499 param_base = inst->base_mrf + 1;
2500 } else {
2501 inst->mlen = 1;
2502 param_base = inst->base_mrf;
2503 }
2504
2505 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2506 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2507 int zero_mask = 0xf & ~coord_mask;
2508
2509 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2510 coordinate));
2511
2512 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2513 src_reg(0)));
2514
2515 emit(inst);
2516 return src_reg(inst->dst);
2517 }
2518
2519 bool
2520 vec4_visitor::is_high_sampler(src_reg sampler)
2521 {
2522 if (devinfo->gen < 8 && !devinfo->is_haswell)
2523 return false;
2524
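/* Sampler indices above 15 don't fit in the 4-bit sampler field of the
 * message descriptor, so they need a message header to select the sampler
 * state; a non-immediate index might end up above 15, so treat it the same
 * way. Only Haswell and Gen8+ support more than 16 samplers.
 */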
2525 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2526 }
2527
2528 void
2529 vec4_visitor::emit_texture(ir_texture_opcode op,
2530 dst_reg dest,
2531 const glsl_type *dest_type,
2532 src_reg coordinate,
2533 int coord_components,
2534 src_reg shadow_comparitor,
2535 src_reg lod, src_reg lod2,
2536 src_reg sample_index,
2537 uint32_t constant_offset,
2538 src_reg offset_value,
2539 src_reg mcs,
2540 bool is_cube_array,
2541 uint32_t sampler,
2542 src_reg sampler_reg)
2543 {
2544 enum opcode opcode;
2545 switch (op) {
2546 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2547 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2548 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2549 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2550 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2551 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2552 case ir_tg4: opcode = offset_value.file != BAD_FILE
2553 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2554 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2555 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2556 case ir_txb:
2557 unreachable("TXB is not valid for vertex shaders.");
2558 case ir_lod:
2559 unreachable("LOD is not valid for vertex shaders.");
2560 default:
2561 unreachable("Unrecognized tex op");
2562 }
2563
2564 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2565 opcode, dst_reg(this, dest_type));
2566
2567 inst->offset = constant_offset;
2568
2569 /* The message header is necessary for:
2570 * - Gen4 (always)
2571 * - Gen9+ for selecting SIMD4x2
2572 * - Texel offsets
2573 * - Gather channel selection
2574 * - Sampler indices too large to fit in a 4-bit value.
2575 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2576 */
2577 inst->header_size =
2578 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2579 inst->offset != 0 || op == ir_tg4 ||
2580 op == ir_texture_samples ||
2581 is_high_sampler(sampler_reg)) ? 1 : 0;
2582 inst->base_mrf = 2;
2583 inst->mlen = inst->header_size;
2584 inst->dst.writemask = WRITEMASK_XYZW;
2585 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2586
2587 inst->src[1] = sampler_reg;
2588
2589 /* MRF for the first parameter */
2590 int param_base = inst->base_mrf + inst->header_size;
2591
2592 if (op == ir_txs || op == ir_query_levels) {
2593 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2594 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2595 inst->mlen++;
2596 } else if (op == ir_texture_samples) {
2597 inst->dst.writemask = WRITEMASK_X;
2598 } else {
2599 /* Load the coordinate */
2600 /* FINISHME: gl_clamp_mask and saturate */
2601 int coord_mask = (1 << coord_components) - 1;
2602 int zero_mask = 0xf & ~coord_mask;
2603
2604 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2605 coordinate));
2606 inst->mlen++;
2607
2608 if (zero_mask != 0) {
2609 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2610 src_reg(0)));
2611 }
2612 /* Load the shadow comparitor */
2613 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2614 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2615 WRITEMASK_X),
2616 shadow_comparitor));
2617 inst->mlen++;
2618 }
2619
2620 /* Load the LOD info */
2621 if (op == ir_tex || op == ir_txl) {
2622 int mrf, writemask;
2623 if (devinfo->gen >= 5) {
2624 mrf = param_base + 1;
2625 if (shadow_comparitor.file != BAD_FILE) {
2626 writemask = WRITEMASK_Y;
2627 /* mlen already incremented */
2628 } else {
2629 writemask = WRITEMASK_X;
2630 inst->mlen++;
2631 }
2632 } else /* devinfo->gen == 4 */ {
2633 mrf = param_base;
2634 writemask = WRITEMASK_W;
2635 }
2636 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2637 } else if (op == ir_txf) {
2638 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2639 } else if (op == ir_txf_ms) {
2640 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2641 sample_index));
2642 if (devinfo->gen >= 7) {
2643 /* MCS data is in the first channel of `mcs`, but we need to get it into
2644 * the .y channel of the second vec4 of params, so replicate .x across
2645 * the whole vec4 and then mask off everything except .y
2646 */
2647 mcs.swizzle = BRW_SWIZZLE_XXXX;
2648 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2649 mcs));
2650 }
2651 inst->mlen++;
2652 } else if (op == ir_txd) {
2653 const brw_reg_type type = lod.type;
2654
2655 if (devinfo->gen >= 5) {
2656 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2657 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2658 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2659 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2660 inst->mlen++;
2661
2662 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2663 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2664 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2667 inst->mlen++;
2668
2669 if (shadow_comparitor.file != BAD_FILE) {
2670 emit(MOV(dst_reg(MRF, param_base + 2,
2671 shadow_comparitor.type, WRITEMASK_Z),
2672 shadow_comparitor));
2673 }
2674 }
2675 } else /* devinfo->gen == 4 */ {
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2677 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2678 inst->mlen += 2;
2679 }
2680 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2681 if (shadow_comparitor.file != BAD_FILE) {
2682 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2683 shadow_comparitor));
2684 }
2685
2686 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2687 offset_value));
2688 inst->mlen++;
2689 }
2690 }
2691
2692 emit(inst);
2693
2694 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2695 * spec requires layers.
2696 */
2697 if (op == ir_txs && is_cube_array) {
2698 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2699 writemask(inst->dst, WRITEMASK_Z),
2700 src_reg(inst->dst), src_reg(6));
2701 }
2702
2703 if (devinfo->gen == 6 && op == ir_tg4) {
2704 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2705 }
2706
2707 swizzle_result(op, dest,
2708 src_reg(inst->dst), sampler, dest_type);
2709 }
2710
2711 void
2712 vec4_visitor::visit(ir_texture *ir)
2713 {
2714 uint32_t sampler =
2715 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2716
2717 ir_rvalue *nonconst_sampler_index =
2718 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2719
2720 /* Handle non-constant sampler array indexing */
2721 src_reg sampler_reg;
2722 if (nonconst_sampler_index) {
2723 /* The highest sampler which may be used by this operation is
2724 * the last element of the array. Mark it here, because the generator
2725 * doesn't have enough information to determine the bound.
2726 */
2727 uint32_t array_size = ir->sampler->as_dereference_array()
2728 ->array->type->array_size();
2729
2730 uint32_t max_used = sampler + array_size - 1;
2731 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2732 max_used += prog_data->base.binding_table.gather_texture_start;
2733 } else {
2734 max_used += prog_data->base.binding_table.texture_start;
2735 }
2736
2737 brw_mark_surface_used(&prog_data->base, max_used);
2738
2739 /* Emit code to evaluate the actual indexing expression */
2740 nonconst_sampler_index->accept(this);
2741 src_reg temp(this, glsl_type::uint_type);
2742 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2743 sampler_reg = emit_uniformize(temp);
2744 } else {
2745 /* Single sampler, or constant array index; the indexing expression
2746 * is just an immediate.
2747 */
2748 sampler_reg = src_reg(sampler);
2749 }
2750
2751 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2752 * emitting anything other than setting up the constant result.
2753 */
2754 if (ir->op == ir_tg4) {
2755 ir_constant *chan = ir->lod_info.component->as_constant();
2756 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2757 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2758 dst_reg result(this, ir->type);
2759 this->result = src_reg(result);
2760 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2761 return;
2762 }
2763 }
2764
2765 /* Should be lowered by do_lower_texture_projection */
2766 assert(!ir->projector);
2767
2768 /* Should be lowered */
2769 assert(!ir->offset || !ir->offset->type->is_array());
2770
2771 /* Generate code to compute all the subexpression trees. This has to be
2772 * done before loading any values into MRFs for the sampler message since
2773 * generating these values may involve SEND messages that need the MRFs.
2774 */
2775 src_reg coordinate;
2776 int coord_components = 0;
2777 if (ir->coordinate) {
2778 coord_components = ir->coordinate->type->vector_elements;
2779 ir->coordinate->accept(this);
2780 coordinate = this->result;
2781 }
2782
2783 src_reg shadow_comparitor;
2784 if (ir->shadow_comparitor) {
2785 ir->shadow_comparitor->accept(this);
2786 shadow_comparitor = this->result;
2787 }
2788
2789 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2790 src_reg offset_value;
2791 if (has_nonconstant_offset) {
2792 ir->offset->accept(this);
2793 offset_value = src_reg(this->result);
2794 }
2795
2796 src_reg lod, lod2, sample_index, mcs;
2797 switch (ir->op) {
2798 case ir_tex:
2799 lod = src_reg(0.0f);
2800 break;
2801 case ir_txf:
2802 case ir_txl:
2803 case ir_txs:
2804 ir->lod_info.lod->accept(this);
2805 lod = this->result;
2806 break;
2807 case ir_query_levels:
2808 lod = src_reg(0);
2809 break;
2810 case ir_txf_ms:
2811 ir->lod_info.sample_index->accept(this);
2812 sample_index = this->result;
2813
2814 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2815 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2816 else
2817 mcs = src_reg(0u);
2818 break;
2819 case ir_txd:
2820 ir->lod_info.grad.dPdx->accept(this);
2821 lod = this->result;
2822
2823 ir->lod_info.grad.dPdy->accept(this);
2824 lod2 = this->result;
2825 break;
2826 case ir_txb:
2827 case ir_lod:
2828 case ir_tg4:
2829 case ir_texture_samples:
2830 break;
2831 }
2832
2833 uint32_t constant_offset = 0;
2834 if (ir->offset != NULL && !has_nonconstant_offset) {
2835 constant_offset =
2836 brw_texture_offset(ir->offset->as_constant()->value.i,
2837 ir->offset->type->vector_elements);
2838 }
2839
2840 /* Stuff the channel select bits in the top of the texture offset */
2841 if (ir->op == ir_tg4)
2842 constant_offset |=
2843 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2844 sampler) << 16;
2845
2846 glsl_type const *type = ir->sampler->type;
2847 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2848 type->sampler_array;
2849
2850 this->result = src_reg(this, ir->type);
2851 dst_reg dest = dst_reg(this->result);
2852
2853 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2854 shadow_comparitor,
2855 lod, lod2, sample_index,
2856 constant_offset, offset_value,
2857 mcs, is_cube_array, sampler, sampler_reg);
2858 }
2859
2860 /**
2861 * Apply workarounds for Gen6 gather with UINT/SINT
2862 */
2863 void
2864 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2865 {
2866 if (!wa)
2867 return;
2868
2869 int width = (wa & WA_8BIT) ? 8 : 16;
2870 dst_reg dst_f = dst;
2871 dst_f.type = BRW_REGISTER_TYPE_F;
2872
2873 /* Convert from UNORM to UINT */
2874 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2875 emit(MOV(dst, src_reg(dst_f)));
2876
2877 if (wa & WA_SIGN) {
2878 /* Reinterpret the UINT value as a signed INT value by
2879 * shifting the sign bit into place, then shifting back
2880 * preserving sign.
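* For example, with width == 8 a raw value of 0xff is shifted left to
* 0xff000000 and arithmetically shifted back to 0xffffffff (-1).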
2881 */
2882 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2883 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2884 }
2885 }
2886
2887 /**
2888 * Set up the gather channel based on the swizzle, for gather4.
2889 */
2890 uint32_t
2891 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2892 {
2893 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2894 switch (swiz) {
2895 case SWIZZLE_X: return 0;
2896 case SWIZZLE_Y:
2897 /* gather4 sampler is broken for green channel on RG32F --
2898 * we must ask for blue instead.
2899 */
2900 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2901 return 2;
2902 return 1;
2903 case SWIZZLE_Z: return 2;
2904 case SWIZZLE_W: return 3;
2905 default:
2906 unreachable("Not reached"); /* zero, one swizzles handled already */
2907 }
2908 }
2909
2910 void
2911 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2912 src_reg orig_val, uint32_t sampler,
2913 const glsl_type *dest_type)
2914 {
2915 int s = key_tex->swizzles[sampler];
2916
2917 dst_reg swizzled_result = dest;
2918
2919 if (op == ir_query_levels) {
2920 /* # levels is in .w */
2921 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2922 emit(MOV(swizzled_result, orig_val));
2923 return;
2924 }
2925
2926 if (op == ir_txs || dest_type == glsl_type::float_type
2927 || s == SWIZZLE_NOOP || op == ir_tg4) {
2928 emit(MOV(swizzled_result, orig_val));
2929 return;
2930 }
2931
2932
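/* Split the API swizzle into channels copied from the sampled result,
 * channels forced to zero and channels forced to one; e.g. a swizzle of
 * (R, G, ZERO, ONE) gives copy_mask = .xy, zero_mask = .z, one_mask = .w.
 */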
2933 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2934 int swizzle[4] = {0};
2935
2936 for (int i = 0; i < 4; i++) {
2937 switch (GET_SWZ(s, i)) {
2938 case SWIZZLE_ZERO:
2939 zero_mask |= (1 << i);
2940 break;
2941 case SWIZZLE_ONE:
2942 one_mask |= (1 << i);
2943 break;
2944 default:
2945 copy_mask |= (1 << i);
2946 swizzle[i] = GET_SWZ(s, i);
2947 break;
2948 }
2949 }
2950
2951 if (copy_mask) {
2952 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2953 swizzled_result.writemask = copy_mask;
2954 emit(MOV(swizzled_result, orig_val));
2955 }
2956
2957 if (zero_mask) {
2958 swizzled_result.writemask = zero_mask;
2959 emit(MOV(swizzled_result, src_reg(0.0f)));
2960 }
2961
2962 if (one_mask) {
2963 swizzled_result.writemask = one_mask;
2964 emit(MOV(swizzled_result, src_reg(1.0f)));
2965 }
2966 }
2967
2968 void
2969 vec4_visitor::visit(ir_return *)
2970 {
2971 unreachable("not reached");
2972 }
2973
2974 void
2975 vec4_visitor::visit(ir_discard *)
2976 {
2977 unreachable("not reached");
2978 }
2979
2980 void
2981 vec4_visitor::visit(ir_if *ir)
2982 {
2983 /* Don't point the annotation at the if statement, because then it plus
2984 * the then and else blocks get printed.
2985 */
2986 this->base_ir = ir->condition;
2987
2988 if (devinfo->gen == 6) {
2989 emit_if_gen6(ir);
2990 } else {
2991 enum brw_predicate predicate;
2992 emit_bool_to_cond_code(ir->condition, &predicate);
2993 emit(IF(predicate));
2994 }
2995
2996 visit_instructions(&ir->then_instructions);
2997
2998 if (!ir->else_instructions.is_empty()) {
2999 this->base_ir = ir->condition;
3000 emit(BRW_OPCODE_ELSE);
3001
3002 visit_instructions(&ir->else_instructions);
3003 }
3004
3005 this->base_ir = ir->condition;
3006 emit(BRW_OPCODE_ENDIF);
3007 }
3008
3009 void
3010 vec4_visitor::gs_emit_vertex(int stream_id)
3011 {
3012 unreachable("not reached");
3013 }
3014
3015 void
3016 vec4_visitor::visit(ir_emit_vertex *)
3017 {
3018 unreachable("not reached");
3019 }
3020
3021 void
3022 vec4_visitor::gs_end_primitive()
3023 {
3024 unreachable("not reached");
3025 }
3026
3027
3028 void
3029 vec4_visitor::visit(ir_end_primitive *)
3030 {
3031 unreachable("not reached");
3032 }
3033
3034 void
3035 vec4_visitor::visit(ir_barrier *)
3036 {
3037 unreachable("not reached");
3038 }
3039
3040 void
3041 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3042 dst_reg dst, src_reg offset,
3043 src_reg src0, src_reg src1)
3044 {
3045 unsigned mlen = 0;
3046
3047 /* Set the atomic operation offset. */
3048 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3049 mlen++;
3050
3051 /* Set the atomic operation arguments. */
3052 if (src0.file != BAD_FILE) {
3053 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3054 mlen++;
3055 }
3056
3057 if (src1.file != BAD_FILE) {
3058 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3059 mlen++;
3060 }
3061
3062 /* Emit the instruction. Note that this maps to the normal SIMD8
3063 * untyped atomic message on Ivy Bridge, but that's OK because
3064 * unused channels will be masked out.
3065 */
3066 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3067 brw_message_reg(0),
3068 src_reg(surf_index), src_reg(atomic_op));
3069 inst->mlen = mlen;
3070 }
3071
3072 void
3073 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3074 src_reg offset)
3075 {
3076 /* Set the surface read offset. */
3077 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3078
3079 /* Emit the instruction. Note that this maps to the normal SIMD8
3080 * untyped surface read message, but that's OK because unused
3081 * channels will be masked out.
3082 */
3083 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3084 brw_message_reg(0),
3085 src_reg(surf_index), src_reg(1));
3086 inst->mlen = 1;
3087 }
3088
3089 void
3090 vec4_visitor::emit_ndc_computation()
3091 {
3092 /* Get the position */
3093 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3094
3095 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3096 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3097 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3098
3099 current_annotation = "NDC";
3100 dst_reg ndc_w = ndc;
3101 ndc_w.writemask = WRITEMASK_W;
3102 src_reg pos_w = pos;
3103 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3104 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3105
3106 dst_reg ndc_xyz = ndc;
3107 ndc_xyz.writemask = WRITEMASK_XYZ;
3108
3109 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3110 }
3111
3112 void
3113 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3114 {
3115 if (devinfo->gen < 6 &&
3116 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3117 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3118 devinfo->has_negative_rhw_bug)) {
3119 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3120 dst_reg header1_w = header1;
3121 header1_w.writemask = WRITEMASK_W;
3122
3123 emit(MOV(header1, 0u));
3124
3125 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3126 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3127
3128 current_annotation = "Point size";
3129 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3130 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3131 }
3132
3133 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3134 current_annotation = "Clipping flags";
3135 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3136 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3137
3138 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3139 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3140 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3141
3142 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3143 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3144 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3145 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3146 }
3147
3148 /* i965 clipping workaround:
3149 * 1) Test for -ve rhw
3150 * 2) If set,
3151 * set ndc = (0,0,0,0)
3152 * set ucp[6] = 1
3153 *
3154 * Later, clipping will detect ucp[6] and ensure the primitive is
3155 * clipped against all fixed planes.
3156 */
3157 if (devinfo->has_negative_rhw_bug) {
3158 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3159 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3160 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3161 vec4_instruction *inst;
3162 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3163 inst->predicate = BRW_PREDICATE_NORMAL;
3164 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3165 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3166 inst->predicate = BRW_PREDICATE_NORMAL;
3167 }
3168
3169 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3170 } else if (devinfo->gen < 6) {
3171 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3172 } else {
3173 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3174 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3175 dst_reg reg_w = reg;
3176 reg_w.writemask = WRITEMASK_W;
3177 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3178 reg_as_src.type = reg_w.type;
3179 reg_as_src.swizzle = brw_swizzle_for_size(1);
3180 emit(MOV(reg_w, reg_as_src));
3181 }
3182 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3183 dst_reg reg_y = reg;
3184 reg_y.writemask = WRITEMASK_Y;
3185 reg_y.type = BRW_REGISTER_TYPE_D;
3186 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3187 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3188 }
3189 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3190 dst_reg reg_z = reg;
3191 reg_z.writemask = WRITEMASK_Z;
3192 reg_z.type = BRW_REGISTER_TYPE_D;
3193 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3194 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3195 }
3196 }
3197 }
3198
3199 vec4_instruction *
3200 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3201 {
3202 assert(varying < VARYING_SLOT_MAX);
3203 assert(output_reg[varying].type == reg.type);
3204 current_annotation = output_reg_annotation[varying];
3205 /* Copy the register, saturating if necessary */
3206 return emit(MOV(reg, src_reg(output_reg[varying])));
3207 }
3208
3209 void
3210 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3211 {
3212 reg.type = BRW_REGISTER_TYPE_F;
3213 output_reg[varying].type = reg.type;
3214
3215 switch (varying) {
3216 case VARYING_SLOT_PSIZ:
3217 {
3218 /* PSIZ is always in slot 0, and is coupled with other flags. */
3219 current_annotation = "indices, point width, clip flags";
3220 emit_psiz_and_flags(reg);
3221 break;
3222 }
3223 case BRW_VARYING_SLOT_NDC:
3224 current_annotation = "NDC";
3225 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3226 break;
3227 case VARYING_SLOT_POS:
3228 current_annotation = "gl_Position";
3229 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3230 break;
3231 case VARYING_SLOT_EDGE:
3232 /* This is present when doing unfilled polygons. We're supposed to copy
3233 * the edge flag from the user-provided vertex array
3234 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3235 * of that attribute (starts as 1.0f). This is then used in clipping to
3236 * determine which edges should be drawn as wireframe.
3237 */
3238 current_annotation = "edge flag";
3239 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3240 glsl_type::float_type, WRITEMASK_XYZW))));
3241 break;
3242 case BRW_VARYING_SLOT_PAD:
3243 /* No need to write to this slot */
3244 break;
3245 default:
3246 emit_generic_urb_slot(reg, varying);
3247 break;
3248 }
3249 }
3250
3251 static int
3252 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3253 {
3254 if (devinfo->gen >= 6) {
3255 /* URB data written (does not include the message header reg) must
3256 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3257 * section 5.4.3.2.2: URB_INTERLEAVED.
3258 *
3259 * URB entries are allocated on a multiple of 1024 bits, so an
3260 * extra 128 bits written here to make the end align to 256 is
3261 * no problem.
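* For example, a header plus 5 data registers (mlen == 6) gets padded to
* mlen == 7 so that 6 data registers are written.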
3262 */
3263 if ((mlen % 2) != 1)
3264 mlen++;
3265 }
3266
3267 return mlen;
3268 }
3269
3270
3271 /**
3272 * Generates the VUE payload plus the necessary URB write instructions to
3273 * output it.
3274 *
3275 * The VUE layout is documented in Volume 2a.
3276 */
3277 void
3278 vec4_visitor::emit_vertex()
3279 {
3280 /* MRF 0 is reserved for the debugger, so start with message header
3281 * in MRF 1.
3282 */
3283 int base_mrf = 1;
3284 int mrf = base_mrf;
3285 /* In the process of generating our URB write message contents, we
3286 * may need to unspill a register or load from an array. Those
3287 * reads would use the MRFs starting at FIRST_SPILL_MRF().
3288 */
3289 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3290
3291 /* The following assertion verifies that max_usable_mrf causes an
3292 * even-numbered amount of URB write data, which will meet gen6's
3293 * requirements for length alignment.
3294 */
3295 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3296
3297 /* First mrf is the g0-based message header containing URB handles and
3298 * such.
3299 */
3300 emit_urb_write_header(mrf++);
3301
3302 if (devinfo->gen < 6) {
3303 emit_ndc_computation();
3304 }
3305
3306 /* We may need to split this up into several URB writes, so do them in a
3307 * loop.
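* Each iteration fills MRFs (base_mrf + 1)..max_usable_mrf with as many
* slots as fit, and the next write continues at the matching URB row offset.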
3308 */
3309 int slot = 0;
3310 bool complete = false;
3311 do {
3312 /* URB offset is in URB row increments, and each of our MRFs is half of
3313 * one of those, since we're doing interleaved writes.
3314 */
3315 int offset = slot / 2;
3316
3317 mrf = base_mrf + 1;
3318 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3319 emit_urb_slot(dst_reg(MRF, mrf++),
3320 prog_data->vue_map.slot_to_varying[slot]);
3321
3322 /* If this was max_usable_mrf, we can't fit anything more into this
3323 * URB WRITE. Same thing if we reached the maximum length available.
3324 */
3325 if (mrf > max_usable_mrf ||
3326 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3327 slot++;
3328 break;
3329 }
3330 }
3331
3332 complete = slot >= prog_data->vue_map.num_slots;
3333 current_annotation = "URB write";
3334 vec4_instruction *inst = emit_urb_write_opcode(complete);
3335 inst->base_mrf = base_mrf;
3336 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3337 inst->offset += offset;
3338 } while(!complete);
3339 }
3340
3341
3342 src_reg
3343 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3344 src_reg *reladdr, int reg_offset)
3345 {
3346 /* Because we store the values to scratch interleaved like our
3347 * vertex data, we need to scale the vec4 index by 2.
3348 */
3349 int message_header_scale = 2;
3350
3351 /* Pre-gen6, the message header uses byte offsets instead of vec4
3352 * (16-byte) offset units.
3353 */
3354 if (devinfo->gen < 6)
3355 message_header_scale *= 16;
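/* For example, reg_offset 3 becomes offset 6 (in 16-byte units) on Gen6+,
 * or 96 bytes on older generations.
 */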
3356
3357 if (reladdr) {
3358 src_reg index = src_reg(this, glsl_type::int_type);
3359
3360 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3361 src_reg(reg_offset)));
3362 emit_before(block, inst, MUL(dst_reg(index), index,
3363 src_reg(message_header_scale)));
3364
3365 return index;
3366 } else {
3367 return src_reg(reg_offset * message_header_scale);
3368 }
3369 }
3370
3371 src_reg
3372 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3373 src_reg *reladdr, int reg_offset)
3374 {
3375 if (reladdr) {
3376 src_reg index = src_reg(this, glsl_type::int_type);
3377
3378 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3379 src_reg(reg_offset)));
3380
3381 /* Pre-gen6, the message header uses byte offsets instead of vec4
3382 * (16-byte) offset units.
3383 */
3384 if (devinfo->gen < 6) {
3385 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3386 }
3387
3388 return index;
3389 } else if (devinfo->gen >= 8) {
3390 /* Store the offset in a GRF so we can send-from-GRF. */
3391 src_reg offset = src_reg(this, glsl_type::int_type);
3392 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3393 return offset;
3394 } else {
3395 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3396 return src_reg(reg_offset * message_header_scale);
3397 }
3398 }
3399
3400 /**
3401 * Emits an instruction before @inst to load the value named by @orig_src
3402 * from scratch space at @base_offset to @temp.
3403 *
3404 * @base_offset is measured in 32-byte units (the size of a register).
3405 */
3406 void
3407 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3408 dst_reg temp, src_reg orig_src,
3409 int base_offset)
3410 {
3411 int reg_offset = base_offset + orig_src.reg_offset;
3412 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3413 reg_offset);
3414
3415 emit_before(block, inst, SCRATCH_READ(temp, index));
3416 }
3417
3418 /**
3419 * Emits an instruction after @inst to store the value to be written
3420 * to @orig_dst to scratch space at @base_offset, from @temp.
3421 *
3422 * @base_offset is measured in 32-byte units (the size of a register).
3423 */
3424 void
3425 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3426 int base_offset)
3427 {
3428 int reg_offset = base_offset + inst->dst.reg_offset;
3429 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3430 reg_offset);
3431
3432 /* Create a temporary register to store *inst's result in.
3433 *
3434 * We have to be careful in MOVing from our temporary result register in
3435 * the scratch write. If we swizzle from channels of the temporary that
3436 * weren't initialized, it will confuse live interval analysis, which will
3437 * make spilling fail to make progress.
3438 */
3439 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3440 inst->dst.type),
3441 brw_swizzle_for_mask(inst->dst.writemask));
3442 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3443 inst->dst.writemask));
3444 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3445 if (inst->opcode != BRW_OPCODE_SEL)
3446 write->predicate = inst->predicate;
3447 write->ir = inst->ir;
3448 write->annotation = inst->annotation;
3449 inst->insert_after(block, write);
3450
3451 inst->dst.file = temp.file;
3452 inst->dst.reg = temp.reg;
3453 inst->dst.reg_offset = temp.reg_offset;
3454 inst->dst.reladdr = NULL;
3455 }
3456
3457 /**
3458 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3459 * adds the scratch read(s) before \p inst. The function also checks for
3460 * recursive reladdr scratch accesses, issuing the corresponding scratch
3461 * loads and rewriting reladdr references accordingly.
3462 *
3463 * \return \p src if it did not require a scratch load, otherwise, the
3464 * register holding the result of the scratch load that the caller should
3465 * use to rewrite src.
3466 */
3467 src_reg
3468 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3469 vec4_instruction *inst, src_reg src)
3470 {
3471 /* Resolve recursive reladdr scratch access by calling ourselves
3472 * with src.reladdr
3473 */
3474 if (src.reladdr)
3475 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3476 *src.reladdr);
3477
3478 /* Now handle scratch access on src */
3479 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3480 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3481 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3482 src.reg = temp.reg;
3483 src.reg_offset = temp.reg_offset;
3484 src.reladdr = NULL;
3485 }
3486
3487 return src;
3488 }
3489
3490 /**
3491 * We can't generally support array access in GRF space, because a
3492 * single instruction's destination can only span 2 contiguous
3493 * registers. So, we send all GRF arrays that get variable index
3494 * access to scratch space.
3495 */
3496 void
3497 vec4_visitor::move_grf_array_access_to_scratch()
3498 {
3499 int scratch_loc[this->alloc.count];
3500 memset(scratch_loc, -1, sizeof(scratch_loc));
3501
3502 /* First, calculate the set of virtual GRFs that need to be punted
3503 * to scratch due to having any array access on them, and where in
3504 * scratch.
3505 */
3506 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3507 if (inst->dst.file == GRF && inst->dst.reladdr) {
3508 if (scratch_loc[inst->dst.reg] == -1) {
3509 scratch_loc[inst->dst.reg] = last_scratch;
3510 last_scratch += this->alloc.sizes[inst->dst.reg];
3511 }
3512
3513 for (src_reg *iter = inst->dst.reladdr;
3514 iter->reladdr;
3515 iter = iter->reladdr) {
3516 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3517 scratch_loc[iter->reg] = last_scratch;
3518 last_scratch += this->alloc.sizes[iter->reg];
3519 }
3520 }
3521 }
3522
3523 for (int i = 0 ; i < 3; i++) {
3524 for (src_reg *iter = &inst->src[i];
3525 iter->reladdr;
3526 iter = iter->reladdr) {
3527 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3528 scratch_loc[iter->reg] = last_scratch;
3529 last_scratch += this->alloc.sizes[iter->reg];
3530 }
3531 }
3532 }
3533 }
3534
3535 /* Now, for anything that will be accessed through scratch, rewrite
3536 * it to load/store. Note that this is a _safe list walk, because
3537 * we may generate a new scratch_write instruction after the one
3538 * we're processing.
3539 */
3540 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3541 /* Set up the annotation tracking for new generated instructions. */
3542 base_ir = inst->ir;
3543 current_annotation = inst->annotation;
3544
3545 /* First handle scratch access on the dst. Notice we have to handle
3546 * the case where the dst's reladdr also points to scratch space.
3547 */
3548 if (inst->dst.reladdr)
3549 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3550 *inst->dst.reladdr);
3551
3552 /* Now that we have handled any (possibly recursive) reladdr scratch
3553 * accesses for dst we can safely do the scratch write for dst itself
3554 */
3555 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3556 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3557
3558 /* Now handle scratch access on any src. In this case, since inst->src[i]
3559 * already is a src_reg, we can just call emit_resolve_reladdr with
3560 * inst->src[i] and it will take care of handling scratch loads for
3561 * both src and src.reladdr (recursively).
3562 */
3563 for (int i = 0 ; i < 3; i++) {
3564 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3565 inst->src[i]);
3566 }
3567 }
3568 }
3569
3570 /**
3571 * Emits an instruction before @inst to load the value named by @orig_src
3572 * from the pull constant buffer (surface) at @base_offset to @temp.
3573 */
3574 void
3575 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3576 dst_reg temp, src_reg orig_src,
3577 int base_offset)
3578 {
3579 int reg_offset = base_offset + orig_src.reg_offset;
3580 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3581 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3582 reg_offset);
3583
3584 emit_pull_constant_load_reg(temp,
3585 index,
3586 offset,
3587 block, inst);
3588 }
3589
3590 /**
3591 * Implements array access of uniforms by inserting a
3592 * PULL_CONSTANT_LOAD instruction.
3593 *
3594 * Unlike temporary GRF array access (where we don't support it due to
3595 * the difficulty of doing relative addressing on instruction
3596 * destinations), we could potentially do array access of uniforms
3597 * that were loaded in GRF space as push constants. In real-world
3598 * usage we've seen, though, the arrays being used are always larger
3599 * than we could load as push constants, so just always move all
3600 * uniform array access out to a pull constant buffer.
3601 */
3602 void
3603 vec4_visitor::move_uniform_array_access_to_pull_constants()
3604 {
3605 int pull_constant_loc[this->uniforms];
3606 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3607 bool nested_reladdr;
3608
3609 /* Walk through and find array access of uniforms. Put a copy of that
3610 * uniform in the pull constant buffer.
3611 *
3612 * Note that we don't move constant-indexed accesses to arrays. No
3613 * testing has been done of the performance impact of this choice.
3614 */
3615 do {
3616 nested_reladdr = false;
3617
3618 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3619 for (int i = 0 ; i < 3; i++) {
3620 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3621 continue;
3622
3623 int uniform = inst->src[i].reg;
3624
3625 if (inst->src[i].reladdr->reladdr)
3626 nested_reladdr = true; /* will need another pass */
3627
3628 /* If this array isn't already present in the pull constant buffer,
3629 * add it.
3630 */
3631 if (pull_constant_loc[uniform] == -1) {
3632 const gl_constant_value **values =
3633 &stage_prog_data->param[uniform * 4];
3634
3635 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3636
3637 assert(uniform < uniform_array_size);
3638 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3639 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3640 = values[j];
3641 }
3642 }
3643
3644 /* Set up the annotation tracking for new generated instructions. */
3645 base_ir = inst->ir;
3646 current_annotation = inst->annotation;
3647
3648 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3649
3650 emit_pull_constant_load(block, inst, temp, inst->src[i],
3651 pull_constant_loc[uniform]);
3652
3653 inst->src[i].file = temp.file;
3654 inst->src[i].reg = temp.reg;
3655 inst->src[i].reg_offset = temp.reg_offset;
3656 inst->src[i].reladdr = NULL;
3657 }
3658 }
3659 } while (nested_reladdr);
3660
3661 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3662 * no need to track them as larger-than-vec4 objects. This will be
3663 * relied on in cutting out unused uniform vectors from push
3664 * constants.
3665 */
3666 split_uniform_registers();
3667 }
3668
3669 void
3670 vec4_visitor::resolve_ud_negate(src_reg *reg)
3671 {
3672 if (reg->type != BRW_REGISTER_TYPE_UD ||
3673 !reg->negate)
3674 return;
3675
3676 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3677 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3678 *reg = temp;
3679 }
3680
3681 /**
3682 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3683 *
3684 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3685 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
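* The AND below isolates the LSB and the negated MOV then turns 1 into ~0
* (and leaves 0 as 0), yielding a canonical boolean.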
3686 */
3687 void
3688 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3689 {
3690 assert(devinfo->gen <= 5);
3691
3692 if (!rvalue->type->is_boolean())
3693 return;
3694
3695 src_reg and_result = src_reg(this, rvalue->type);
3696 src_reg neg_result = src_reg(this, rvalue->type);
3697 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3698 emit(MOV(dst_reg(neg_result), negate(and_result)));
3699 *reg = neg_result;
3700 }
3701
3702 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3703 void *log_data,
3704 struct gl_program *prog,
3705 const struct brw_sampler_prog_key_data *key_tex,
3706 struct brw_vue_prog_data *prog_data,
3707 struct gl_shader_program *shader_prog,
3708 gl_shader_stage stage,
3709 void *mem_ctx,
3710 bool no_spills,
3711 int shader_time_index)
3712 : backend_shader(compiler, log_data, mem_ctx,
3713 shader_prog, prog, &prog_data->base, stage),
3714 key_tex(key_tex),
3715 prog_data(prog_data),
3716 sanity_param_count(0),
3717 fail_msg(NULL),
3718 first_non_payload_grf(0),
3719 need_all_constants_in_pull_buffer(false),
3720 no_spills(no_spills),
3721 shader_time_index(shader_time_index),
3722 last_scratch(0)
3723 {
3724 this->failed = false;
3725
3726 this->base_ir = NULL;
3727 this->current_annotation = NULL;
3728 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3729
3730 this->variable_ht = hash_table_ctor(0,
3731 hash_table_pointer_hash,
3732 hash_table_pointer_compare);
3733
3734 this->virtual_grf_start = NULL;
3735 this->virtual_grf_end = NULL;
3736 this->live_intervals = NULL;
3737
3738 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3739
3740 this->uniforms = 0;
3741
3742 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3743 * at least one. See setup_uniforms() in brw_vec4.cpp.
3744 */
3745 this->uniform_array_size = 1;
3746 if (prog_data) {
3747 this->uniform_array_size =
3748 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3749 }
3750
3751 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3752 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3753 }
3754
3755 vec4_visitor::~vec4_visitor()
3756 {
3757 hash_table_dtor(this->variable_ht);
3758 }
3759
3760
3761 void
3762 vec4_visitor::fail(const char *format, ...)
3763 {
3764 va_list va;
3765 char *msg;
3766
3767 if (failed)
3768 return;
3769
3770 failed = true;
3771
3772 va_start(va, format);
3773 msg = ralloc_vasprintf(mem_ctx, format, va);
3774 va_end(va);
3775 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3776
3777 this->fail_msg = msg;
3778
3779 if (debug_enabled) {
3780 fprintf(stderr, "%s", msg);
3781 }
3782 }
3783
3784 } /* namespace brw */