src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * That the upper word of each write-channel be 0 is required for the
434 * following bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for array and struct, the sum of the vec4_size of
597 * each of its elements; and for sampler and atomic, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of size of vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 case GLSL_TYPE_FUNCTION:
649 unreachable("not reached");
650 }
651
652 return 0;
653 }
654
655 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
656 {
657 init();
658
659 this->file = GRF;
660 this->reg = v->alloc.allocate(type_size_vec4(type));
661
662 if (type->is_array() || type->is_record()) {
663 this->swizzle = BRW_SWIZZLE_NOOP;
664 } else {
665 this->swizzle = brw_swizzle_for_size(type->vector_elements);
666 }
667
668 this->type = brw_type_for_base_type(type);
669 }
670
671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
672 {
673 assert(size > 0);
674
675 init();
676
677 this->file = GRF;
678 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
679
680 this->swizzle = BRW_SWIZZLE_NOOP;
681
682 this->type = brw_type_for_base_type(type);
683 }
684
685 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
686 {
687 init();
688
689 this->file = GRF;
690 this->reg = v->alloc.allocate(type_size_vec4(type));
691
692 if (type->is_array() || type->is_record()) {
693 this->writemask = WRITEMASK_XYZW;
694 } else {
695 this->writemask = (1 << type->vector_elements) - 1;
696 }
697
698 this->type = brw_type_for_base_type(type);
699 }
700
701 void
702 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
703 const gl_constant_value *values,
704 unsigned n)
705 {
706 static const gl_constant_value zero = { 0 };
707
708 assert(param_offset % 4 == 0);
709
710 for (unsigned i = 0; i < n; ++i)
711 stage_prog_data->param[param_offset + i] = &values[i];
712
713 for (unsigned i = n; i < 4; ++i)
714 stage_prog_data->param[param_offset + i] = &zero;
715
716 uniform_vector_size[param_offset / 4] = n;
717 }
718
719 /* Our support for uniforms is piggy-backed on the struct
720 * gl_fragment_program, because that's where the values actually
721 * get stored, rather than in some global gl_shader_program uniform
722 * store.
723 */
724 void
725 vec4_visitor::setup_uniform_values(ir_variable *ir)
726 {
727 int namelen = strlen(ir->name);
728
729 /* The data for our (non-builtin) uniforms is stored in a series of
730 * gl_uniform_driver_storage structs for each subcomponent that
731 * glGetUniformLocation() could name. We know it's been set up in the same
732 * order we'd walk the type, so walk the list of storage and find anything
733 * with our name, or the prefix of a component that starts with our name.
734 */
735 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
736 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
737
738 if (storage->builtin)
739 continue;
740
741 if (strncmp(ir->name, storage->name, namelen) != 0 ||
742 (storage->name[namelen] != 0 &&
743 storage->name[namelen] != '.' &&
744 storage->name[namelen] != '[')) {
745 continue;
746 }
747
748 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
749 storage->type->matrix_columns);
750 const unsigned vector_size = storage->type->vector_elements;
751
752 for (unsigned s = 0; s < vector_count; s++) {
753 setup_vec4_uniform_value(uniforms * 4,
754 &storage->storage[s * vector_size],
755 vector_size);
756 uniforms++;
757 }
758 }
759 }
760
761 /* Our support for builtin uniforms is even scarier than non-builtin.
762 * It sits on top of the PROG_STATE_VAR parameters that are
763 * automatically updated from GL context state.
764 */
765 void
766 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
767 {
768 const ir_state_slot *const slots = ir->get_state_slots();
769 assert(slots != NULL);
770
771 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
772 /* This state reference has already been setup by ir_to_mesa,
773 * but we'll get the same index back here. We can reference
774 * ParameterValues directly, since unlike brw_fs.cpp, we never
775 * add new state references during compile.
776 */
777 int index = _mesa_add_state_reference(this->prog->Parameters,
778 (gl_state_index *)slots[i].tokens);
779 gl_constant_value *values =
780 &this->prog->Parameters->ParameterValues[index][0];
781
782 assert(this->uniforms < uniform_array_size);
783
784 for (unsigned j = 0; j < 4; j++)
785 stage_prog_data->param[this->uniforms * 4 + j] =
786 &values[GET_SWZ(slots[i].swizzle, j)];
787
788 this->uniform_vector_size[this->uniforms] =
789 (ir->type->is_scalar() || ir->type->is_vector() ||
790 ir->type->is_matrix() ? ir->type->vector_elements : 4);
791
792 this->uniforms++;
793 }
794 }
795
796 dst_reg *
797 vec4_visitor::variable_storage(ir_variable *var)
798 {
799 return (dst_reg *)hash_table_find(this->variable_ht, var);
800 }
801
802 void
803 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
804 enum brw_predicate *predicate)
805 {
806 ir_expression *expr = ir->as_expression();
807
808 *predicate = BRW_PREDICATE_NORMAL;
809
810 if (expr && expr->operation != ir_binop_ubo_load) {
811 src_reg op[3];
812 vec4_instruction *inst;
813
814 assert(expr->get_num_operands() <= 3);
815 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
816 expr->operands[i]->accept(this);
817 op[i] = this->result;
818
819 resolve_ud_negate(&op[i]);
820 }
821
822 switch (expr->operation) {
823 case ir_unop_logic_not:
824 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
825 inst->conditional_mod = BRW_CONDITIONAL_Z;
826 break;
827
828 case ir_binop_logic_xor:
829 if (devinfo->gen <= 5) {
830 src_reg temp = src_reg(this, ir->type);
831 emit(XOR(dst_reg(temp), op[0], op[1]));
832 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
833 } else {
834 inst = emit(XOR(dst_null_d(), op[0], op[1]));
835 }
836 inst->conditional_mod = BRW_CONDITIONAL_NZ;
837 break;
838
839 case ir_binop_logic_or:
840 if (devinfo->gen <= 5) {
841 src_reg temp = src_reg(this, ir->type);
842 emit(OR(dst_reg(temp), op[0], op[1]));
843 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
844 } else {
845 inst = emit(OR(dst_null_d(), op[0], op[1]));
846 }
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 break;
849
850 case ir_binop_logic_and:
851 if (devinfo->gen <= 5) {
852 src_reg temp = src_reg(this, ir->type);
853 emit(AND(dst_reg(temp), op[0], op[1]));
854 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
855 } else {
856 inst = emit(AND(dst_null_d(), op[0], op[1]));
857 }
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859 break;
860
861 case ir_unop_f2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_f(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_unop_i2b:
871 if (devinfo->gen >= 6) {
872 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
873 } else {
874 inst = emit(MOV(dst_null_d(), op[0]));
875 inst->conditional_mod = BRW_CONDITIONAL_NZ;
876 }
877 break;
878
879 case ir_binop_all_equal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
885 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
886 break;
887
888 case ir_binop_any_nequal:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 resolve_bool_comparison(expr->operands[1], &op[1]);
892 }
893 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
894 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
895 break;
896
897 case ir_unop_any:
898 if (devinfo->gen <= 5) {
899 resolve_bool_comparison(expr->operands[0], &op[0]);
900 }
901 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
902 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
903 break;
904
905 case ir_binop_greater:
906 case ir_binop_gequal:
907 case ir_binop_less:
908 case ir_binop_lequal:
909 case ir_binop_equal:
910 case ir_binop_nequal:
911 if (devinfo->gen <= 5) {
912 resolve_bool_comparison(expr->operands[0], &op[0]);
913 resolve_bool_comparison(expr->operands[1], &op[1]);
914 }
915 emit(CMP(dst_null_d(), op[0], op[1],
916 brw_conditional_for_comparison(expr->operation)));
917 break;
918
919 case ir_triop_csel: {
920 /* Expand the boolean condition into the flag register. */
921 inst = emit(MOV(dst_null_d(), op[0]));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923
924 /* Select which boolean to return. */
925 dst_reg temp(this, expr->operands[1]->type);
926 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
927 inst->predicate = BRW_PREDICATE_NORMAL;
928
929 /* Expand the result to a condition code. */
930 inst = emit(MOV(dst_null_d(), src_reg(temp)));
931 inst->conditional_mod = BRW_CONDITIONAL_NZ;
932 break;
933 }
934
935 default:
936 unreachable("not reached");
937 }
938 return;
939 }
940
941 ir->accept(this);
942
943 resolve_ud_negate(&this->result);
944
945 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
946 inst->conditional_mod = BRW_CONDITIONAL_NZ;
947 }
948
949 /**
950 * Emit a gen6 IF statement with the comparison folded into the IF
951 * instruction.
952 */
953 void
954 vec4_visitor::emit_if_gen6(ir_if *ir)
955 {
956 ir_expression *expr = ir->condition->as_expression();
957
958 if (expr && expr->operation != ir_binop_ubo_load) {
959 src_reg op[3];
960 dst_reg temp;
961
962 assert(expr->get_num_operands() <= 3);
963 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
964 expr->operands[i]->accept(this);
965 op[i] = this->result;
966 }
967
968 switch (expr->operation) {
969 case ir_unop_logic_not:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
971 return;
972
973 case ir_binop_logic_xor:
974 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_logic_or:
978 temp = dst_reg(this, glsl_type::bool_type);
979 emit(OR(temp, op[0], op[1]));
980 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
981 return;
982
983 case ir_binop_logic_and:
984 temp = dst_reg(this, glsl_type::bool_type);
985 emit(AND(temp, op[0], op[1]));
986 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
987 return;
988
989 case ir_unop_f2b:
990 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992
993 case ir_unop_i2b:
994 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 return;
996
997 case ir_binop_greater:
998 case ir_binop_gequal:
999 case ir_binop_less:
1000 case ir_binop_lequal:
1001 case ir_binop_equal:
1002 case ir_binop_nequal:
1003 emit(IF(op[0], op[1],
1004 brw_conditional_for_comparison(expr->operation)));
1005 return;
1006
1007 case ir_binop_all_equal:
1008 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1009 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1010 return;
1011
1012 case ir_binop_any_nequal:
1013 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1014 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1015 return;
1016
1017 case ir_unop_any:
1018 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1019 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1020 return;
1021
1022 case ir_triop_csel: {
1023 /* Expand the boolean condition into the flag register. */
1024 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1025 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1026
1027 /* Select which boolean to return. */
1028 dst_reg temp(this, expr->operands[1]->type);
1029 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1030 inst->predicate = BRW_PREDICATE_NORMAL;
1031
1032 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1033 return;
1034 }
1035
1036 default:
1037 unreachable("not reached");
1038 }
1039 return;
1040 }
1041
1042 ir->condition->accept(this);
1043
1044 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1045 }
1046
1047 void
1048 vec4_visitor::visit(ir_variable *ir)
1049 {
1050 dst_reg *reg = NULL;
1051
1052 if (variable_storage(ir))
1053 return;
1054
1055 switch (ir->data.mode) {
1056 case ir_var_shader_in:
1057 assert(ir->data.location != -1);
1058 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1059 break;
1060
1061 case ir_var_shader_out:
1062 assert(ir->data.location != -1);
1063 reg = new(mem_ctx) dst_reg(this, ir->type);
1064
1065 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1066 output_reg[ir->data.location + i] = *reg;
1067 output_reg[ir->data.location + i].reg_offset = i;
1068 output_reg_annotation[ir->data.location + i] = ir->name;
1069 }
1070 break;
1071
1072 case ir_var_auto:
1073 case ir_var_temporary:
1074 reg = new(mem_ctx) dst_reg(this, ir->type);
1075 break;
1076
1077 case ir_var_uniform:
1078 case ir_var_shader_storage:
1079 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1080
1081 /* Thanks to the lower_ubo_reference pass, we will see only
1082 * ir_binop_{ubo,ssbo}_load expressions and not ir_dereference_variable
1083 * for UBO/SSBO variables, so no need for them to be in variable_ht.
1084 *
1085 * Some uniforms, such as samplers and atomic counters, have no actual
1086 * storage, so we should ignore them.
1087 */
1088 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1089 return;
1090
1091 /* Track how big the whole uniform variable is, in case we need to put a
1092 * copy of its data into pull constants for array access.
1093 */
1094 assert(this->uniforms < uniform_array_size);
1095 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1096
1097 if (!strncmp(ir->name, "gl_", 3)) {
1098 setup_builtin_uniform_values(ir);
1099 } else {
1100 setup_uniform_values(ir);
1101 }
1102 break;
1103
1104 case ir_var_system_value:
1105 reg = make_reg_for_system_value(ir->data.location, ir->type);
1106 break;
1107
1108 default:
1109 unreachable("not reached");
1110 }
1111
1112 reg->type = brw_type_for_base_type(ir->type);
1113 hash_table_insert(this->variable_ht, reg, ir);
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_loop *ir)
1118 {
1119 /* We don't want debugging output to print the whole body of the
1120 * loop as the annotation.
1121 */
1122 this->base_ir = NULL;
1123
1124 emit(BRW_OPCODE_DO);
1125
1126 visit_instructions(&ir->body_instructions);
1127
1128 emit(BRW_OPCODE_WHILE);
1129 }
1130
1131 void
1132 vec4_visitor::visit(ir_loop_jump *ir)
1133 {
1134 switch (ir->mode) {
1135 case ir_loop_jump::jump_break:
1136 emit(BRW_OPCODE_BREAK);
1137 break;
1138 case ir_loop_jump::jump_continue:
1139 emit(BRW_OPCODE_CONTINUE);
1140 break;
1141 }
1142 }
1143
1144
1145 void
1146 vec4_visitor::visit(ir_function_signature *)
1147 {
1148 unreachable("not reached");
1149 }
1150
1151 void
1152 vec4_visitor::visit(ir_function *ir)
1153 {
1154 /* Ignore function bodies other than main() -- we shouldn't see calls to
1155 * them since they should all be inlined.
1156 */
1157 if (strcmp(ir->name, "main") == 0) {
1158 const ir_function_signature *sig;
1159 exec_list empty;
1160
1161 sig = ir->matching_signature(NULL, &empty, false);
1162
1163 assert(sig);
1164
1165 visit_instructions(&sig->body);
1166 }
1167 }
1168
1169 bool
1170 vec4_visitor::try_emit_mad(ir_expression *ir)
1171 {
1172 /* 3-src instructions were introduced in gen6. */
1173 if (devinfo->gen < 6)
1174 return false;
1175
1176 /* MAD can only handle floating-point data. */
1177 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1178 return false;
1179
1180 ir_rvalue *nonmul;
1181 ir_expression *mul;
1182 bool mul_negate, mul_abs;
1183
1184 for (int i = 0; i < 2; i++) {
1185 mul_negate = false;
1186 mul_abs = false;
1187
1188 mul = ir->operands[i]->as_expression();
1189 nonmul = ir->operands[1 - i];
1190
1191 if (mul && mul->operation == ir_unop_abs) {
1192 mul = mul->operands[0]->as_expression();
1193 mul_abs = true;
1194 } else if (mul && mul->operation == ir_unop_neg) {
1195 mul = mul->operands[0]->as_expression();
1196 mul_negate = true;
1197 }
1198
1199 if (mul && mul->operation == ir_binop_mul)
1200 break;
1201 }
1202
1203 if (!mul || mul->operation != ir_binop_mul)
1204 return false;
1205
1206 nonmul->accept(this);
1207 src_reg src0 = fix_3src_operand(this->result);
1208
1209 mul->operands[0]->accept(this);
1210 src_reg src1 = fix_3src_operand(this->result);
1211 src1.negate ^= mul_negate;
1212 src1.abs = mul_abs;
1213 if (mul_abs)
1214 src1.negate = false;
1215
1216 mul->operands[1]->accept(this);
1217 src_reg src2 = fix_3src_operand(this->result);
1218 src2.abs = mul_abs;
1219 if (mul_abs)
1220 src2.negate = false;
1221
1222 this->result = src_reg(this, ir->type);
1223 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1224
1225 return true;
1226 }
1227
1228 bool
1229 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1230 {
1231 /* This optimization relies on CMP setting the destination to 0 when
1232 * false. Early hardware only sets the least significant bit, and
1233 * leaves the other bits undefined. So we can't use it.
1234 */
1235 if (devinfo->gen < 6)
1236 return false;
1237
1238 ir_expression *const cmp = ir->operands[0]->as_expression();
1239
1240 if (cmp == NULL)
1241 return false;
1242
1243 switch (cmp->operation) {
1244 case ir_binop_less:
1245 case ir_binop_greater:
1246 case ir_binop_lequal:
1247 case ir_binop_gequal:
1248 case ir_binop_equal:
1249 case ir_binop_nequal:
1250 break;
1251
1252 default:
1253 return false;
1254 }
1255
1256 cmp->operands[0]->accept(this);
1257 const src_reg cmp_src0 = this->result;
1258
1259 cmp->operands[1]->accept(this);
1260 const src_reg cmp_src1 = this->result;
1261
1262 this->result = src_reg(this, ir->type);
1263
1264 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1265 brw_conditional_for_comparison(cmp->operation)));
1266
1267 /* If the comparison is false, this->result will just happen to be zero.
1268 */
1269 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1270 this->result, src_reg(1.0f));
1271 inst->predicate = BRW_PREDICATE_NORMAL;
1272 inst->predicate_inverse = true;
1273
1274 return true;
1275 }
1276
1277 vec4_instruction *
1278 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1279 src_reg src0, src_reg src1)
1280 {
1281 vec4_instruction *inst;
1282
1283 if (devinfo->gen >= 6) {
1284 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1285 inst->conditional_mod = conditionalmod;
1286 } else {
1287 emit(CMP(dst, src0, src1, conditionalmod));
1288
1289 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 }
1292
1293 return inst;
1294 }
1295
1296 vec4_instruction *
1297 vec4_visitor::emit_lrp(const dst_reg &dst,
1298 const src_reg &x, const src_reg &y, const src_reg &a)
1299 {
1300 if (devinfo->gen >= 6) {
1301 /* Note that the instruction's argument order is reversed from GLSL
1302 * and the IR.
1303 */
1304 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1305 fix_3src_operand(x)));
1306 } else {
1307 /* Earlier generations don't support three source operations, so we
1308 * need to emit x*(1-a) + y*a.
1309 */
1310 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1311 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1312 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1313 y_times_a.writemask = dst.writemask;
1314 one_minus_a.writemask = dst.writemask;
1315 x_times_one_minus_a.writemask = dst.writemask;
1316
1317 emit(MUL(y_times_a, y, a));
1318 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1319 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1320 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1321 }
1322 }
1323
1324 /**
1325 * Emits the instructions needed to perform a pull constant load. before_block
1326 * and before_inst can be NULL in which case the instruction will be appended
1327 * to the end of the instruction list.
1328 */
1329 void
1330 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1331 src_reg surf_index,
1332 src_reg offset_reg,
1333 bblock_t *before_block,
1334 vec4_instruction *before_inst)
1335 {
1336 assert((before_inst == NULL && before_block == NULL) ||
1337 (before_inst && before_block));
1338
1339 vec4_instruction *pull;
1340
1341 if (devinfo->gen >= 9) {
1342 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1343 src_reg header(this, glsl_type::uvec4_type, 2);
1344
1345 pull = new(mem_ctx)
1346 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1347 dst_reg(header));
1348
1349 if (before_inst)
1350 emit_before(before_block, before_inst, pull);
1351 else
1352 emit(pull);
1353
1354 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1355 offset_reg.type);
1356 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1357
1358 if (before_inst)
1359 emit_before(before_block, before_inst, pull);
1360 else
1361 emit(pull);
1362
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1364 dst,
1365 surf_index,
1366 header);
1367 pull->mlen = 2;
1368 pull->header_size = 1;
1369 } else if (devinfo->gen >= 7) {
1370 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1371
1372 grf_offset.type = offset_reg.type;
1373
1374 pull = MOV(grf_offset, offset_reg);
1375
1376 if (before_inst)
1377 emit_before(before_block, before_inst, pull);
1378 else
1379 emit(pull);
1380
1381 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1382 dst,
1383 surf_index,
1384 src_reg(grf_offset));
1385 pull->mlen = 1;
1386 } else {
1387 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1388 dst,
1389 surf_index,
1390 offset_reg);
1391 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1392 pull->mlen = 1;
1393 }
1394
1395 if (before_inst)
1396 emit_before(before_block, before_inst, pull);
1397 else
1398 emit(pull);
1399 }
1400
1401 src_reg
1402 vec4_visitor::emit_uniformize(const src_reg &src)
1403 {
1404 const src_reg chan_index(this, glsl_type::uint_type);
1405 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1406 src.type);
1407
1408 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1409 ->force_writemask_all = true;
1410 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1411 ->force_writemask_all = true;
1412
1413 return src_reg(dst);
1414 }
1415
1416 void
1417 vec4_visitor::visit(ir_expression *ir)
1418 {
1419 unsigned int operand;
1420 src_reg op[ARRAY_SIZE(ir->operands)];
1421 vec4_instruction *inst;
1422
1423 if (ir->operation == ir_binop_add) {
1424 if (try_emit_mad(ir))
1425 return;
1426 }
1427
1428 if (ir->operation == ir_unop_b2f) {
1429 if (try_emit_b2f_of_compare(ir))
1430 return;
1431 }
1432
1433 /* Storage for our result. Ideally for an assignment we'd be using
1434 * the actual storage for the result here, instead.
1435 */
1436 dst_reg result_dst(this, ir->type);
1437 src_reg result_src(result_dst);
1438
1439 if (ir->operation == ir_triop_csel) {
1440 ir->operands[1]->accept(this);
1441 op[1] = this->result;
1442 ir->operands[2]->accept(this);
1443 op[2] = this->result;
1444
1445 enum brw_predicate predicate;
1446 emit_bool_to_cond_code(ir->operands[0], &predicate);
1447 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1448 inst->predicate = predicate;
1449 this->result = result_src;
1450 return;
1451 }
1452
1453 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1454 this->result.file = BAD_FILE;
1455 ir->operands[operand]->accept(this);
1456 if (this->result.file == BAD_FILE) {
1457 fprintf(stderr, "Failed to get tree for expression operand:\n");
1458 ir->operands[operand]->fprint(stderr);
1459 exit(1);
1460 }
1461 op[operand] = this->result;
1462
1463 /* Matrix expression operands should have been broken down to vector
1464 * operations already.
1465 */
1466 assert(!ir->operands[operand]->type->is_matrix());
1467 }
1468
1469 /* If nothing special happens, this is the result. */
1470 this->result = result_src;
1471
1472 switch (ir->operation) {
1473 case ir_unop_logic_not:
1474 emit(NOT(result_dst, op[0]));
1475 break;
1476 case ir_unop_neg:
1477 op[0].negate = !op[0].negate;
1478 emit(MOV(result_dst, op[0]));
1479 break;
1480 case ir_unop_abs:
1481 op[0].abs = true;
1482 op[0].negate = false;
1483 emit(MOV(result_dst, op[0]));
1484 break;
1485
1486 case ir_unop_sign:
1487 if (ir->type->is_float()) {
1488 /* AND(val, 0x80000000) gives the sign bit.
1489 *
1490 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1491 * zero.
1492 */
1493 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1494
1495 op[0].type = BRW_REGISTER_TYPE_UD;
1496 result_dst.type = BRW_REGISTER_TYPE_UD;
1497 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1498
1499 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1500 inst->predicate = BRW_PREDICATE_NORMAL;
1501
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 } else {
1504 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1505 * -> non-negative val generates 0x00000000.
1506 * Predicated OR sets 1 if val is positive.
1507 */
1508 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1509
1510 emit(ASR(result_dst, op[0], src_reg(31)));
1511
1512 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1513 inst->predicate = BRW_PREDICATE_NORMAL;
1514 }
1515 break;
1516
1517 case ir_unop_rcp:
1518 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1519 break;
1520
1521 case ir_unop_exp2:
1522 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1523 break;
1524 case ir_unop_log2:
1525 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1526 break;
1527 case ir_unop_exp:
1528 case ir_unop_log:
1529 unreachable("not reached: should be handled by ir_explog_to_explog2");
1530 case ir_unop_sin:
1531 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1532 break;
1533 case ir_unop_cos:
1534 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1535 break;
1536
1537 case ir_unop_dFdx:
1538 case ir_unop_dFdx_coarse:
1539 case ir_unop_dFdx_fine:
1540 case ir_unop_dFdy:
1541 case ir_unop_dFdy_coarse:
1542 case ir_unop_dFdy_fine:
1543 unreachable("derivatives not valid in vertex shader");
1544
1545 case ir_unop_bitfield_reverse:
1546 emit(BFREV(result_dst, op[0]));
1547 break;
1548 case ir_unop_bit_count:
1549 emit(CBIT(result_dst, op[0]));
1550 break;
1551 case ir_unop_find_msb: {
1552 src_reg temp = src_reg(this, glsl_type::uint_type);
1553
1554 inst = emit(FBH(dst_reg(temp), op[0]));
1555 inst->dst.writemask = WRITEMASK_XYZW;
1556
1557 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1558 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1559 * subtract the result from 31 to convert the MSB count into an LSB count.
1560 */
1561
1562 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1563 temp.swizzle = BRW_SWIZZLE_NOOP;
1564 emit(MOV(result_dst, temp));
1565
1566 src_reg src_tmp = src_reg(result_dst);
1567 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1568
1569 src_tmp.negate = true;
1570 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1571 inst->predicate = BRW_PREDICATE_NORMAL;
1572 break;
1573 }
1574 case ir_unop_find_lsb:
1575 emit(FBL(result_dst, op[0]));
1576 break;
1577 case ir_unop_saturate:
1578 inst = emit(MOV(result_dst, op[0]));
1579 inst->saturate = true;
1580 break;
1581
1582 case ir_unop_noise:
1583 unreachable("not reached: should be handled by lower_noise");
1584
1585 case ir_unop_subroutine_to_int:
1586 emit(MOV(result_dst, op[0]));
1587 break;
1588
1589 case ir_unop_ssbo_unsized_array_length:
1590 unreachable("not reached: should be handled by lower_ubo_reference");
1591 break;
1592
1593 case ir_binop_add:
1594 emit(ADD(result_dst, op[0], op[1]));
1595 break;
1596 case ir_binop_sub:
1597 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1598
1599 case ir_binop_mul:
1600 if (devinfo->gen < 8 && ir->type->is_integer()) {
1601 /* For integer multiplication, the MUL uses the low 16 bits of one of
1602 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1603 * accumulates in the contribution of the upper 16 bits of that
1604 * operand. If we can determine that one of the args is in the low
1605 * 16 bits, though, we can just emit a single MUL.
1606 */
1607 if (ir->operands[0]->is_uint16_constant()) {
1608 if (devinfo->gen < 7)
1609 emit(MUL(result_dst, op[0], op[1]));
1610 else
1611 emit(MUL(result_dst, op[1], op[0]));
1612 } else if (ir->operands[1]->is_uint16_constant()) {
1613 if (devinfo->gen < 7)
1614 emit(MUL(result_dst, op[1], op[0]));
1615 else
1616 emit(MUL(result_dst, op[0], op[1]));
1617 } else {
1618 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1619
1620 emit(MUL(acc, op[0], op[1]));
1621 emit(MACH(dst_null_d(), op[0], op[1]));
1622 emit(MOV(result_dst, src_reg(acc)));
1623 }
1624 } else {
1625 emit(MUL(result_dst, op[0], op[1]));
1626 }
1627 break;
1628 case ir_binop_imul_high: {
1629 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1630
1631 emit(MUL(acc, op[0], op[1]));
1632 emit(MACH(result_dst, op[0], op[1]));
1633 break;
1634 }
1635 case ir_binop_div:
1636 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_carry:
1642 unreachable("Should have been lowered by carry_to_arith().");
1643
1644 case ir_binop_borrow:
1645 unreachable("Should have been lowered by borrow_to_arith().");
1646
1647 case ir_binop_mod:
1648 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1649 assert(ir->type->is_integer());
1650 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1651 break;
1652
1653 case ir_binop_less:
1654 case ir_binop_greater:
1655 case ir_binop_lequal:
1656 case ir_binop_gequal:
1657 case ir_binop_equal:
1658 case ir_binop_nequal: {
1659 if (devinfo->gen <= 5) {
1660 resolve_bool_comparison(ir->operands[0], &op[0]);
1661 resolve_bool_comparison(ir->operands[1], &op[1]);
1662 }
1663 emit(CMP(result_dst, op[0], op[1],
1664 brw_conditional_for_comparison(ir->operation)));
1665 break;
1666 }
1667
1668 case ir_binop_all_equal:
1669 if (devinfo->gen <= 5) {
1670 resolve_bool_comparison(ir->operands[0], &op[0]);
1671 resolve_bool_comparison(ir->operands[1], &op[1]);
1672 }
1673
1674 /* "==" operator producing a scalar boolean. */
1675 if (ir->operands[0]->type->is_vector() ||
1676 ir->operands[1]->type->is_vector()) {
1677 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1678 emit(MOV(result_dst, src_reg(0)));
1679 inst = emit(MOV(result_dst, src_reg(~0)));
1680 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1681 } else {
1682 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1683 }
1684 break;
1685 case ir_binop_any_nequal:
1686 if (devinfo->gen <= 5) {
1687 resolve_bool_comparison(ir->operands[0], &op[0]);
1688 resolve_bool_comparison(ir->operands[1], &op[1]);
1689 }
1690
1691 /* "!=" operator producing a scalar boolean. */
1692 if (ir->operands[0]->type->is_vector() ||
1693 ir->operands[1]->type->is_vector()) {
1694 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1695
1696 emit(MOV(result_dst, src_reg(0)));
1697 inst = emit(MOV(result_dst, src_reg(~0)));
1698 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1699 } else {
1700 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1701 }
1702 break;
1703
1704 case ir_unop_any:
1705 if (devinfo->gen <= 5) {
1706 resolve_bool_comparison(ir->operands[0], &op[0]);
1707 }
1708 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1709 emit(MOV(result_dst, src_reg(0)));
1710
1711 inst = emit(MOV(result_dst, src_reg(~0)));
1712 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1713 break;
1714
1715 case ir_binop_logic_xor:
1716 emit(XOR(result_dst, op[0], op[1]));
1717 break;
1718
1719 case ir_binop_logic_or:
1720 emit(OR(result_dst, op[0], op[1]));
1721 break;
1722
1723 case ir_binop_logic_and:
1724 emit(AND(result_dst, op[0], op[1]));
1725 break;
1726
1727 case ir_binop_dot:
1728 assert(ir->operands[0]->type->is_vector());
1729 assert(ir->operands[0]->type == ir->operands[1]->type);
1730 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1731 break;
1732
1733 case ir_unop_sqrt:
1734 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1735 break;
1736 case ir_unop_rsq:
1737 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1738 break;
1739
1740 case ir_unop_bitcast_i2f:
1741 case ir_unop_bitcast_u2f:
1742 this->result = op[0];
1743 this->result.type = BRW_REGISTER_TYPE_F;
1744 break;
1745
1746 case ir_unop_bitcast_f2i:
1747 this->result = op[0];
1748 this->result.type = BRW_REGISTER_TYPE_D;
1749 break;
1750
1751 case ir_unop_bitcast_f2u:
1752 this->result = op[0];
1753 this->result.type = BRW_REGISTER_TYPE_UD;
1754 break;
1755
1756 case ir_unop_i2f:
1757 case ir_unop_i2u:
1758 case ir_unop_u2i:
1759 case ir_unop_u2f:
1760 case ir_unop_f2i:
1761 case ir_unop_f2u:
1762 emit(MOV(result_dst, op[0]));
1763 break;
1764 case ir_unop_b2i:
1765 case ir_unop_b2f:
1766 if (devinfo->gen <= 5) {
1767 resolve_bool_comparison(ir->operands[0], &op[0]);
1768 }
1769 emit(MOV(result_dst, negate(op[0])));
1770 break;
1771 case ir_unop_f2b:
1772 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1773 break;
1774 case ir_unop_i2b:
1775 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1776 break;
1777
1778 case ir_unop_trunc:
1779 emit(RNDZ(result_dst, op[0]));
1780 break;
1781 case ir_unop_ceil: {
1782 src_reg tmp = src_reg(this, ir->type);
1783 op[0].negate = !op[0].negate;
1784 emit(RNDD(dst_reg(tmp), op[0]));
1785 tmp.negate = true;
1786 emit(MOV(result_dst, tmp));
1787 }
1788 break;
1789 case ir_unop_floor:
1790 inst = emit(RNDD(result_dst, op[0]));
1791 break;
1792 case ir_unop_fract:
1793 inst = emit(FRC(result_dst, op[0]));
1794 break;
1795 case ir_unop_round_even:
1796 emit(RNDE(result_dst, op[0]));
1797 break;
1798
1799 case ir_unop_get_buffer_size:
1800 unreachable("not reached: not implemented");
1801 break;
1802
1803 case ir_binop_min:
1804 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1805 break;
1806 case ir_binop_max:
1807 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1808 break;
1809
1810 case ir_binop_pow:
1811 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1812 break;
1813
1814 case ir_unop_bit_not:
1815 inst = emit(NOT(result_dst, op[0]));
1816 break;
1817 case ir_binop_bit_and:
1818 inst = emit(AND(result_dst, op[0], op[1]));
1819 break;
1820 case ir_binop_bit_xor:
1821 inst = emit(XOR(result_dst, op[0], op[1]));
1822 break;
1823 case ir_binop_bit_or:
1824 inst = emit(OR(result_dst, op[0], op[1]));
1825 break;
1826
1827 case ir_binop_lshift:
1828 inst = emit(SHL(result_dst, op[0], op[1]));
1829 break;
1830
1831 case ir_binop_rshift:
1832 if (ir->type->base_type == GLSL_TYPE_INT)
1833 inst = emit(ASR(result_dst, op[0], op[1]));
1834 else
1835 inst = emit(SHR(result_dst, op[0], op[1]));
1836 break;
1837
1838 case ir_binop_bfm:
1839 emit(BFI1(result_dst, op[0], op[1]));
1840 break;
1841
1842 case ir_binop_ubo_load: {
1843 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1844 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1845 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1846 src_reg offset;
1847
1848 /* Now, load the vector from that offset. */
1849 assert(ir->type->is_vector() || ir->type->is_scalar());
1850
1851 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1852 packed_consts.type = result.type;
1853 src_reg surf_index;
1854
1855 if (const_uniform_block) {
1856 /* The block index is a constant, so just emit the binding table entry
1857 * as an immediate.
1858 */
1859 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1860 const_uniform_block->value.u[0]);
1861 } else {
1862 /* The block index is not a constant. Evaluate the index expression
1863 * per-channel and add the base UBO index; we have to select a value
1864 * from any live channel.
1865 */
1866 surf_index = src_reg(this, glsl_type::uint_type);
1867 emit(ADD(dst_reg(surf_index), op[0],
1868 src_reg(prog_data->base.binding_table.ubo_start)));
1869 surf_index = emit_uniformize(surf_index);
1870
1871 /* Assume this may touch any UBO. It would be nice to provide
1872 * a tighter bound, but the array information is already lowered away.
1873 */
1874 brw_mark_surface_used(&prog_data->base,
1875 prog_data->base.binding_table.ubo_start +
1876 shader_prog->NumBufferInterfaceBlocks - 1);
1877 }
1878
1879 if (const_offset_ir) {
1880 if (devinfo->gen >= 8) {
1881 /* Store the offset in a GRF so we can send-from-GRF. */
1882 offset = src_reg(this, glsl_type::int_type);
1883 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1884 } else {
1885 /* Immediates are fine on older generations since they'll be moved
1886 * to a (potentially fake) MRF at the generator level.
1887 */
1888 offset = src_reg(const_offset / 16);
1889 }
1890 } else {
1891 offset = src_reg(this, glsl_type::uint_type);
1892 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1893 }
1894
1895 emit_pull_constant_load_reg(dst_reg(packed_consts),
1896 surf_index,
1897 offset,
1898 NULL, NULL /* before_block/inst */);
1899
1900 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1901 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1902 const_offset % 16 / 4,
1903 const_offset % 16 / 4,
1904 const_offset % 16 / 4);
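/* E.g. for a float at constant byte offset 20, the pull load fetches the
 * vec4 at offset 16 and the swizzle becomes YYYY to read component 1.
 */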
1905
1906 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1907 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1908 emit(CMP(result_dst, packed_consts, src_reg(0u),
1909 BRW_CONDITIONAL_NZ));
1910 } else {
1911 emit(MOV(result_dst, packed_consts));
1912 }
1913 break;
1914 }
1915
1916 case ir_binop_vector_extract:
1917 unreachable("should have been lowered by vec_index_to_cond_assign");
1918
1919 case ir_triop_fma:
1920 op[0] = fix_3src_operand(op[0]);
1921 op[1] = fix_3src_operand(op[1]);
1922 op[2] = fix_3src_operand(op[2]);
1923 /* Note that the instruction's argument order is reversed from GLSL
1924 * and the IR.
1925 */
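/* MAD computes src0 + src1 * src2, so fma(a, b, c) becomes MAD(dst, c, b, a). */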
1926 emit(MAD(result_dst, op[2], op[1], op[0]));
1927 break;
1928
1929 case ir_triop_lrp:
1930 emit_lrp(result_dst, op[0], op[1], op[2]);
1931 break;
1932
1933 case ir_triop_csel:
1934 unreachable("already handled above");
1935 break;
1936
1937 case ir_triop_bfi:
1938 op[0] = fix_3src_operand(op[0]);
1939 op[1] = fix_3src_operand(op[1]);
1940 op[2] = fix_3src_operand(op[2]);
1941 emit(BFI2(result_dst, op[0], op[1], op[2]));
1942 break;
1943
1944 case ir_triop_bitfield_extract:
1945 op[0] = fix_3src_operand(op[0]);
1946 op[1] = fix_3src_operand(op[1]);
1947 op[2] = fix_3src_operand(op[2]);
1948 /* Note that the instruction's argument order is reversed from GLSL
1949 * and the IR.
1950 */
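/* bitfieldExtract(value, offset, bits) thus becomes BFE(dst, bits, offset, value). */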
1951 emit(BFE(result_dst, op[2], op[1], op[0]));
1952 break;
1953
1954 case ir_triop_vector_insert:
1955 unreachable("should have been lowered by lower_vector_insert");
1956
1957 case ir_quadop_bitfield_insert:
1958 unreachable("not reached: should be handled by "
1959 "bitfield_insert_to_bfm_bfi\n");
1960
1961 case ir_quadop_vector:
1962 unreachable("not reached: should be handled by lower_quadop_vector");
1963
1964 case ir_unop_pack_half_2x16:
1965 emit_pack_half_2x16(result_dst, op[0]);
1966 break;
1967 case ir_unop_unpack_half_2x16:
1968 emit_unpack_half_2x16(result_dst, op[0]);
1969 break;
1970 case ir_unop_unpack_unorm_4x8:
1971 emit_unpack_unorm_4x8(result_dst, op[0]);
1972 break;
1973 case ir_unop_unpack_snorm_4x8:
1974 emit_unpack_snorm_4x8(result_dst, op[0]);
1975 break;
1976 case ir_unop_pack_unorm_4x8:
1977 emit_pack_unorm_4x8(result_dst, op[0]);
1978 break;
1979 case ir_unop_pack_snorm_4x8:
1980 emit_pack_snorm_4x8(result_dst, op[0]);
1981 break;
1982 case ir_unop_pack_snorm_2x16:
1983 case ir_unop_pack_unorm_2x16:
1984 case ir_unop_unpack_snorm_2x16:
1985 case ir_unop_unpack_unorm_2x16:
1986 unreachable("not reached: should be handled by lower_packing_builtins");
1987 case ir_unop_unpack_half_2x16_split_x:
1988 case ir_unop_unpack_half_2x16_split_y:
1989 case ir_binop_pack_half_2x16_split:
1990 case ir_unop_interpolate_at_centroid:
1991 case ir_binop_interpolate_at_sample:
1992 case ir_binop_interpolate_at_offset:
1993 unreachable("not reached: should not occur in vertex shader");
1994 case ir_binop_ldexp:
1995 unreachable("not reached: should be handled by ldexp_to_arith()");
1996 case ir_unop_d2f:
1997 case ir_unop_f2d:
1998 case ir_unop_d2i:
1999 case ir_unop_i2d:
2000 case ir_unop_d2u:
2001 case ir_unop_u2d:
2002 case ir_unop_d2b:
2003 case ir_unop_pack_double_2x32:
2004 case ir_unop_unpack_double_2x32:
2005 case ir_unop_frexp_sig:
2006 case ir_unop_frexp_exp:
2007 unreachable("fp64 todo");
2008 }
2009 }
2010
2011
2012 void
2013 vec4_visitor::visit(ir_swizzle *ir)
2014 {
2015 /* Note that this only handles swizzles in expressions, not those on the
2016 * left-hand side of an assignment, which use write masking. See
2017 * ir_assignment for that.
2018 */
2019 const unsigned swz = brw_compose_swizzle(
2020 brw_swizzle_for_size(ir->type->vector_elements),
2021 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2022
2023 ir->val->accept(this);
2024 this->result = swizzle(this->result, swz);
2025 }
2026
2027 void
2028 vec4_visitor::visit(ir_dereference_variable *ir)
2029 {
2030 const struct glsl_type *type = ir->type;
2031 dst_reg *reg = variable_storage(ir->var);
2032
2033 if (!reg) {
2034 fail("Failed to find variable storage for %s\n", ir->var->name);
2035 this->result = src_reg(brw_null_reg());
2036 return;
2037 }
2038
2039 this->result = src_reg(*reg);
2040
2041 /* System values get their swizzle from the dst_reg writemask */
2042 if (ir->var->data.mode == ir_var_system_value)
2043 return;
2044
2045 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2046 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2047 }
2048
2049
2050 int
2051 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2052 {
2053 /* Under normal circumstances array elements are stored consecutively, so
2054 * the stride is equal to the size of the array element.
2055 */
2056 return type_size_vec4(ir->type);
2057 }
2058
2059
2060 void
2061 vec4_visitor::visit(ir_dereference_array *ir)
2062 {
2063 ir_constant *constant_index;
2064 src_reg src;
2065 int array_stride = compute_array_stride(ir);
2066
2067 constant_index = ir->array_index->constant_expression_value();
2068
2069 ir->array->accept(this);
2070 src = this->result;
2071
2072 if (constant_index) {
2073 src.reg_offset += constant_index->value.i[0] * array_stride;
2074 } else {
2075 /* Variable index array dereference. It takes the "vec4" register
2076 * of the base of the array and an index register that offsets the
2077 * register index.
2078 */
2079 ir->array_index->accept(this);
2080
2081 src_reg index_reg;
2082
2083 if (array_stride == 1) {
2084 index_reg = this->result;
2085 } else {
2086 index_reg = src_reg(this, glsl_type::int_type);
2087
2088 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2089 }
2090
2091 if (src.reladdr) {
2092 src_reg temp = src_reg(this, glsl_type::int_type);
2093
2094 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2095
2096 index_reg = temp;
2097 }
2098
2099 src.reladdr = ralloc(mem_ctx, src_reg);
2100 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2101 }
2102
2103 /* If the type is smaller than a vec4, replicate the last channel out. */
2104 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2105 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2106 else
2107 src.swizzle = BRW_SWIZZLE_NOOP;
2108 src.type = brw_type_for_base_type(ir->type);
2109
2110 this->result = src;
2111 }
2112
2113 void
2114 vec4_visitor::visit(ir_dereference_record *ir)
2115 {
2116 unsigned int i;
2117 const glsl_type *struct_type = ir->record->type;
2118 int offset = 0;
2119
2120 ir->record->accept(this);
2121
2122 for (i = 0; i < struct_type->length; i++) {
2123 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2124 break;
2125 offset += type_size_vec4(struct_type->fields.structure[i].type);
2126 }
2127
2128 /* If the type is smaller than a vec4, replicate the last channel out. */
2129 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2130 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2131 else
2132 this->result.swizzle = BRW_SWIZZLE_NOOP;
2133 this->result.type = brw_type_for_base_type(ir->type);
2134
2135 this->result.reg_offset += offset;
2136 }
2137
2138 /**
2139 * We want to be careful in assignment setup to hit the actual storage
2140 * instead of potentially using a temporary like we might with the
2141 * ir_dereference handler.
2142 */
2143 static dst_reg
2144 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2145 {
2146 /* The LHS must be a dereference. If the LHS is a variable indexed array
2147 * access of a vector, it must be separated into a series of conditional moves
2148 * before reaching this point (see ir_vec_index_to_cond_assign).
2149 */
2150 assert(ir->as_dereference());
2151 ir_dereference_array *deref_array = ir->as_dereference_array();
2152 if (deref_array) {
2153 assert(!deref_array->array->type->is_vector());
2154 }
2155
2156 /* Use the rvalue deref handler for the most part. We'll ignore
2157 * swizzles in it and write swizzles using writemask, though.
2158 */
2159 ir->accept(v);
2160 return dst_reg(v->result);
2161 }
2162
2163 void
2164 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2165 const struct glsl_type *type,
2166 enum brw_predicate predicate)
2167 {
2168 if (type->base_type == GLSL_TYPE_STRUCT) {
2169 for (unsigned int i = 0; i < type->length; i++) {
2170 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2171 }
2172 return;
2173 }
2174
2175 if (type->is_array()) {
2176 for (unsigned int i = 0; i < type->length; i++) {
2177 emit_block_move(dst, src, type->fields.array, predicate);
2178 }
2179 return;
2180 }
2181
2182 if (type->is_matrix()) {
2183 const struct glsl_type *vec_type;
2184
2185 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2186 type->vector_elements, 1);
2187
2188 for (int i = 0; i < type->matrix_columns; i++) {
2189 emit_block_move(dst, src, vec_type, predicate);
2190 }
2191 return;
2192 }
2193
2194 assert(type->is_scalar() || type->is_vector());
2195
2196 dst->type = brw_type_for_base_type(type);
2197 src->type = dst->type;
2198
2199 dst->writemask = (1 << type->vector_elements) - 1;
2200
2201 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2202
2203 vec4_instruction *inst = emit(MOV(*dst, *src));
2204 inst->predicate = predicate;
2205
2206 dst->reg_offset++;
2207 src->reg_offset++;
2208 }
2209
2210
2211 /* If the RHS processing resulted in an instruction generating a
2212 * temporary value, and it would be easy to rewrite the instruction to
2213 * generate its result right into the LHS instead, do so. This ends
2214 * up reliably removing instructions where it can be tricky to do so
2215 * later without real UD chain information.
2216 */
2217 bool
2218 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2219 dst_reg dst,
2220 src_reg src,
2221 vec4_instruction *pre_rhs_inst,
2222 vec4_instruction *last_rhs_inst)
2223 {
2224 /* This could be supported, but it would take more smarts. */
2225 if (ir->condition)
2226 return false;
2227
2228 if (pre_rhs_inst == last_rhs_inst)
2229 return false; /* No instructions generated to work with. */
2230
2231 /* Make sure the last instruction generated our source reg. */
2232 if (src.file != GRF ||
2233 src.file != last_rhs_inst->dst.file ||
2234 src.reg != last_rhs_inst->dst.reg ||
2235 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2236 src.reladdr ||
2237 src.abs ||
2238 src.negate ||
2239 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2240 return false;
2241
2242 /* Check that the last instruction fully initialized the channels
2243 * we want to use, in the order we want to use them. We could
2244 * potentially reswizzle the operands of many instructions so that
2245 * we could handle out of order channels, but don't yet.
2246 */
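/* For example, if the swizzled RHS reads channel y into destination
 * channel x (swizzle .yx on an .xy write), the channels are out of order
 * and the rewrite is skipped.
 */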
2247
2248 for (unsigned i = 0; i < 4; i++) {
2249 if (dst.writemask & (1 << i)) {
2250 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2251 return false;
2252
2253 if (BRW_GET_SWZ(src.swizzle, i) != i)
2254 return false;
2255 }
2256 }
2257
2258 /* Success! Rewrite the instruction. */
2259 last_rhs_inst->dst.file = dst.file;
2260 last_rhs_inst->dst.reg = dst.reg;
2261 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2262 last_rhs_inst->dst.reladdr = dst.reladdr;
2263 last_rhs_inst->dst.writemask &= dst.writemask;
2264
2265 return true;
2266 }
2267
2268 void
2269 vec4_visitor::visit(ir_assignment *ir)
2270 {
2271 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2272 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2273
2274 if (!ir->lhs->type->is_scalar() &&
2275 !ir->lhs->type->is_vector()) {
2276 ir->rhs->accept(this);
2277 src_reg src = this->result;
2278
2279 if (ir->condition) {
2280 emit_bool_to_cond_code(ir->condition, &predicate);
2281 }
2282
2283 /* emit_block_move doesn't account for swizzles in the source register.
2284 * This should be ok, since the source register is a structure or an
2285 * array, and those can't be swizzled. But double-check to be sure.
2286 */
2287 assert(src.swizzle ==
2288 (ir->rhs->type->is_matrix()
2289 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2290 : BRW_SWIZZLE_NOOP));
2291
2292 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2293 return;
2294 }
2295
2296 /* Now we're down to just a scalar/vector with writemasks. */
2297 int i;
2298
2299 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2300 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2301
2302 ir->rhs->accept(this);
2303
2304 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2305
2306 int swizzles[4];
2307 int src_chan = 0;
2308
2309 assert(ir->lhs->type->is_vector() ||
2310 ir->lhs->type->is_scalar());
2311 dst.writemask = ir->write_mask;
2312
2313 /* Swizzle a small RHS vector into the channels being written.
2314 *
2315 * glsl ir treats write_mask as dictating how many channels are
2316 * present on the RHS while in our instructions we need to make
2317 * those channels appear in the slots of the vec4 they're written to.
2318 */
2319 for (int i = 0; i < 4; i++)
2320 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2321
2322 src_reg src = swizzle(this->result,
2323 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2324 swizzles[2], swizzles[3]));
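/* E.g. with write_mask = .xz the swizzles become (0, 0, 1, 0): RHS
 * channel 0 lands in x and RHS channel 1 lands in z; the unwritten y and
 * w channels are don't-cares.
 */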
2325
2326 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2327 return;
2328 }
2329
2330 if (ir->condition) {
2331 emit_bool_to_cond_code(ir->condition, &predicate);
2332 }
2333
2334 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2335 vec4_instruction *inst = emit(MOV(dst, src));
2336 inst->predicate = predicate;
2337
2338 dst.reg_offset++;
2339 src.reg_offset++;
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2345 {
2346 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2347 foreach_in_list(ir_constant, field_value, &ir->components) {
2348 emit_constant_values(dst, field_value);
2349 }
2350 return;
2351 }
2352
2353 if (ir->type->is_array()) {
2354 for (unsigned int i = 0; i < ir->type->length; i++) {
2355 emit_constant_values(dst, ir->array_elements[i]);
2356 }
2357 return;
2358 }
2359
2360 if (ir->type->is_matrix()) {
2361 for (int i = 0; i < ir->type->matrix_columns; i++) {
2362 float *vec = &ir->value.f[i * ir->type->vector_elements];
2363
2364 for (int j = 0; j < ir->type->vector_elements; j++) {
2365 dst->writemask = 1 << j;
2366 dst->type = BRW_REGISTER_TYPE_F;
2367
2368 emit(MOV(*dst, src_reg(vec[j])));
2369 }
2370 dst->reg_offset++;
2371 }
2372 return;
2373 }
2374
2375 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2376
2377 for (int i = 0; i < ir->type->vector_elements; i++) {
2378 if (!(remaining_writemask & (1 << i)))
2379 continue;
2380
2381 dst->writemask = 1 << i;
2382 dst->type = brw_type_for_base_type(ir->type);
2383
2384 /* Find other components that match the one we're about to
2385 * write. Emits fewer instructions for things like vec4(0.5,
2386 * 1.5, 1.5, 1.5).
2387 */
2388 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2389 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2390 if (ir->value.b[i] == ir->value.b[j])
2391 dst->writemask |= (1 << j);
2392 } else {
2393 /* u, i, and f storage all line up, so no need for a
2394 * switch case for comparing each type.
2395 */
2396 if (ir->value.u[i] == ir->value.u[j])
2397 dst->writemask |= (1 << j);
2398 }
2399 }
2400
2401 switch (ir->type->base_type) {
2402 case GLSL_TYPE_FLOAT:
2403 emit(MOV(*dst, src_reg(ir->value.f[i])));
2404 break;
2405 case GLSL_TYPE_INT:
2406 emit(MOV(*dst, src_reg(ir->value.i[i])));
2407 break;
2408 case GLSL_TYPE_UINT:
2409 emit(MOV(*dst, src_reg(ir->value.u[i])));
2410 break;
2411 case GLSL_TYPE_BOOL:
2412 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2413 break;
2414 default:
2415 unreachable("Non-float/uint/int/bool constant");
2416 }
2417
2418 remaining_writemask &= ~dst->writemask;
2419 }
2420 dst->reg_offset++;
2421 }
2422
2423 void
2424 vec4_visitor::visit(ir_constant *ir)
2425 {
2426 dst_reg dst = dst_reg(this, ir->type);
2427 this->result = src_reg(dst);
2428
2429 emit_constant_values(&dst, ir);
2430 }
2431
2432 void
2433 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2434 {
2435 ir_dereference *deref = static_cast<ir_dereference *>(
2436 ir->actual_parameters.get_head());
2437 ir_variable *location = deref->variable_referenced();
2438 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2439 location->data.binding);
2440
2441 /* Calculate the surface offset */
2442 src_reg offset(this, glsl_type::uint_type);
2443 ir_dereference_array *deref_array = deref->as_dereference_array();
2444 if (deref_array) {
2445 deref_array->array_index->accept(this);
2446
2447 src_reg tmp(this, glsl_type::uint_type);
2448 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2449 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2450 } else {
2451 offset = location->data.atomic.offset;
2452 }
2453
2454 /* Emit the appropriate machine instruction */
2455 const char *callee = ir->callee->function_name();
2456 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2457
2458 if (!strcmp("__intrinsic_atomic_read", callee)) {
2459 emit_untyped_surface_read(surf_index, dst, offset);
2460
2461 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2462 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2463 src_reg(), src_reg());
2464
2465 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2466 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2467 src_reg(), src_reg());
2468 }
2469
2470 brw_mark_surface_used(stage_prog_data, surf_index);
2471 }
2472
2473 void
2474 vec4_visitor::visit(ir_call *ir)
2475 {
2476 const char *callee = ir->callee->function_name();
2477
2478 if (!strcmp("__intrinsic_atomic_read", callee) ||
2479 !strcmp("__intrinsic_atomic_increment", callee) ||
2480 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2481 visit_atomic_counter_intrinsic(ir);
2482 } else {
2483 unreachable("Unsupported intrinsic.");
2484 }
2485 }
2486
2487 src_reg
2488 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2489 src_reg coordinate, src_reg sampler)
2490 {
2491 vec4_instruction *inst =
2492 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2493 dst_reg(this, glsl_type::uvec4_type));
2494 inst->base_mrf = 2;
2495 inst->src[1] = sampler;
2496
2497 int param_base;
2498
2499 if (devinfo->gen >= 9) {
2500 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2501 vec4_instruction *header_inst = new(mem_ctx)
2502 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2503 dst_reg(MRF, inst->base_mrf));
2504
2505 emit(header_inst);
2506
2507 inst->mlen = 2;
2508 inst->header_size = 1;
2509 param_base = inst->base_mrf + 1;
2510 } else {
2511 inst->mlen = 1;
2512 param_base = inst->base_mrf;
2513 }
2514
2515 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2516 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2517 int zero_mask = 0xf & ~coord_mask;
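/* E.g. a 2D coordinate gives coord_mask = XY and zero_mask = ZW, so the
 * unused r and lod slots are written as zero below.
 */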
2518
2519 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2520 coordinate));
2521
2522 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2523 src_reg(0)));
2524
2525 emit(inst);
2526 return src_reg(inst->dst);
2527 }
2528
2529 bool
2530 vec4_visitor::is_high_sampler(src_reg sampler)
2531 {
2532 if (devinfo->gen < 8 && !devinfo->is_haswell)
2533 return false;
2534
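/* Sampler indices that don't fit in the 4-bit descriptor field (16 and
 * up), or any dynamically computed index, count as "high" and require
 * the message header; see the header_size logic in emit_texture().
 */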
2535 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2536 }
2537
2538 void
2539 vec4_visitor::emit_texture(ir_texture_opcode op,
2540 dst_reg dest,
2541 const glsl_type *dest_type,
2542 src_reg coordinate,
2543 int coord_components,
2544 src_reg shadow_comparitor,
2545 src_reg lod, src_reg lod2,
2546 src_reg sample_index,
2547 uint32_t constant_offset,
2548 src_reg offset_value,
2549 src_reg mcs,
2550 bool is_cube_array,
2551 uint32_t sampler,
2552 src_reg sampler_reg)
2553 {
2554 enum opcode opcode;
2555 switch (op) {
2556 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2557 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2558 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2559 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2560 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2561 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2562 case ir_tg4: opcode = offset_value.file != BAD_FILE
2563 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2564 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2565 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2566 case ir_txb:
2567 unreachable("TXB is not valid for vertex shaders.");
2568 case ir_lod:
2569 unreachable("LOD is not valid for vertex shaders.");
2570 default:
2571 unreachable("Unrecognized tex op");
2572 }
2573
2574 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2575 opcode, dst_reg(this, dest_type));
2576
2577 inst->offset = constant_offset;
2578
2579 /* The message header is necessary for:
2580 * - Gen4 (always)
2581 * - Gen9+ for selecting SIMD4x2
2582 * - Texel offsets
2583 * - Gather channel selection
2584 * - Sampler indices too large to fit in a 4-bit value.
2585 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2586 */
2587 inst->header_size =
2588 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2589 inst->offset != 0 || op == ir_tg4 ||
2590 op == ir_texture_samples ||
2591 is_high_sampler(sampler_reg)) ? 1 : 0;
2592 inst->base_mrf = 2;
2593 inst->mlen = inst->header_size;
2594 inst->dst.writemask = WRITEMASK_XYZW;
2595 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2596
2597 inst->src[1] = sampler_reg;
2598
2599 /* MRF for the first parameter */
2600 int param_base = inst->base_mrf + inst->header_size;
2601
2602 if (op == ir_txs || op == ir_query_levels) {
2603 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2604 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2605 inst->mlen++;
2606 } else if (op == ir_texture_samples) {
2607 inst->dst.writemask = WRITEMASK_X;
2608 } else {
2609 /* Load the coordinate */
2610 /* FINISHME: gl_clamp_mask and saturate */
2611 int coord_mask = (1 << coord_components) - 1;
2612 int zero_mask = 0xf & ~coord_mask;
2613
2614 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2615 coordinate));
2616 inst->mlen++;
2617
2618 if (zero_mask != 0) {
2619 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2620 src_reg(0)));
2621 }
2622 /* Load the shadow comparitor */
2623 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2624 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2625 WRITEMASK_X),
2626 shadow_comparitor));
2627 inst->mlen++;
2628 }
2629
2630 /* Load the LOD info */
2631 if (op == ir_tex || op == ir_txl) {
2632 int mrf, writemask;
2633 if (devinfo->gen >= 5) {
2634 mrf = param_base + 1;
2635 if (shadow_comparitor.file != BAD_FILE) {
2636 writemask = WRITEMASK_Y;
2637 /* mlen already incremented */
2638 } else {
2639 writemask = WRITEMASK_X;
2640 inst->mlen++;
2641 }
2642 } else /* devinfo->gen == 4 */ {
2643 mrf = param_base;
2644 writemask = WRITEMASK_W;
2645 }
2646 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2647 } else if (op == ir_txf) {
2648 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2649 } else if (op == ir_txf_ms) {
2650 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2651 sample_index));
2652 if (devinfo->gen >= 7) {
2653 /* MCS data is in the first channel of `mcs`, but we need to get it into
2654 * the .y channel of the second vec4 of params, so replicate .x across
2655 * the whole vec4 and then mask off everything except .y
2656 */
2657 mcs.swizzle = BRW_SWIZZLE_XXXX;
2658 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2659 mcs));
2660 }
2661 inst->mlen++;
2662 } else if (op == ir_txd) {
2663 const brw_reg_type type = lod.type;
2664
2665 if (devinfo->gen >= 5) {
2666 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2667 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2669 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2670 inst->mlen++;
2671
2672 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2673 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2674 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2675 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2676 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2677 inst->mlen++;
2678
2679 if (shadow_comparitor.file != BAD_FILE) {
2680 emit(MOV(dst_reg(MRF, param_base + 2,
2681 shadow_comparitor.type, WRITEMASK_Z),
2682 shadow_comparitor));
2683 }
2684 }
2685 } else /* devinfo->gen == 4 */ {
2686 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2687 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2688 inst->mlen += 2;
2689 }
2690 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2691 if (shadow_comparitor.file != BAD_FILE) {
2692 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2693 shadow_comparitor));
2694 }
2695
2696 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2697 offset_value));
2698 inst->mlen++;
2699 }
2700 }
2701
2702 emit(inst);
2703
2704 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2705 * spec requires layers.
2706 */
2707 if (op == ir_txs && is_cube_array) {
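/* E.g. a cube array with 4 layers reports 24 (6 faces * 4 layers) in .z;
 * dividing by 6 recovers the 4 layers the GL spec expects.
 */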
2708 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2709 writemask(inst->dst, WRITEMASK_Z),
2710 src_reg(inst->dst), src_reg(6));
2711 }
2712
2713 if (devinfo->gen == 6 && op == ir_tg4) {
2714 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2715 }
2716
2717 swizzle_result(op, dest,
2718 src_reg(inst->dst), sampler, dest_type);
2719 }
2720
2721 void
2722 vec4_visitor::visit(ir_texture *ir)
2723 {
2724 uint32_t sampler =
2725 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2726
2727 ir_rvalue *nonconst_sampler_index =
2728 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2729
2730 /* Handle non-constant sampler array indexing */
2731 src_reg sampler_reg;
2732 if (nonconst_sampler_index) {
2733 /* The highest sampler which may be used by this operation is
2734 * the last element of the array. Mark it here, because the generator
2735 * doesn't have enough information to determine the bound.
2736 */
2737 uint32_t array_size = ir->sampler->as_dereference_array()
2738 ->array->type->array_size();
2739
2740 uint32_t max_used = sampler + array_size - 1;
2741 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2742 max_used += prog_data->base.binding_table.gather_texture_start;
2743 } else {
2744 max_used += prog_data->base.binding_table.texture_start;
2745 }
2746
2747 brw_mark_surface_used(&prog_data->base, max_used);
2748
2749 /* Emit code to evaluate the actual indexing expression */
2750 nonconst_sampler_index->accept(this);
2751 src_reg temp(this, glsl_type::uint_type);
2752 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2753 sampler_reg = emit_uniformize(temp);
2754 } else {
2755 /* Single sampler, or constant array index; the indexing expression
2756 * is just an immediate.
2757 */
2758 sampler_reg = src_reg(sampler);
2759 }
2760
2761 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2762 * emitting anything other than setting up the constant result.
2763 */
2764 if (ir->op == ir_tg4) {
2765 ir_constant *chan = ir->lod_info.component->as_constant();
2766 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2767 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2768 dst_reg result(this, ir->type);
2769 this->result = src_reg(result);
2770 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2771 return;
2772 }
2773 }
2774
2775 /* Should be lowered by do_lower_texture_projection */
2776 assert(!ir->projector);
2777
2778 /* Should be lowered */
2779 assert(!ir->offset || !ir->offset->type->is_array());
2780
2781 /* Generate code to compute all the subexpression trees. This has to be
2782 * done before loading any values into MRFs for the sampler message since
2783 * generating these values may involve SEND messages that need the MRFs.
2784 */
2785 src_reg coordinate;
2786 int coord_components = 0;
2787 if (ir->coordinate) {
2788 coord_components = ir->coordinate->type->vector_elements;
2789 ir->coordinate->accept(this);
2790 coordinate = this->result;
2791 }
2792
2793 src_reg shadow_comparitor;
2794 if (ir->shadow_comparitor) {
2795 ir->shadow_comparitor->accept(this);
2796 shadow_comparitor = this->result;
2797 }
2798
2799 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2800 src_reg offset_value;
2801 if (has_nonconstant_offset) {
2802 ir->offset->accept(this);
2803 offset_value = src_reg(this->result);
2804 }
2805
2806 src_reg lod, lod2, sample_index, mcs;
2807 switch (ir->op) {
2808 case ir_tex:
2809 lod = src_reg(0.0f);
2810 break;
2811 case ir_txf:
2812 case ir_txl:
2813 case ir_txs:
2814 ir->lod_info.lod->accept(this);
2815 lod = this->result;
2816 break;
2817 case ir_query_levels:
2818 lod = src_reg(0);
2819 break;
2820 case ir_txf_ms:
2821 ir->lod_info.sample_index->accept(this);
2822 sample_index = this->result;
2823
2824 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2825 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2826 else
2827 mcs = src_reg(0u);
2828 break;
2829 case ir_txd:
2830 ir->lod_info.grad.dPdx->accept(this);
2831 lod = this->result;
2832
2833 ir->lod_info.grad.dPdy->accept(this);
2834 lod2 = this->result;
2835 break;
2836 case ir_txb:
2837 case ir_lod:
2838 case ir_tg4:
2839 case ir_texture_samples:
2840 break;
2841 }
2842
2843 uint32_t constant_offset = 0;
2844 if (ir->offset != NULL && !has_nonconstant_offset) {
2845 constant_offset =
2846 brw_texture_offset(ir->offset->as_constant()->value.i,
2847 ir->offset->type->vector_elements);
2848 }
2849
2850 /* Stuff the channel select bits in the top of the texture offset */
2851 if (ir->op == ir_tg4)
2852 constant_offset |=
2853 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2854 sampler) << 16;
2855
2856 glsl_type const *type = ir->sampler->type;
2857 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2858 type->sampler_array;
2859
2860 this->result = src_reg(this, ir->type);
2861 dst_reg dest = dst_reg(this->result);
2862
2863 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2864 shadow_comparitor,
2865 lod, lod2, sample_index,
2866 constant_offset, offset_value,
2867 mcs, is_cube_array, sampler, sampler_reg);
2868 }
2869
2870 /**
2871 * Apply workarounds for Gen6 gather with UINT/SINT
2872 */
2873 void
2874 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2875 {
2876 if (!wa)
2877 return;
2878
2879 int width = (wa & WA_8BIT) ? 8 : 16;
2880 dst_reg dst_f = dst;
2881 dst_f.type = BRW_REGISTER_TYPE_F;
2882
2883 /* Convert from UNORM to UINT */
2884 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2885 emit(MOV(dst, src_reg(dst_f)));
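/* E.g. an 8-bit value v is returned as v / 255.0; multiplying by 255 and
 * converting back to an integer recovers v.
 */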
2886
2887 if (wa & WA_SIGN) {
2888 /* Reinterpret the UINT value as a signed INT value by
2889 * shifting the sign bit into place, then shifting back
2890 * preserving sign.
2891 */
2892 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2893 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2894 }
2895 }
2896
2897 /**
2898 * Set up the gather channel based on the swizzle, for gather4.
2899 */
2900 uint32_t
2901 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2902 {
2903 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2904 switch (swiz) {
2905 case SWIZZLE_X: return 0;
2906 case SWIZZLE_Y:
2907 /* gather4 sampler is broken for green channel on RG32F --
2908 * we must ask for blue instead.
2909 */
2910 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2911 return 2;
2912 return 1;
2913 case SWIZZLE_Z: return 2;
2914 case SWIZZLE_W: return 3;
2915 default:
2916 unreachable("Not reached"); /* zero, one swizzles handled already */
2917 }
2918 }
2919
2920 void
2921 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2922 src_reg orig_val, uint32_t sampler,
2923 const glsl_type *dest_type)
2924 {
2925 int s = key_tex->swizzles[sampler];
2926
2927 dst_reg swizzled_result = dest;
2928
2929 if (op == ir_query_levels) {
2930 /* # levels is in .w */
2931 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2932 emit(MOV(swizzled_result, orig_val));
2933 return;
2934 }
2935
2936 if (op == ir_txs || dest_type == glsl_type::float_type
2937 || s == SWIZZLE_NOOP || op == ir_tg4) {
2938 emit(MOV(swizzled_result, orig_val));
2939 return;
2940 }
2941
2942
2943 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2944 int swizzle[4] = {0};
2945
2946 for (int i = 0; i < 4; i++) {
2947 switch (GET_SWZ(s, i)) {
2948 case SWIZZLE_ZERO:
2949 zero_mask |= (1 << i);
2950 break;
2951 case SWIZZLE_ONE:
2952 one_mask |= (1 << i);
2953 break;
2954 default:
2955 copy_mask |= (1 << i);
2956 swizzle[i] = GET_SWZ(s, i);
2957 break;
2958 }
2959 }
2960
2961 if (copy_mask) {
2962 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2963 swizzled_result.writemask = copy_mask;
2964 emit(MOV(swizzled_result, orig_val));
2965 }
2966
2967 if (zero_mask) {
2968 swizzled_result.writemask = zero_mask;
2969 emit(MOV(swizzled_result, src_reg(0.0f)));
2970 }
2971
2972 if (one_mask) {
2973 swizzled_result.writemask = one_mask;
2974 emit(MOV(swizzled_result, src_reg(1.0f)));
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::visit(ir_return *)
2980 {
2981 unreachable("not reached");
2982 }
2983
2984 void
2985 vec4_visitor::visit(ir_discard *)
2986 {
2987 unreachable("not reached");
2988 }
2989
2990 void
2991 vec4_visitor::visit(ir_if *ir)
2992 {
2993 /* Don't point the annotation at the if statement, because then it plus
2994 * the then and else blocks get printed.
2995 */
2996 this->base_ir = ir->condition;
2997
2998 if (devinfo->gen == 6) {
2999 emit_if_gen6(ir);
3000 } else {
3001 enum brw_predicate predicate;
3002 emit_bool_to_cond_code(ir->condition, &predicate);
3003 emit(IF(predicate));
3004 }
3005
3006 visit_instructions(&ir->then_instructions);
3007
3008 if (!ir->else_instructions.is_empty()) {
3009 this->base_ir = ir->condition;
3010 emit(BRW_OPCODE_ELSE);
3011
3012 visit_instructions(&ir->else_instructions);
3013 }
3014
3015 this->base_ir = ir->condition;
3016 emit(BRW_OPCODE_ENDIF);
3017 }
3018
3019 void
3020 vec4_visitor::gs_emit_vertex(int stream_id)
3021 {
3022 unreachable("not reached");
3023 }
3024
3025 void
3026 vec4_visitor::visit(ir_emit_vertex *)
3027 {
3028 unreachable("not reached");
3029 }
3030
3031 void
3032 vec4_visitor::gs_end_primitive()
3033 {
3034 unreachable("not reached");
3035 }
3036
3037
3038 void
3039 vec4_visitor::visit(ir_end_primitive *)
3040 {
3041 unreachable("not reached");
3042 }
3043
3044 void
3045 vec4_visitor::visit(ir_barrier *)
3046 {
3047 unreachable("not reached");
3048 }
3049
3050 void
3051 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3052 dst_reg dst, src_reg offset,
3053 src_reg src0, src_reg src1)
3054 {
3055 unsigned mlen = 0;
3056
3057 /* Set the atomic operation offset. */
3058 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3059 mlen++;
3060
3061 /* Set the atomic operation arguments. */
3062 if (src0.file != BAD_FILE) {
3063 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3064 mlen++;
3065 }
3066
3067 if (src1.file != BAD_FILE) {
3068 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3069 mlen++;
3070 }
3071
3072 /* Emit the instruction. Note that this maps to the normal SIMD8
3073 * untyped atomic message on Ivy Bridge, but that's OK because
3074 * unused channels will be masked out.
3075 */
3076 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3077 brw_message_reg(0),
3078 src_reg(surf_index), src_reg(atomic_op));
3079 inst->mlen = mlen;
3080 }
3081
3082 void
3083 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3084 src_reg offset)
3085 {
3086 /* Set the surface read offset. */
3087 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3088
3089 /* Emit the instruction. Note that this maps to the normal SIMD8
3090 * untyped surface read message, but that's OK because unused
3091 * channels will be masked out.
3092 */
3093 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3094 brw_message_reg(0),
3095 src_reg(surf_index), src_reg(1));
3096 inst->mlen = 1;
3097 }
3098
3099 void
3100 vec4_visitor::emit_ndc_computation()
3101 {
3102 /* Get the position */
3103 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3104
3105 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3106 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3107 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3108
3109 current_annotation = "NDC";
3110 dst_reg ndc_w = ndc;
3111 ndc_w.writemask = WRITEMASK_W;
3112 src_reg pos_w = pos;
3113 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3114 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3115
3116 dst_reg ndc_xyz = ndc;
3117 ndc_xyz.writemask = WRITEMASK_XYZ;
3118
3119 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3120 }
3121
3122 void
3123 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3124 {
3125 if (devinfo->gen < 6 &&
3126 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3127 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3128 devinfo->has_negative_rhw_bug)) {
3129 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3130 dst_reg header1_w = header1;
3131 header1_w.writemask = WRITEMASK_W;
3132
3133 emit(MOV(header1, 0u));
3134
3135 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3136 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3137
3138 current_annotation = "Point size";
3139 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3140 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
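/* i.e. scale the float point size by 2^11 and keep the 11-bit field at
 * bits 8..18 of the header word.
 */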
3141 }
3142
3143 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3144 current_annotation = "Clipping flags";
3145 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3146 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3147
3148 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3149 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3150 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3151
3152 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3153 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3154 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3155 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3156 }
3157
3158 /* i965 clipping workaround:
3159 * 1) Test for -ve rhw
3160 * 2) If set,
3161 * set ndc = (0,0,0,0)
3162 * set ucp[6] = 1
3163 *
3164 * Later, clipping will detect ucp[6] and ensure the primitive is
3165 * clipped against all fixed planes.
3166 */
3167 if (devinfo->has_negative_rhw_bug) {
3168 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3169 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3170 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3171 vec4_instruction *inst;
3172 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3173 inst->predicate = BRW_PREDICATE_NORMAL;
3174 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3175 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3176 inst->predicate = BRW_PREDICATE_NORMAL;
3177 }
3178
3179 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3180 } else if (devinfo->gen < 6) {
3181 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3182 } else {
3183 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3184 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3185 dst_reg reg_w = reg;
3186 reg_w.writemask = WRITEMASK_W;
3187 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3188 reg_as_src.type = reg_w.type;
3189 reg_as_src.swizzle = brw_swizzle_for_size(1);
3190 emit(MOV(reg_w, reg_as_src));
3191 }
3192 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3193 dst_reg reg_y = reg;
3194 reg_y.writemask = WRITEMASK_Y;
3195 reg_y.type = BRW_REGISTER_TYPE_D;
3196 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3197 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3198 }
3199 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3200 dst_reg reg_z = reg;
3201 reg_z.writemask = WRITEMASK_Z;
3202 reg_z.type = BRW_REGISTER_TYPE_D;
3203 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3204 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3205 }
3206 }
3207 }
3208
3209 vec4_instruction *
3210 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3211 {
3212 assert(varying < VARYING_SLOT_MAX);
3213 assert(output_reg[varying].type == reg.type);
3214 current_annotation = output_reg_annotation[varying];
3215 /* Copy the register, saturating if necessary */
3216 return emit(MOV(reg, src_reg(output_reg[varying])));
3217 }
3218
3219 void
3220 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3221 {
3222 reg.type = BRW_REGISTER_TYPE_F;
3223 output_reg[varying].type = reg.type;
3224
3225 switch (varying) {
3226 case VARYING_SLOT_PSIZ:
3227 {
3228 /* PSIZ is always in slot 0, and is coupled with other flags. */
3229 current_annotation = "indices, point width, clip flags";
3230 emit_psiz_and_flags(reg);
3231 break;
3232 }
3233 case BRW_VARYING_SLOT_NDC:
3234 current_annotation = "NDC";
3235 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3236 break;
3237 case VARYING_SLOT_POS:
3238 current_annotation = "gl_Position";
3239 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3240 break;
3241 case VARYING_SLOT_EDGE:
3242 /* This is present when doing unfilled polygons. We're supposed to copy
3243 * the edge flag from the user-provided vertex array
3244 * (glEdgeFlagPointer); otherwise we copy from the current value
3245 * of that attribute (starts as 1.0f). This is then used in clipping to
3246 * determine which edges should be drawn as wireframe.
3247 */
3248 current_annotation = "edge flag";
3249 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3250 glsl_type::float_type, WRITEMASK_XYZW))));
3251 break;
3252 case BRW_VARYING_SLOT_PAD:
3253 /* No need to write to this slot */
3254 break;
3255 default:
3256 emit_generic_urb_slot(reg, varying);
3257 break;
3258 }
3259 }
3260
3261 static int
3262 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3263 {
3264 if (devinfo->gen >= 6) {
3265 /* URB data written (does not include the message header reg) must
3266 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3267 * section 5.4.3.2.2: URB_INTERLEAVED.
3268 *
3269 * URB entries are allocated on a multiple of 1024 bits, so an
3270 * extra 128 bits written here to make the end align to 256 is
3271 * no problem.
3272 */
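/* mlen here includes the one-register message header, so forcing it odd
 * keeps the URB data itself an even number of registers. E.g. a header
 * plus three data registers (mlen == 4) is padded to mlen == 5.
 */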
3273 if ((mlen % 2) != 1)
3274 mlen++;
3275 }
3276
3277 return mlen;
3278 }
3279
3280
3281 /**
3282 * Generates the VUE payload plus the necessary URB write instructions to
3283 * output it.
3284 *
3285 * The VUE layout is documented in Volume 2a.
3286 */
3287 void
3288 vec4_visitor::emit_vertex()
3289 {
3290 /* MRF 0 is reserved for the debugger, so start with message header
3291 * in MRF 1.
3292 */
3293 int base_mrf = 1;
3294 int mrf = base_mrf;
3295 /* In the process of generating our URB write message contents, we
3296 * may need to unspill a register or load from an array. Those
3297 * reads would use MRFs 14-15.
3298 */
3299 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3300
3301 /* The following assertion verifies that max_usable_mrf causes an
3302 * even-numbered amount of URB write data, which will meet gen6's
3303 * requirements for length alignment.
3304 */
3305 assert((max_usable_mrf - base_mrf) % 2 == 0);
3306
3307 /* First mrf is the g0-based message header containing URB handles and
3308 * such.
3309 */
3310 emit_urb_write_header(mrf++);
3311
3312 if (devinfo->gen < 6) {
3313 emit_ndc_computation();
3314 }
3315
3316 /* We may need to split this up into several URB writes, so do them in a
3317 * loop.
3318 */
3319 int slot = 0;
3320 bool complete = false;
3321 do {
3322 /* URB offset is in URB row increments, and each of our MRFs is half of
3323 * one of those, since we're doing interleaved writes.
3324 */
3325 int offset = slot / 2;
3326
3327 mrf = base_mrf + 1;
3328 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3329 emit_urb_slot(dst_reg(MRF, mrf++),
3330 prog_data->vue_map.slot_to_varying[slot]);
3331
3332 /* If this was max_usable_mrf, we can't fit anything more into this
3333 * URB WRITE. Same thing if we reached the maximum length available.
3334 */
3335 if (mrf > max_usable_mrf ||
3336 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3337 slot++;
3338 break;
3339 }
3340 }
3341
3342 complete = slot >= prog_data->vue_map.num_slots;
3343 current_annotation = "URB write";
3344 vec4_instruction *inst = emit_urb_write_opcode(complete);
3345 inst->base_mrf = base_mrf;
3346 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3347 inst->offset += offset;
3348 } while (!complete);
3349 }
3350
3351
3352 src_reg
3353 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3354 src_reg *reladdr, int reg_offset)
3355 {
3356 /* Because we store the values to scratch interleaved like our
3357 * vertex data, we need to scale the vec4 index by 2.
3358 */
3359 int message_header_scale = 2;
3360
3361 /* Pre-gen6, the message header uses byte offsets instead of vec4
3362 * (16-byte) offset units.
3363 */
3364 if (devinfo->gen < 6)
3365 message_header_scale *= 16;
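/* E.g. reg_offset == 3 yields an offset of 6 on Gen6+ and 96 (bytes) on
 * Gen4-5.
 */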
3366
3367 if (reladdr) {
3368 src_reg index = src_reg(this, glsl_type::int_type);
3369
3370 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3371 src_reg(reg_offset)));
3372 emit_before(block, inst, MUL(dst_reg(index), index,
3373 src_reg(message_header_scale)));
3374
3375 return index;
3376 } else {
3377 return src_reg(reg_offset * message_header_scale);
3378 }
3379 }
3380
3381 src_reg
3382 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3383 src_reg *reladdr, int reg_offset)
3384 {
3385 if (reladdr) {
3386 src_reg index = src_reg(this, glsl_type::int_type);
3387
3388 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3389 src_reg(reg_offset)));
3390
3391 /* Pre-gen6, the message header uses byte offsets instead of vec4
3392 * (16-byte) offset units.
3393 */
3394 if (devinfo->gen < 6) {
3395 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3396 }
3397
3398 return index;
3399 } else if (devinfo->gen >= 8) {
3400 /* Store the offset in a GRF so we can send-from-GRF. */
3401 src_reg offset = src_reg(this, glsl_type::int_type);
3402 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3403 return offset;
3404 } else {
3405 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3406 return src_reg(reg_offset * message_header_scale);
3407 }
3408 }
3409
3410 /**
3411 * Emits an instruction before @inst to load the value named by @orig_src
3412 * from scratch space at @base_offset to @temp.
3413 *
3414 * @base_offset is measured in 32-byte units (the size of a register).
3415 */
3416 void
3417 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3418 dst_reg temp, src_reg orig_src,
3419 int base_offset)
3420 {
3421 int reg_offset = base_offset + orig_src.reg_offset;
3422 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3423 reg_offset);
3424
3425 emit_before(block, inst, SCRATCH_READ(temp, index));
3426 }
3427
3428 /**
3429 * Emits an instruction after @inst to store the value to be written
3430 * to @orig_dst to scratch space at @base_offset, from @temp.
3431 *
3432 * @base_offset is measured in 32-byte units (the size of a register).
3433 */
3434 void
3435 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3436 int base_offset)
3437 {
3438 int reg_offset = base_offset + inst->dst.reg_offset;
3439 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3440 reg_offset);
3441
3442 /* Create a temporary register to store *inst's result in.
3443 *
3444 * We have to be careful in MOVing from our temporary result register in
3445 * the scratch write. If we swizzle from channels of the temporary that
3446 * weren't initialized, it will confuse live interval analysis, which will
3447 * make spilling fail to make progress.
3448 */
3449 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3450 inst->dst.type),
3451 brw_swizzle_for_mask(inst->dst.writemask));
3452 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3453 inst->dst.writemask));
3454 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3455 if (inst->opcode != BRW_OPCODE_SEL)
3456 write->predicate = inst->predicate;
3457 write->ir = inst->ir;
3458 write->annotation = inst->annotation;
3459 inst->insert_after(block, write);
3460
3461 inst->dst.file = temp.file;
3462 inst->dst.reg = temp.reg;
3463 inst->dst.reg_offset = temp.reg_offset;
3464 inst->dst.reladdr = NULL;
3465 }
3466
3467 /**
3468 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3469 * adds the scratch read(s) before \p inst. The function also checks for
3470 * recursive reladdr scratch accesses, issuing the corresponding scratch
3471 * loads and rewriting reladdr references accordingly.
3472 *
3473 * \return \p src if it did not require a scratch load, otherwise, the
3474 * register holding the result of the scratch load that the caller should
3475 * use to rewrite src.
3476 */
3477 src_reg
3478 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3479 vec4_instruction *inst, src_reg src)
3480 {
3481 /* Resolve recursive reladdr scratch access by calling ourselves
3482 * with src.reladdr
3483 */
3484 if (src.reladdr)
3485 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3486 *src.reladdr);
3487
3488 /* Now handle scratch access on src */
3489 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3490 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3491 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3492 src.reg = temp.reg;
3493 src.reg_offset = temp.reg_offset;
3494 src.reladdr = NULL;
3495 }
3496
3497 return src;
3498 }
3499
3500 /**
3501 * We can't generally support array access in GRF space, because a
3502 * single instruction's destination can only span 2 contiguous
3503 * registers. So, we send all GRF arrays that get variable index
3504 * access to scratch space.
3505 */
3506 void
3507 vec4_visitor::move_grf_array_access_to_scratch()
3508 {
3509 int scratch_loc[this->alloc.count];
3510 memset(scratch_loc, -1, sizeof(scratch_loc));
3511
3512 /* First, calculate the set of virtual GRFs that need to be punted
3513 * to scratch due to having any array access on them, and where in
3514 * scratch.
3515 */
3516 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3517 if (inst->dst.file == GRF && inst->dst.reladdr) {
3518 if (scratch_loc[inst->dst.reg] == -1) {
3519 scratch_loc[inst->dst.reg] = last_scratch;
3520 last_scratch += this->alloc.sizes[inst->dst.reg];
3521 }
3522
3523 for (src_reg *iter = inst->dst.reladdr;
3524 iter->reladdr;
3525 iter = iter->reladdr) {
3526 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3527 scratch_loc[iter->reg] = last_scratch;
3528 last_scratch += this->alloc.sizes[iter->reg];
3529 }
3530 }
3531 }
3532
3533 for (int i = 0 ; i < 3; i++) {
3534 for (src_reg *iter = &inst->src[i];
3535 iter->reladdr;
3536 iter = iter->reladdr) {
3537 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3538 scratch_loc[iter->reg] = last_scratch;
3539 last_scratch += this->alloc.sizes[iter->reg];
3540 }
3541 }
3542 }
3543 }
3544
3545 /* Now, for anything that will be accessed through scratch, rewrite
3546 * it to load/store. Note that this is a _safe list walk, because
3547 * we may generate a new scratch_write instruction after the one
3548 * we're processing.
3549 */
3550 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3551 /* Set up the annotation tracking for new generated instructions. */
3552 base_ir = inst->ir;
3553 current_annotation = inst->annotation;
3554
3555 /* First handle scratch access on the dst. Notice we have to handle
3556 * the case where the dst's reladdr also points to scratch space.
3557 */
3558 if (inst->dst.reladdr)
3559 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3560 *inst->dst.reladdr);
3561
3562 /* Now that we have handled any (possibly recursive) reladdr scratch
3563 * accesses for dst we can safely do the scratch write for dst itself
3564 */
3565 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3566 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3567
3568 /* Now handle scratch access on any src. In this case, since inst->src[i]
3569 * already is a src_reg, we can just call emit_resolve_reladdr with
3570 * inst->src[i] and it will take care of handling scratch loads for
3571 * both src and src.reladdr (recursively).
3572 */
3573 for (int i = 0 ; i < 3; i++) {
3574 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3575 inst->src[i]);
3576 }
3577 }
3578 }
3579
3580 /**
3581 * Emits an instruction before @inst to load the value named by @orig_src
3582 * from the pull constant buffer (surface) at @base_offset to @temp.
3583 */
3584 void
3585 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3586 dst_reg temp, src_reg orig_src,
3587 int base_offset)
3588 {
3589 int reg_offset = base_offset + orig_src.reg_offset;
3590 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3591 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3592 reg_offset);
3593
3594 emit_pull_constant_load_reg(temp,
3595 index,
3596 offset,
3597 block, inst);
3598 }
3599
3600 /**
3601 * Implements array access of uniforms by inserting a
3602 * PULL_CONSTANT_LOAD instruction.
3603 *
3604 * Unlike temporary GRF array access (where we don't support it due to
3605 * the difficulty of doing relative addressing on instruction
3606 * destinations), we could potentially do array access of uniforms
3607 * that were loaded in GRF space as push constants. In real-world
3608 * usage we've seen, though, the arrays being used are always larger
3609 * than we could load as push constants, so just always move all
3610 * uniform array access out to a pull constant buffer.
3611 */
3612 void
3613 vec4_visitor::move_uniform_array_access_to_pull_constants()
3614 {
3615 int pull_constant_loc[this->uniforms];
3616 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3617 bool nested_reladdr;
3618
3619 /* Walk through and find array access of uniforms. Put a copy of that
3620 * uniform in the pull constant buffer.
3621 *
3622 * Note that we don't move constant-indexed accesses to arrays. No
3623 * testing has been done of the performance impact of this choice.
3624 */
3625 do {
3626 nested_reladdr = false;
3627
3628 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3629 for (int i = 0 ; i < 3; i++) {
3630 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3631 continue;
3632
3633 int uniform = inst->src[i].reg;
3634
3635 if (inst->src[i].reladdr->reladdr)
3636 nested_reladdr = true; /* will need another pass */
3637
3638 /* If this array isn't already present in the pull constant buffer,
3639 * add it.
3640 */
3641 if (pull_constant_loc[uniform] == -1) {
3642 const gl_constant_value **values =
3643 &stage_prog_data->param[uniform * 4];
3644
3645 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3646
3647 assert(uniform < uniform_array_size);
3648 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3649 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3650 = values[j];
3651 }
3652 }
3653
3654 /* Set up the annotation tracking for new generated instructions. */
3655 base_ir = inst->ir;
3656 current_annotation = inst->annotation;
3657
3658 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3659
3660 emit_pull_constant_load(block, inst, temp, inst->src[i],
3661 pull_constant_loc[uniform]);
3662
3663 inst->src[i].file = temp.file;
3664 inst->src[i].reg = temp.reg;
3665 inst->src[i].reg_offset = temp.reg_offset;
3666 inst->src[i].reladdr = NULL;
3667 }
3668 }
3669 } while (nested_reladdr);
3670
3671 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3672 * no need to track them as larger-than-vec4 objects. This will be
3673 * relied on in cutting out unused uniform vectors from push
3674 * constants.
3675 */
3676 split_uniform_registers();
3677 }
3678
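/**
 * If an unsigned (UD) source still carries a negate modifier, apply the
 * negation up front by copying it through a temporary MOV, so the caller
 * ends up with a plain UD source and no modifier attached.
 */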
3679 void
3680 vec4_visitor::resolve_ud_negate(src_reg *reg)
3681 {
3682 if (reg->type != BRW_REGISTER_TYPE_UD ||
3683 !reg->negate)
3684 return;
3685
3686 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3687 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3688 *reg = temp;
3689 }
3690
3691 /**
3692 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3693 *
3694 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3695 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3696 */
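/* Worked example (added commentary): if CMP left 0x7f800001 in the result,
 * ANDing with 1 keeps only the defined LSB (1), and negating that in two's
 * complement gives 0xffffffff (~0); an LSB of 0 stays 0 through both steps.
 * That is exactly the AND + negated MOV pair emitted below.
 */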
3697 void
3698 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3699 {
3700 assert(devinfo->gen <= 5);
3701
3702 if (!rvalue->type->is_boolean())
3703 return;
3704
3705 src_reg and_result = src_reg(this, rvalue->type);
3706 src_reg neg_result = src_reg(this, rvalue->type);
3707 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3708 emit(MOV(dst_reg(neg_result), negate(and_result)));
3709 *reg = neg_result;
3710 }
3711
3712 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3713 void *log_data,
3714 struct gl_program *prog,
3715 const struct brw_sampler_prog_key_data *key_tex,
3716 struct brw_vue_prog_data *prog_data,
3717 struct gl_shader_program *shader_prog,
3718 gl_shader_stage stage,
3719 void *mem_ctx,
3720 bool no_spills,
3721 int shader_time_index)
3722 : backend_shader(compiler, log_data, mem_ctx,
3723 shader_prog, prog, &prog_data->base, stage),
3724 key_tex(key_tex),
3725 prog_data(prog_data),
3726 sanity_param_count(0),
3727 fail_msg(NULL),
3728 first_non_payload_grf(0),
3729 need_all_constants_in_pull_buffer(false),
3730 no_spills(no_spills),
3731 shader_time_index(shader_time_index),
3732 last_scratch(0)
3733 {
3734 this->failed = false;
3735
3736 this->base_ir = NULL;
3737 this->current_annotation = NULL;
3738 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3739
3740 this->variable_ht = hash_table_ctor(0,
3741 hash_table_pointer_hash,
3742 hash_table_pointer_compare);
3743
3744 this->virtual_grf_start = NULL;
3745 this->virtual_grf_end = NULL;
3746 this->live_intervals = NULL;
3747
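/* Gen7+ has no real MRF registers; the top of the GRF file is reserved to
 * stand in for them (the "MRF hack"), so register allocation has to stop
 * at GEN7_MRF_HACK_START rather than BRW_MAX_GRF.
 */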
3748 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3749
3750 this->uniforms = 0;
3751
3752 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3753 * at least one. See setup_uniforms() in brw_vec4.cpp.
3754 */
3755 this->uniform_array_size = 1;
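/* nr_params counts scalar components (see the param[uniform * 4] indexing
 * above), so rounding up to a multiple of four and dividing by four gives
 * the number of vec4 uniform slots we might need to track.
 */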
3756 if (prog_data) {
3757 this->uniform_array_size =
3758 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3759 }
3760
3761 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3762 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3763 }
3764
3765 vec4_visitor::~vec4_visitor()
3766 {
3767 hash_table_dtor(this->variable_ht);
3768 }
3769
3770
3771 void
3772 vec4_visitor::fail(const char *format, ...)
3773 {
3774 va_list va;
3775 char *msg;
3776
3777 if (failed)
3778 return;
3779
3780 failed = true;
3781
3782 va_start(va, format);
3783 msg = ralloc_vasprintf(mem_ctx, format, va);
3784 va_end(va);
3785 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3786
3787 this->fail_msg = msg;
3788
3789 if (debug_enabled) {
3790 fprintf(stderr, "%s", msg);
3791 }
3792 }
3793
3794 } /* namespace brw */