mesa: rename gl_shader_program's NumUniformBlocks to NumBufferInterfaceBlocks
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
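/**
 * Initialize every field of the instruction to a safe default; callers
 * override the handful they care about (predicate, conditional_mod, mlen,
 * and so on) after construction.
 */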
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
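/**
 * Insert new_inst into the instruction stream immediately before inst,
 * copying inst's IR pointer and annotation so debug output stays attached
 * to the right source construct.
 */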
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
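/* Convenience builders for one-, two- and three-source ALU instructions.
 * Note that these only construct the vec4_instruction; the caller still
 * has to pass the result to emit() to add it to the instruction stream.
 */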
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
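/**
 * Build (but do not emit) a gen4-style scratch read, used to reload
 * spilled registers.  The message starts at MRF FIRST_SPILL_MRF(gen) + 1
 * and is two registers long (mlen = 2).
 */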
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
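/**
 * Emit a DP2, DP3 or DP4 depending on how many components are being
 * dotted together; elements must be in the range [2, 4].
 */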
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
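/**
 * If the source has abs/negate modifiers, resolve them into a temporary
 * with a plain MOV and return the temporary; otherwise return the source
 * unchanged.
 */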
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
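/**
 * Emit a math instruction, fixing up the operands as required for the
 * current generation (see fix_math_operand).  On gen6, a partial
 * writemask is handled by computing into a full temporary and MOVing the
 * result; on gen4/5 the MRF-based message fields (base_mrf/mlen) are set
 * up here.
 */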
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * That the upper word of each write-channel be 0 is required for the
434 * following bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
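/**
 * packUnorm4x8: saturate to [0, 1], scale by 255, round to nearest even,
 * convert to unsigned integer and pack the four resulting bytes into one
 * dword.
 */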
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for arrays and structs, the sum of the type_size_vec4
597 * of each of their elements; and for samplers and atomics, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of size of vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 unreachable("not reached");
649 }
650
651 return 0;
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size_vec4(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->swizzle = BRW_SWIZZLE_NOOP;
663 } else {
664 this->swizzle = brw_swizzle_for_size(type->vector_elements);
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
671 {
672 assert(size > 0);
673
674 init();
675
676 this->file = GRF;
677 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
678
679 this->swizzle = BRW_SWIZZLE_NOOP;
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = GRF;
689 this->reg = v->alloc.allocate(type_size_vec4(type));
690
691 if (type->is_array() || type->is_record()) {
692 this->writemask = WRITEMASK_XYZW;
693 } else {
694 this->writemask = (1 << type->vector_elements) - 1;
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
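/**
 * Store pointers to the first n components of a uniform value in the
 * stage_prog_data->param[] table starting at param_offset, padding the
 * rest of the vec4 slot with a pointer to zero.
 */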
700 void
701 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
702 const gl_constant_value *values,
703 unsigned n)
704 {
705 static const gl_constant_value zero = { 0 };
706
707 assert(param_offset % 4 == 0);
708
709 for (unsigned i = 0; i < n; ++i)
710 stage_prog_data->param[param_offset + i] = &values[i];
711
712 for (unsigned i = n; i < 4; ++i)
713 stage_prog_data->param[param_offset + i] = &zero;
714
715 uniform_vector_size[param_offset / 4] = n;
716 }
717
718 /* Our support for uniforms is piggy-backed on the struct
719 * gl_fragment_program, because that's where the values actually
720 * get stored, rather than in some global gl_shader_program uniform
721 * store.
722 */
723 void
724 vec4_visitor::setup_uniform_values(ir_variable *ir)
725 {
726 int namelen = strlen(ir->name);
727
728 /* The data for our (non-builtin) uniforms is stored in a series of
729 * gl_uniform_driver_storage structs for each subcomponent that
730 * glGetUniformLocation() could name. We know it's been set up in the same
731 * order we'd walk the type, so walk the list of storage and find anything
732 * with our name, or the prefix of a component that starts with our name.
733 */
734 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
735 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
736
737 if (storage->builtin)
738 continue;
739
740 if (strncmp(ir->name, storage->name, namelen) != 0 ||
741 (storage->name[namelen] != 0 &&
742 storage->name[namelen] != '.' &&
743 storage->name[namelen] != '[')) {
744 continue;
745 }
746
747 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
748 storage->type->matrix_columns);
749 const unsigned vector_size = storage->type->vector_elements;
750
751 for (unsigned s = 0; s < vector_count; s++) {
752 setup_vec4_uniform_value(uniforms * 4,
753 &storage->storage[s * vector_size],
754 vector_size);
755 uniforms++;
756 }
757 }
758 }
759
760 /* Our support for builtin uniforms is even scarier than non-builtin.
761 * It sits on top of the PROG_STATE_VAR parameters that are
762 * automatically updated from GL context state.
763 */
764 void
765 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
766 {
767 const ir_state_slot *const slots = ir->get_state_slots();
768 assert(slots != NULL);
769
770 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
771 /* This state reference has already been set up by ir_to_mesa,
772 * but we'll get the same index back here. We can reference
773 * ParameterValues directly, since unlike brw_fs.cpp, we never
774 * add new state references during compile.
775 */
776 int index = _mesa_add_state_reference(this->prog->Parameters,
777 (gl_state_index *)slots[i].tokens);
778 gl_constant_value *values =
779 &this->prog->Parameters->ParameterValues[index][0];
780
781 assert(this->uniforms < uniform_array_size);
782
783 for (unsigned j = 0; j < 4; j++)
784 stage_prog_data->param[this->uniforms * 4 + j] =
785 &values[GET_SWZ(slots[i].swizzle, j)];
786
787 this->uniform_vector_size[this->uniforms] =
788 (ir->type->is_scalar() || ir->type->is_vector() ||
789 ir->type->is_matrix() ? ir->type->vector_elements : 4);
790
791 this->uniforms++;
792 }
793 }
794
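/** Return the register allocated for var, or NULL if it has none yet. */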
795 dst_reg *
796 vec4_visitor::variable_storage(ir_variable *var)
797 {
798 return (dst_reg *)hash_table_find(this->variable_ht, var);
799 }
800
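/**
 * Evaluate a boolean rvalue into the flag register and return, through
 * predicate, the predication mode a consumer of the flag should use.
 */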
801 void
802 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
803 enum brw_predicate *predicate)
804 {
805 ir_expression *expr = ir->as_expression();
806
807 *predicate = BRW_PREDICATE_NORMAL;
808
809 if (expr && expr->operation != ir_binop_ubo_load) {
810 src_reg op[3];
811 vec4_instruction *inst;
812
813 assert(expr->get_num_operands() <= 3);
814 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
815 expr->operands[i]->accept(this);
816 op[i] = this->result;
817
818 resolve_ud_negate(&op[i]);
819 }
820
821 switch (expr->operation) {
822 case ir_unop_logic_not:
823 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
824 inst->conditional_mod = BRW_CONDITIONAL_Z;
825 break;
826
827 case ir_binop_logic_xor:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(XOR(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(XOR(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_binop_logic_or:
839 if (devinfo->gen <= 5) {
840 src_reg temp = src_reg(this, ir->type);
841 emit(OR(dst_reg(temp), op[0], op[1]));
842 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
843 } else {
844 inst = emit(OR(dst_null_d(), op[0], op[1]));
845 }
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 break;
848
849 case ir_binop_logic_and:
850 if (devinfo->gen <= 5) {
851 src_reg temp = src_reg(this, ir->type);
852 emit(AND(dst_reg(temp), op[0], op[1]));
853 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
854 } else {
855 inst = emit(AND(dst_null_d(), op[0], op[1]));
856 }
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 break;
859
860 case ir_unop_f2b:
861 if (devinfo->gen >= 6) {
862 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
863 } else {
864 inst = emit(MOV(dst_null_f(), op[0]));
865 inst->conditional_mod = BRW_CONDITIONAL_NZ;
866 }
867 break;
868
869 case ir_unop_i2b:
870 if (devinfo->gen >= 6) {
871 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
872 } else {
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875 }
876 break;
877
878 case ir_binop_all_equal:
879 if (devinfo->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 resolve_bool_comparison(expr->operands[1], &op[1]);
882 }
883 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
884 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
885 break;
886
887 case ir_binop_any_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_unop_any:
897 if (devinfo->gen <= 5) {
898 resolve_bool_comparison(expr->operands[0], &op[0]);
899 }
900 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
901 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
902 break;
903
904 case ir_binop_greater:
905 case ir_binop_gequal:
906 case ir_binop_less:
907 case ir_binop_lequal:
908 case ir_binop_equal:
909 case ir_binop_nequal:
910 if (devinfo->gen <= 5) {
911 resolve_bool_comparison(expr->operands[0], &op[0]);
912 resolve_bool_comparison(expr->operands[1], &op[1]);
913 }
914 emit(CMP(dst_null_d(), op[0], op[1],
915 brw_conditional_for_comparison(expr->operation)));
916 break;
917
918 case ir_triop_csel: {
919 /* Expand the boolean condition into the flag register. */
920 inst = emit(MOV(dst_null_d(), op[0]));
921 inst->conditional_mod = BRW_CONDITIONAL_NZ;
922
923 /* Select which boolean to return. */
924 dst_reg temp(this, expr->operands[1]->type);
925 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
926 inst->predicate = BRW_PREDICATE_NORMAL;
927
928 /* Expand the result to a condition code. */
929 inst = emit(MOV(dst_null_d(), src_reg(temp)));
930 inst->conditional_mod = BRW_CONDITIONAL_NZ;
931 break;
932 }
933
934 default:
935 unreachable("not reached");
936 }
937 return;
938 }
939
940 ir->accept(this);
941
942 resolve_ud_negate(&this->result);
943
944 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
945 inst->conditional_mod = BRW_CONDITIONAL_NZ;
946 }
947
948 /**
949 * Emit a gen6 IF statement with the comparison folded into the IF
950 * instruction.
951 */
952 void
953 vec4_visitor::emit_if_gen6(ir_if *ir)
954 {
955 ir_expression *expr = ir->condition->as_expression();
956
957 if (expr && expr->operation != ir_binop_ubo_load) {
958 src_reg op[3];
959 dst_reg temp;
960
961 assert(expr->get_num_operands() <= 3);
962 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
963 expr->operands[i]->accept(this);
964 op[i] = this->result;
965 }
966
967 switch (expr->operation) {
968 case ir_unop_logic_not:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
970 return;
971
972 case ir_binop_logic_xor:
973 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_or:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(OR(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_binop_logic_and:
983 temp = dst_reg(this, glsl_type::bool_type);
984 emit(AND(temp, op[0], op[1]));
985 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_f2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_unop_i2b:
993 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
994 return;
995
996 case ir_binop_greater:
997 case ir_binop_gequal:
998 case ir_binop_less:
999 case ir_binop_lequal:
1000 case ir_binop_equal:
1001 case ir_binop_nequal:
1002 emit(IF(op[0], op[1],
1003 brw_conditional_for_comparison(expr->operation)));
1004 return;
1005
1006 case ir_binop_all_equal:
1007 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1008 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1009 return;
1010
1011 case ir_binop_any_nequal:
1012 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1013 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1014 return;
1015
1016 case ir_unop_any:
1017 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1018 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1019 return;
1020
1021 case ir_triop_csel: {
1022 /* Expand the boolean condition into the flag register. */
1023 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1024 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1025
1026 /* Select which boolean to return. */
1027 dst_reg temp(this, expr->operands[1]->type);
1028 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1029 inst->predicate = BRW_PREDICATE_NORMAL;
1030
1031 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1032 return;
1033 }
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038 return;
1039 }
1040
1041 ir->condition->accept(this);
1042
1043 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1044 }
1045
1046 void
1047 vec4_visitor::visit(ir_variable *ir)
1048 {
1049 dst_reg *reg = NULL;
1050
1051 if (variable_storage(ir))
1052 return;
1053
1054 switch (ir->data.mode) {
1055 case ir_var_shader_in:
1056 assert(ir->data.location != -1);
1057 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1058 break;
1059
1060 case ir_var_shader_out:
1061 assert(ir->data.location != -1);
1062 reg = new(mem_ctx) dst_reg(this, ir->type);
1063
1064 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1065 output_reg[ir->data.location + i] = *reg;
1066 output_reg[ir->data.location + i].reg_offset = i;
1067 output_reg_annotation[ir->data.location + i] = ir->name;
1068 }
1069 break;
1070
1071 case ir_var_auto:
1072 case ir_var_temporary:
1073 reg = new(mem_ctx) dst_reg(this, ir->type);
1074 break;
1075
1076 case ir_var_uniform:
1077 case ir_var_shader_storage:
1078 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1079
1080 /* Thanks to the lower_ubo_reference pass, we will see only
1081 * ir_binop_{ubo,ssbo}_load expressions and not ir_dereference_variable
1082 * for UBO/SSBO variables, so no need for them to be in variable_ht.
1083 *
1084 * Some uniforms, such as samplers and atomic counters, have no actual
1085 * storage, so we should ignore them.
1086 */
1087 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1088 return;
1089
1090 /* Track how big the whole uniform variable is, in case we need to put a
1091 * copy of its data into pull constants for array access.
1092 */
1093 assert(this->uniforms < uniform_array_size);
1094 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1095
1096 if (!strncmp(ir->name, "gl_", 3)) {
1097 setup_builtin_uniform_values(ir);
1098 } else {
1099 setup_uniform_values(ir);
1100 }
1101 break;
1102
1103 case ir_var_system_value:
1104 reg = make_reg_for_system_value(ir->data.location, ir->type);
1105 break;
1106
1107 default:
1108 unreachable("not reached");
1109 }
1110
1111 reg->type = brw_type_for_base_type(ir->type);
1112 hash_table_insert(this->variable_ht, reg, ir);
1113 }
1114
1115 void
1116 vec4_visitor::visit(ir_loop *ir)
1117 {
1118 /* We don't want debugging output to print the whole body of the
1119 * loop as the annotation.
1120 */
1121 this->base_ir = NULL;
1122
1123 emit(BRW_OPCODE_DO);
1124
1125 visit_instructions(&ir->body_instructions);
1126
1127 emit(BRW_OPCODE_WHILE);
1128 }
1129
1130 void
1131 vec4_visitor::visit(ir_loop_jump *ir)
1132 {
1133 switch (ir->mode) {
1134 case ir_loop_jump::jump_break:
1135 emit(BRW_OPCODE_BREAK);
1136 break;
1137 case ir_loop_jump::jump_continue:
1138 emit(BRW_OPCODE_CONTINUE);
1139 break;
1140 }
1141 }
1142
1143
1144 void
1145 vec4_visitor::visit(ir_function_signature *)
1146 {
1147 unreachable("not reached");
1148 }
1149
1150 void
1151 vec4_visitor::visit(ir_function *ir)
1152 {
1153 /* Ignore function bodies other than main() -- we shouldn't see calls to
1154 * them since they should all be inlined.
1155 */
1156 if (strcmp(ir->name, "main") == 0) {
1157 const ir_function_signature *sig;
1158 exec_list empty;
1159
1160 sig = ir->matching_signature(NULL, &empty, false);
1161
1162 assert(sig);
1163
1164 visit_instructions(&sig->body);
1165 }
1166 }
1167
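/**
 * Try to recognize an add-of-multiply (optionally with negate/abs on the
 * multiply operand) and emit a single MAD instead; returns false if the
 * pattern does not apply on this generation or type.
 */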
1168 bool
1169 vec4_visitor::try_emit_mad(ir_expression *ir)
1170 {
1171 /* 3-src instructions were introduced in gen6. */
1172 if (devinfo->gen < 6)
1173 return false;
1174
1175 /* MAD can only handle floating-point data. */
1176 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1177 return false;
1178
1179 ir_rvalue *nonmul;
1180 ir_expression *mul;
1181 bool mul_negate, mul_abs;
1182
1183 for (int i = 0; i < 2; i++) {
1184 mul_negate = false;
1185 mul_abs = false;
1186
1187 mul = ir->operands[i]->as_expression();
1188 nonmul = ir->operands[1 - i];
1189
1190 if (mul && mul->operation == ir_unop_abs) {
1191 mul = mul->operands[0]->as_expression();
1192 mul_abs = true;
1193 } else if (mul && mul->operation == ir_unop_neg) {
1194 mul = mul->operands[0]->as_expression();
1195 mul_negate = true;
1196 }
1197
1198 if (mul && mul->operation == ir_binop_mul)
1199 break;
1200 }
1201
1202 if (!mul || mul->operation != ir_binop_mul)
1203 return false;
1204
1205 nonmul->accept(this);
1206 src_reg src0 = fix_3src_operand(this->result);
1207
1208 mul->operands[0]->accept(this);
1209 src_reg src1 = fix_3src_operand(this->result);
1210 src1.negate ^= mul_negate;
1211 src1.abs = mul_abs;
1212 if (mul_abs)
1213 src1.negate = false;
1214
1215 mul->operands[1]->accept(this);
1216 src_reg src2 = fix_3src_operand(this->result);
1217 src2.abs = mul_abs;
1218 if (mul_abs)
1219 src2.negate = false;
1220
1221 this->result = src_reg(this, ir->type);
1222 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1223
1224 return true;
1225 }
1226
1227 bool
1228 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1229 {
1230 /* This optimization relies on CMP setting the destination to 0 when
1231 * false. Early hardware only sets the least significant bit, and
1232 * leaves the other bits undefined. So we can't use it.
1233 */
1234 if (devinfo->gen < 6)
1235 return false;
1236
1237 ir_expression *const cmp = ir->operands[0]->as_expression();
1238
1239 if (cmp == NULL)
1240 return false;
1241
1242 switch (cmp->operation) {
1243 case ir_binop_less:
1244 case ir_binop_greater:
1245 case ir_binop_lequal:
1246 case ir_binop_gequal:
1247 case ir_binop_equal:
1248 case ir_binop_nequal:
1249 break;
1250
1251 default:
1252 return false;
1253 }
1254
1255 cmp->operands[0]->accept(this);
1256 const src_reg cmp_src0 = this->result;
1257
1258 cmp->operands[1]->accept(this);
1259 const src_reg cmp_src1 = this->result;
1260
1261 this->result = src_reg(this, ir->type);
1262
1263 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1264 brw_conditional_for_comparison(cmp->operation)));
1265
1266 /* If the comparison is false, this->result will just happen to be zero.
1267 */
1268 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1269 this->result, src_reg(1.0f));
1270 inst->predicate = BRW_PREDICATE_NORMAL;
1271 inst->predicate_inverse = true;
1272
1273 return true;
1274 }
1275
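/**
 * Emit a MIN or MAX, selected by conditionalmod (BRW_CONDITIONAL_L or
 * BRW_CONDITIONAL_GE).  Gen6+ can use SEL with a conditional modifier
 * directly; earlier generations need an explicit CMP followed by a
 * predicated SEL.
 */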
1276 vec4_instruction *
1277 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1278 src_reg src0, src_reg src1)
1279 {
1280 vec4_instruction *inst;
1281
1282 if (devinfo->gen >= 6) {
1283 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1284 inst->conditional_mod = conditionalmod;
1285 } else {
1286 emit(CMP(dst, src0, src1, conditionalmod));
1287
1288 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1289 inst->predicate = BRW_PREDICATE_NORMAL;
1290 }
1291
1292 return inst;
1293 }
1294
1295 vec4_instruction *
1296 vec4_visitor::emit_lrp(const dst_reg &dst,
1297 const src_reg &x, const src_reg &y, const src_reg &a)
1298 {
1299 if (devinfo->gen >= 6) {
1300 /* Note that the instruction's argument order is reversed from GLSL
1301 * and the IR.
1302 */
1303 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1304 fix_3src_operand(x)));
1305 } else {
1306 /* Earlier generations don't support three source operations, so we
1307 * need to emit x*(1-a) + y*a.
1308 */
1309 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1310 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1311 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1312 y_times_a.writemask = dst.writemask;
1313 one_minus_a.writemask = dst.writemask;
1314 x_times_one_minus_a.writemask = dst.writemask;
1315
1316 emit(MUL(y_times_a, y, a));
1317 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1318 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1319 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1320 }
1321 }
1322
1323 /**
1324 * Emits the instructions needed to perform a pull constant load. before_block
1325 * and before_inst can be NULL, in which case the instructions will be appended
1326 * to the end of the instruction list.
1327 */
1328 void
1329 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1330 src_reg surf_index,
1331 src_reg offset_reg,
1332 bblock_t *before_block,
1333 vec4_instruction *before_inst)
1334 {
1335 assert((before_inst == NULL && before_block == NULL) ||
1336 (before_inst && before_block));
1337
1338 vec4_instruction *pull;
1339
1340 if (devinfo->gen >= 9) {
1341 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1342 src_reg header(this, glsl_type::uvec4_type, 2);
1343
1344 pull = new(mem_ctx)
1345 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1346 dst_reg(header));
1347
1348 if (before_inst)
1349 emit_before(before_block, before_inst, pull);
1350 else
1351 emit(pull);
1352
1353 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1354 offset_reg.type);
1355 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1356
1357 if (before_inst)
1358 emit_before(before_block, before_inst, pull);
1359 else
1360 emit(pull);
1361
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1363 dst,
1364 surf_index,
1365 header);
1366 pull->mlen = 2;
1367 pull->header_size = 1;
1368 } else if (devinfo->gen >= 7) {
1369 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1370
1371 grf_offset.type = offset_reg.type;
1372
1373 pull = MOV(grf_offset, offset_reg);
1374
1375 if (before_inst)
1376 emit_before(before_block, before_inst, pull);
1377 else
1378 emit(pull);
1379
1380 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1381 dst,
1382 surf_index,
1383 src_reg(grf_offset));
1384 pull->mlen = 1;
1385 } else {
1386 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1387 dst,
1388 surf_index,
1389 offset_reg);
1390 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1391 pull->mlen = 1;
1392 }
1393
1394 if (before_inst)
1395 emit_before(before_block, before_inst, pull);
1396 else
1397 emit(pull);
1398 }
1399
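/**
 * Copy the value of src from one live channel into all channels of the
 * returned register, so it can be used where a scalar (uniform) value is
 * required -- e.g. a dynamically computed surface index.
 */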
1400 src_reg
1401 vec4_visitor::emit_uniformize(const src_reg &src)
1402 {
1403 const src_reg chan_index(this, glsl_type::uint_type);
1404 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1405 src.type);
1406
1407 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1408 ->force_writemask_all = true;
1409 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1410 ->force_writemask_all = true;
1411
1412 return src_reg(dst);
1413 }
1414
1415 void
1416 vec4_visitor::visit(ir_expression *ir)
1417 {
1418 unsigned int operand;
1419 src_reg op[ARRAY_SIZE(ir->operands)];
1420 vec4_instruction *inst;
1421
1422 if (ir->operation == ir_binop_add) {
1423 if (try_emit_mad(ir))
1424 return;
1425 }
1426
1427 if (ir->operation == ir_unop_b2f) {
1428 if (try_emit_b2f_of_compare(ir))
1429 return;
1430 }
1431
1432 /* Storage for our result. Ideally for an assignment we'd be using
1433 * the actual storage for the result here, instead.
1434 */
1435 dst_reg result_dst(this, ir->type);
1436 src_reg result_src(result_dst);
1437
1438 if (ir->operation == ir_triop_csel) {
1439 ir->operands[1]->accept(this);
1440 op[1] = this->result;
1441 ir->operands[2]->accept(this);
1442 op[2] = this->result;
1443
1444 enum brw_predicate predicate;
1445 emit_bool_to_cond_code(ir->operands[0], &predicate);
1446 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1447 inst->predicate = predicate;
1448 this->result = result_src;
1449 return;
1450 }
1451
1452 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1453 this->result.file = BAD_FILE;
1454 ir->operands[operand]->accept(this);
1455 if (this->result.file == BAD_FILE) {
1456 fprintf(stderr, "Failed to get tree for expression operand:\n");
1457 ir->operands[operand]->fprint(stderr);
1458 exit(1);
1459 }
1460 op[operand] = this->result;
1461
1462 /* Matrix expression operands should have been broken down to vector
1463 * operations already.
1464 */
1465 assert(!ir->operands[operand]->type->is_matrix());
1466 }
1467
1468 /* If nothing special happens, this is the result. */
1469 this->result = result_src;
1470
1471 switch (ir->operation) {
1472 case ir_unop_logic_not:
1473 emit(NOT(result_dst, op[0]));
1474 break;
1475 case ir_unop_neg:
1476 op[0].negate = !op[0].negate;
1477 emit(MOV(result_dst, op[0]));
1478 break;
1479 case ir_unop_abs:
1480 op[0].abs = true;
1481 op[0].negate = false;
1482 emit(MOV(result_dst, op[0]));
1483 break;
1484
1485 case ir_unop_sign:
1486 if (ir->type->is_float()) {
1487 /* AND(val, 0x80000000) gives the sign bit.
1488 *
1489 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1490 * zero.
1491 */
1492 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1493
1494 op[0].type = BRW_REGISTER_TYPE_UD;
1495 result_dst.type = BRW_REGISTER_TYPE_UD;
1496 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1497
1498 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1499 inst->predicate = BRW_PREDICATE_NORMAL;
1500
1501 this->result.type = BRW_REGISTER_TYPE_F;
1502 } else {
1503 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1504 * -> non-negative val generates 0x00000000.
1505 * Predicated OR sets 1 if val is positive.
1506 */
1507 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1508
1509 emit(ASR(result_dst, op[0], src_reg(31)));
1510
1511 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1512 inst->predicate = BRW_PREDICATE_NORMAL;
1513 }
1514 break;
1515
1516 case ir_unop_rcp:
1517 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1518 break;
1519
1520 case ir_unop_exp2:
1521 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1522 break;
1523 case ir_unop_log2:
1524 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1525 break;
1526 case ir_unop_exp:
1527 case ir_unop_log:
1528 unreachable("not reached: should be handled by ir_explog_to_explog2");
1529 case ir_unop_sin:
1530 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1531 break;
1532 case ir_unop_cos:
1533 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1534 break;
1535
1536 case ir_unop_dFdx:
1537 case ir_unop_dFdx_coarse:
1538 case ir_unop_dFdx_fine:
1539 case ir_unop_dFdy:
1540 case ir_unop_dFdy_coarse:
1541 case ir_unop_dFdy_fine:
1542 unreachable("derivatives not valid in vertex shader");
1543
1544 case ir_unop_bitfield_reverse:
1545 emit(BFREV(result_dst, op[0]));
1546 break;
1547 case ir_unop_bit_count:
1548 emit(CBIT(result_dst, op[0]));
1549 break;
1550 case ir_unop_find_msb: {
1551 src_reg temp = src_reg(this, glsl_type::uint_type);
1552
1553 inst = emit(FBH(dst_reg(temp), op[0]));
1554 inst->dst.writemask = WRITEMASK_XYZW;
1555
1556 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1557 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1558 * subtract the result from 31 to convert the MSB count into an LSB count.
1559 */
1560
1561 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1562 temp.swizzle = BRW_SWIZZLE_NOOP;
1563 emit(MOV(result_dst, temp));
1564
1565 src_reg src_tmp = src_reg(result_dst);
1566 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1567
1568 src_tmp.negate = true;
1569 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1570 inst->predicate = BRW_PREDICATE_NORMAL;
1571 break;
1572 }
1573 case ir_unop_find_lsb:
1574 emit(FBL(result_dst, op[0]));
1575 break;
1576 case ir_unop_saturate:
1577 inst = emit(MOV(result_dst, op[0]));
1578 inst->saturate = true;
1579 break;
1580
1581 case ir_unop_noise:
1582 unreachable("not reached: should be handled by lower_noise");
1583
1584 case ir_unop_subroutine_to_int:
1585 emit(MOV(result_dst, op[0]));
1586 break;
1587
1588 case ir_unop_ssbo_unsized_array_length:
1589 unreachable("not reached: should be handled by lower_ubo_reference");
1590 break;
1591
1592 case ir_binop_add:
1593 emit(ADD(result_dst, op[0], op[1]));
1594 break;
1595 case ir_binop_sub:
1596 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1597
1598 case ir_binop_mul:
1599 if (devinfo->gen < 8 && ir->type->is_integer()) {
1600 /* For integer multiplication, the MUL uses the low 16 bits of one of
1601 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1602 * accumulates the contribution of the upper 16 bits of that
1603 * operand. If we can determine that one of the args is in the low
1604 * 16 bits, though, we can just emit a single MUL.
1605 */
1606 if (ir->operands[0]->is_uint16_constant()) {
1607 if (devinfo->gen < 7)
1608 emit(MUL(result_dst, op[0], op[1]));
1609 else
1610 emit(MUL(result_dst, op[1], op[0]));
1611 } else if (ir->operands[1]->is_uint16_constant()) {
1612 if (devinfo->gen < 7)
1613 emit(MUL(result_dst, op[1], op[0]));
1614 else
1615 emit(MUL(result_dst, op[0], op[1]));
1616 } else {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(dst_null_d(), op[0], op[1]));
1621 emit(MOV(result_dst, src_reg(acc)));
1622 }
1623 } else {
1624 emit(MUL(result_dst, op[0], op[1]));
1625 }
1626 break;
1627 case ir_binop_imul_high: {
1628 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1629
1630 emit(MUL(acc, op[0], op[1]));
1631 emit(MACH(result_dst, op[0], op[1]));
1632 break;
1633 }
1634 case ir_binop_div:
1635 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1636 assert(ir->type->is_integer());
1637 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1638 break;
1639
1640 case ir_binop_carry:
1641 unreachable("Should have been lowered by carry_to_arith().");
1642
1643 case ir_binop_borrow:
1644 unreachable("Should have been lowered by borrow_to_arith().");
1645
1646 case ir_binop_mod:
1647 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1648 assert(ir->type->is_integer());
1649 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1650 break;
1651
1652 case ir_binop_less:
1653 case ir_binop_greater:
1654 case ir_binop_lequal:
1655 case ir_binop_gequal:
1656 case ir_binop_equal:
1657 case ir_binop_nequal: {
1658 if (devinfo->gen <= 5) {
1659 resolve_bool_comparison(ir->operands[0], &op[0]);
1660 resolve_bool_comparison(ir->operands[1], &op[1]);
1661 }
1662 emit(CMP(result_dst, op[0], op[1],
1663 brw_conditional_for_comparison(ir->operation)));
1664 break;
1665 }
1666
1667 case ir_binop_all_equal:
1668 if (devinfo->gen <= 5) {
1669 resolve_bool_comparison(ir->operands[0], &op[0]);
1670 resolve_bool_comparison(ir->operands[1], &op[1]);
1671 }
1672
1673 /* "==" operator producing a scalar boolean. */
1674 if (ir->operands[0]->type->is_vector() ||
1675 ir->operands[1]->type->is_vector()) {
1676 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1677 emit(MOV(result_dst, src_reg(0)));
1678 inst = emit(MOV(result_dst, src_reg(~0)));
1679 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1680 } else {
1681 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1682 }
1683 break;
1684 case ir_binop_any_nequal:
1685 if (devinfo->gen <= 5) {
1686 resolve_bool_comparison(ir->operands[0], &op[0]);
1687 resolve_bool_comparison(ir->operands[1], &op[1]);
1688 }
1689
1690 /* "!=" operator producing a scalar boolean. */
1691 if (ir->operands[0]->type->is_vector() ||
1692 ir->operands[1]->type->is_vector()) {
1693 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1694
1695 emit(MOV(result_dst, src_reg(0)));
1696 inst = emit(MOV(result_dst, src_reg(~0)));
1697 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1698 } else {
1699 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1700 }
1701 break;
1702
1703 case ir_unop_any:
1704 if (devinfo->gen <= 5) {
1705 resolve_bool_comparison(ir->operands[0], &op[0]);
1706 }
1707 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1708 emit(MOV(result_dst, src_reg(0)));
1709
1710 inst = emit(MOV(result_dst, src_reg(~0)));
1711 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1712 break;
1713
1714 case ir_binop_logic_xor:
1715 emit(XOR(result_dst, op[0], op[1]));
1716 break;
1717
1718 case ir_binop_logic_or:
1719 emit(OR(result_dst, op[0], op[1]));
1720 break;
1721
1722 case ir_binop_logic_and:
1723 emit(AND(result_dst, op[0], op[1]));
1724 break;
1725
1726 case ir_binop_dot:
1727 assert(ir->operands[0]->type->is_vector());
1728 assert(ir->operands[0]->type == ir->operands[1]->type);
1729 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1730 break;
1731
1732 case ir_unop_sqrt:
1733 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1734 break;
1735 case ir_unop_rsq:
1736 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1737 break;
1738
1739 case ir_unop_bitcast_i2f:
1740 case ir_unop_bitcast_u2f:
1741 this->result = op[0];
1742 this->result.type = BRW_REGISTER_TYPE_F;
1743 break;
1744
1745 case ir_unop_bitcast_f2i:
1746 this->result = op[0];
1747 this->result.type = BRW_REGISTER_TYPE_D;
1748 break;
1749
1750 case ir_unop_bitcast_f2u:
1751 this->result = op[0];
1752 this->result.type = BRW_REGISTER_TYPE_UD;
1753 break;
1754
1755 case ir_unop_i2f:
1756 case ir_unop_i2u:
1757 case ir_unop_u2i:
1758 case ir_unop_u2f:
1759 case ir_unop_f2i:
1760 case ir_unop_f2u:
1761 emit(MOV(result_dst, op[0]));
1762 break;
1763 case ir_unop_b2i:
1764 case ir_unop_b2f:
1765 if (devinfo->gen <= 5) {
1766 resolve_bool_comparison(ir->operands[0], &op[0]);
1767 }
1768 emit(MOV(result_dst, negate(op[0])));
1769 break;
1770 case ir_unop_f2b:
1771 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1772 break;
1773 case ir_unop_i2b:
1774 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1775 break;
1776
1777 case ir_unop_trunc:
1778 emit(RNDZ(result_dst, op[0]));
1779 break;
1780 case ir_unop_ceil: {
1781 src_reg tmp = src_reg(this, ir->type);
1782 op[0].negate = !op[0].negate;
1783 emit(RNDD(dst_reg(tmp), op[0]));
1784 tmp.negate = true;
1785 emit(MOV(result_dst, tmp));
1786 }
1787 break;
1788 case ir_unop_floor:
1789 inst = emit(RNDD(result_dst, op[0]));
1790 break;
1791 case ir_unop_fract:
1792 inst = emit(FRC(result_dst, op[0]));
1793 break;
1794 case ir_unop_round_even:
1795 emit(RNDE(result_dst, op[0]));
1796 break;
1797
1798 case ir_unop_get_buffer_size:
1799 unreachable("not reached: not implemented");
1800 break;
1801
1802 case ir_binop_min:
1803 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1804 break;
1805 case ir_binop_max:
1806 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1807 break;
1808
1809 case ir_binop_pow:
1810 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1811 break;
1812
1813 case ir_unop_bit_not:
1814 inst = emit(NOT(result_dst, op[0]));
1815 break;
1816 case ir_binop_bit_and:
1817 inst = emit(AND(result_dst, op[0], op[1]));
1818 break;
1819 case ir_binop_bit_xor:
1820 inst = emit(XOR(result_dst, op[0], op[1]));
1821 break;
1822 case ir_binop_bit_or:
1823 inst = emit(OR(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_lshift:
1827 inst = emit(SHL(result_dst, op[0], op[1]));
1828 break;
1829
1830 case ir_binop_rshift:
1831 if (ir->type->base_type == GLSL_TYPE_INT)
1832 inst = emit(ASR(result_dst, op[0], op[1]));
1833 else
1834 inst = emit(SHR(result_dst, op[0], op[1]));
1835 break;
1836
1837 case ir_binop_bfm:
1838 emit(BFI1(result_dst, op[0], op[1]));
1839 break;
1840
1841 case ir_binop_ubo_load: {
1842 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1843 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1844 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1845 src_reg offset;
1846
1847 /* Now, load the vector from that offset. */
1848 assert(ir->type->is_vector() || ir->type->is_scalar());
1849
1850 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1851 packed_consts.type = result.type;
1852 src_reg surf_index;
1853
1854 if (const_uniform_block) {
1855 /* The block index is a constant, so just emit the binding table entry
1856 * as an immediate.
1857 */
1858 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1859 const_uniform_block->value.u[0]);
1860 } else {
1861 /* The block index is not a constant. Evaluate the index expression
1862 * per-channel and add the base UBO index; we have to select a value
1863 * from any live channel.
1864 */
1865 surf_index = src_reg(this, glsl_type::uint_type);
1866 emit(ADD(dst_reg(surf_index), op[0],
1867 src_reg(prog_data->base.binding_table.ubo_start)));
1868 surf_index = emit_uniformize(surf_index);
1869
1870 /* Assume this may touch any UBO. It would be nice to provide
1871 * a tighter bound, but the array information is already lowered away.
1872 */
1873 brw_mark_surface_used(&prog_data->base,
1874 prog_data->base.binding_table.ubo_start +
1875 shader_prog->NumBufferInterfaceBlocks - 1);
1876 }
1877
1878 if (const_offset_ir) {
1879 if (devinfo->gen >= 8) {
1880 /* Store the offset in a GRF so we can send-from-GRF. */
1881 offset = src_reg(this, glsl_type::int_type);
1882 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1883 } else {
1884 /* Immediates are fine on older generations since they'll be moved
1885 * to a (potentially fake) MRF at the generator level.
1886 */
1887 offset = src_reg(const_offset / 16);
1888 }
1889 } else {
1890 offset = src_reg(this, glsl_type::uint_type);
1891 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1892 }
1893
1894 emit_pull_constant_load_reg(dst_reg(packed_consts),
1895 surf_index,
1896 offset,
1897 NULL, NULL /* before_block/inst */);
1898
1899 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1900 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1901 const_offset % 16 / 4,
1902 const_offset % 16 / 4,
1903 const_offset % 16 / 4);
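/* A quick worked example of the swizzle math above (an illustrative note,
 * not part of the original source): for a vec2 load at constant byte offset
 * 24, const_offset / 16 == 1 selects the second vec4 of the UBO for the pull
 * constant fetch, and const_offset % 16 / 4 == 2 bumps every channel of
 * brw_swizzle_for_size(2) by two, so the final swizzle works out to ZWWW and
 * the result reads the .zw components of the fetched vec4.
 */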
1904
1905 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1906 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1907 emit(CMP(result_dst, packed_consts, src_reg(0u),
1908 BRW_CONDITIONAL_NZ));
1909 } else {
1910 emit(MOV(result_dst, packed_consts));
1911 }
1912 break;
1913 }
1914
1915 case ir_binop_vector_extract:
1916 unreachable("should have been lowered by vec_index_to_cond_assign");
1917
1918 case ir_triop_fma:
1919 op[0] = fix_3src_operand(op[0]);
1920 op[1] = fix_3src_operand(op[1]);
1921 op[2] = fix_3src_operand(op[2]);
1922 /* Note that the instruction's argument order is reversed from GLSL
1923 * and the IR.
1924 */
1925 emit(MAD(result_dst, op[2], op[1], op[0]));
1926 break;
1927
1928 case ir_triop_lrp:
1929 emit_lrp(result_dst, op[0], op[1], op[2]);
1930 break;
1931
1932 case ir_triop_csel:
1933 unreachable("already handled above");
1934 break;
1935
1936 case ir_triop_bfi:
1937 op[0] = fix_3src_operand(op[0]);
1938 op[1] = fix_3src_operand(op[1]);
1939 op[2] = fix_3src_operand(op[2]);
1940 emit(BFI2(result_dst, op[0], op[1], op[2]));
1941 break;
1942
1943 case ir_triop_bitfield_extract:
1944 op[0] = fix_3src_operand(op[0]);
1945 op[1] = fix_3src_operand(op[1]);
1946 op[2] = fix_3src_operand(op[2]);
1947 /* Note that the instruction's argument order is reversed from GLSL
1948 * and the IR.
1949 */
1950 emit(BFE(result_dst, op[2], op[1], op[0]));
1951 break;
1952
1953 case ir_triop_vector_insert:
1954 unreachable("should have been lowered by lower_vector_insert");
1955
1956 case ir_quadop_bitfield_insert:
1957 unreachable("not reached: should be handled by "
1958 "bitfield_insert_to_bfm_bfi\n");
1959
1960 case ir_quadop_vector:
1961 unreachable("not reached: should be handled by lower_quadop_vector");
1962
1963 case ir_unop_pack_half_2x16:
1964 emit_pack_half_2x16(result_dst, op[0]);
1965 break;
1966 case ir_unop_unpack_half_2x16:
1967 emit_unpack_half_2x16(result_dst, op[0]);
1968 break;
1969 case ir_unop_unpack_unorm_4x8:
1970 emit_unpack_unorm_4x8(result_dst, op[0]);
1971 break;
1972 case ir_unop_unpack_snorm_4x8:
1973 emit_unpack_snorm_4x8(result_dst, op[0]);
1974 break;
1975 case ir_unop_pack_unorm_4x8:
1976 emit_pack_unorm_4x8(result_dst, op[0]);
1977 break;
1978 case ir_unop_pack_snorm_4x8:
1979 emit_pack_snorm_4x8(result_dst, op[0]);
1980 break;
1981 case ir_unop_pack_snorm_2x16:
1982 case ir_unop_pack_unorm_2x16:
1983 case ir_unop_unpack_snorm_2x16:
1984 case ir_unop_unpack_unorm_2x16:
1985 unreachable("not reached: should be handled by lower_packing_builtins");
1986 case ir_unop_unpack_half_2x16_split_x:
1987 case ir_unop_unpack_half_2x16_split_y:
1988 case ir_binop_pack_half_2x16_split:
1989 case ir_unop_interpolate_at_centroid:
1990 case ir_binop_interpolate_at_sample:
1991 case ir_binop_interpolate_at_offset:
1992 unreachable("not reached: should not occur in vertex shader");
1993 case ir_binop_ldexp:
1994 unreachable("not reached: should be handled by ldexp_to_arith()");
1995 case ir_unop_d2f:
1996 case ir_unop_f2d:
1997 case ir_unop_d2i:
1998 case ir_unop_i2d:
1999 case ir_unop_d2u:
2000 case ir_unop_u2d:
2001 case ir_unop_d2b:
2002 case ir_unop_pack_double_2x32:
2003 case ir_unop_unpack_double_2x32:
2004 case ir_unop_frexp_sig:
2005 case ir_unop_frexp_exp:
2006 unreachable("fp64 todo");
2007 }
2008 }
2009
2010
2011 void
2012 vec4_visitor::visit(ir_swizzle *ir)
2013 {
2014 /* Note that this handles only swizzles in expressions, not those on the
2015 * left hand side of an assignment, which use write masking. See
2016 * ir_assignment for that.
2017 */
2018 const unsigned swz = brw_compose_swizzle(
2019 brw_swizzle_for_size(ir->type->vector_elements),
2020 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2021
2022 ir->val->accept(this);
2023 this->result = swizzle(this->result, swz);
2024 }
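/* An illustrative sketch of the composition above (not part of the original
 * source): for expr.zy, brw_swizzle_for_size(2) is XYYY; composing it with
 * the mask ZY.. yields ZYYY, so the channels beyond the type's width simply
 * replicate the last selected component rather than reading junk.
 */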
2025
2026 void
2027 vec4_visitor::visit(ir_dereference_variable *ir)
2028 {
2029 const struct glsl_type *type = ir->type;
2030 dst_reg *reg = variable_storage(ir->var);
2031
2032 if (!reg) {
2033 fail("Failed to find variable storage for %s\n", ir->var->name);
2034 this->result = src_reg(brw_null_reg());
2035 return;
2036 }
2037
2038 this->result = src_reg(*reg);
2039
2040 /* System values get their swizzle from the dst_reg writemask */
2041 if (ir->var->data.mode == ir_var_system_value)
2042 return;
2043
2044 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2045 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2046 }
2047
2048
2049 int
2050 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2051 {
2052 /* Under normal circumstances array elements are stored consecutively, so
2053 * the stride is equal to the size of the array element.
2054 */
2055 return type_size_vec4(ir->type);
2056 }
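/* For illustration (not part of the original source): an array of mat3
 * elements has a stride of 3, since each element occupies one vec4-sized
 * register per column, while arrays of float or vec4 have a stride of 1
 * because every element still takes a full vec4 slot.
 */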
2057
2058
2059 void
2060 vec4_visitor::visit(ir_dereference_array *ir)
2061 {
2062 ir_constant *constant_index;
2063 src_reg src;
2064 int array_stride = compute_array_stride(ir);
2065
2066 constant_index = ir->array_index->constant_expression_value();
2067
2068 ir->array->accept(this);
2069 src = this->result;
2070
2071 if (constant_index) {
2072 src.reg_offset += constant_index->value.i[0] * array_stride;
2073 } else {
2074 /* Variable index array dereference. It takes the "vec4" base of
2075 * the array and an index that offsets the Mesa register
2076 * index.
2077 */
2078 ir->array_index->accept(this);
2079
2080 src_reg index_reg;
2081
2082 if (array_stride == 1) {
2083 index_reg = this->result;
2084 } else {
2085 index_reg = src_reg(this, glsl_type::int_type);
2086
2087 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2088 }
2089
2090 if (src.reladdr) {
2091 src_reg temp = src_reg(this, glsl_type::int_type);
2092
2093 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2094
2095 index_reg = temp;
2096 }
2097
2098 src.reladdr = ralloc(mem_ctx, src_reg);
2099 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2100 }
2101
2102 /* If the type is smaller than a vec4, replicate the last channel out. */
2103 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105 else
2106 src.swizzle = BRW_SWIZZLE_NOOP;
2107 src.type = brw_type_for_base_type(ir->type);
2108
2109 this->result = src;
2110 }
2111
2112 void
2113 vec4_visitor::visit(ir_dereference_record *ir)
2114 {
2115 unsigned int i;
2116 const glsl_type *struct_type = ir->record->type;
2117 int offset = 0;
2118
2119 ir->record->accept(this);
2120
2121 for (i = 0; i < struct_type->length; i++) {
2122 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2123 break;
2124 offset += type_size_vec4(struct_type->fields.structure[i].type);
2125 }
2126
2127 /* If the type is smaller than a vec4, replicate the last channel out. */
2128 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2129 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2130 else
2131 this->result.swizzle = BRW_SWIZZLE_NOOP;
2132 this->result.type = brw_type_for_base_type(ir->type);
2133
2134 this->result.reg_offset += offset;
2135 }
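/* A worked example of the field-offset scan above (illustrative only): for
 * struct { vec3 a; mat4 b; float c; }, dereferencing .c accumulates
 * type_size_vec4(vec3) + type_size_vec4(mat4) == 1 + 4, so the result's
 * reg_offset advances five vec4 slots past the base of the struct.
 */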
2136
2137 /**
2138 * We want to be careful in assignment setup to hit the actual storage
2139 * instead of potentially using a temporary like we might with the
2140 * ir_dereference handler.
2141 */
2142 static dst_reg
2143 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2144 {
2145 /* The LHS must be a dereference. If the LHS is a variable indexed array
2146 * access of a vector, it must be separated into a series of conditional
2147 * before reaching this point (see ir_vec_index_to_cond_assign).
2148 */
2149 assert(ir->as_dereference());
2150 ir_dereference_array *deref_array = ir->as_dereference_array();
2151 if (deref_array) {
2152 assert(!deref_array->array->type->is_vector());
2153 }
2154
2155 /* Use the rvalue deref handler for the most part. We'll ignore
2156 * swizzles in it and write swizzles using writemask, though.
2157 */
2158 ir->accept(v);
2159 return dst_reg(v->result);
2160 }
2161
2162 void
2163 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2164 const struct glsl_type *type,
2165 enum brw_predicate predicate)
2166 {
2167 if (type->base_type == GLSL_TYPE_STRUCT) {
2168 for (unsigned int i = 0; i < type->length; i++) {
2169 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2170 }
2171 return;
2172 }
2173
2174 if (type->is_array()) {
2175 for (unsigned int i = 0; i < type->length; i++) {
2176 emit_block_move(dst, src, type->fields.array, predicate);
2177 }
2178 return;
2179 }
2180
2181 if (type->is_matrix()) {
2182 const struct glsl_type *vec_type;
2183
2184 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2185 type->vector_elements, 1);
2186
2187 for (int i = 0; i < type->matrix_columns; i++) {
2188 emit_block_move(dst, src, vec_type, predicate);
2189 }
2190 return;
2191 }
2192
2193 assert(type->is_scalar() || type->is_vector());
2194
2195 dst->type = brw_type_for_base_type(type);
2196 src->type = dst->type;
2197
2198 dst->writemask = (1 << type->vector_elements) - 1;
2199
2200 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2201
2202 vec4_instruction *inst = emit(MOV(*dst, *src));
2203 inst->predicate = predicate;
2204
2205 dst->reg_offset++;
2206 src->reg_offset++;
2207 }
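/* Illustrative note (not in the original source): for a mat2 the recursion
 * above bottoms out in two vec2 moves, each MOV using writemask .xy and
 * swizzle XYYY, with reg_offset advancing one vec4 slot per column on both
 * the source and the destination.
 */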
2208
2209
2210 /* If the RHS processing resulted in an instruction generating a
2211 * temporary value, and it would be easy to rewrite the instruction to
2212 * generate its result right into the LHS instead, do so. This ends
2213 * up reliably removing instructions where it can be tricky to do so
2214 * later without real UD chain information.
2215 */
2216 bool
2217 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2218 dst_reg dst,
2219 src_reg src,
2220 vec4_instruction *pre_rhs_inst,
2221 vec4_instruction *last_rhs_inst)
2222 {
2223 /* This could be supported, but it would take more smarts. */
2224 if (ir->condition)
2225 return false;
2226
2227 if (pre_rhs_inst == last_rhs_inst)
2228 return false; /* No instructions generated to work with. */
2229
2230 /* Make sure the last instruction generated our source reg. */
2231 if (src.file != GRF ||
2232 src.file != last_rhs_inst->dst.file ||
2233 src.reg != last_rhs_inst->dst.reg ||
2234 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2235 src.reladdr ||
2236 src.abs ||
2237 src.negate ||
2238 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2239 return false;
2240
2241 /* Check that the last instruction fully initialized the channels
2242 * we want to use, in the order we want to use them. We could
2243 * potentially reswizzle the operands of many instructions so that
2244 * we could handle out of order channels, but don't yet.
2245 */
2246
2247 for (unsigned i = 0; i < 4; i++) {
2248 if (dst.writemask & (1 << i)) {
2249 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2250 return false;
2251
2252 if (BRW_GET_SWZ(src.swizzle, i) != i)
2253 return false;
2254 }
2255 }
2256
2257 /* Success! Rewrite the instruction. */
2258 last_rhs_inst->dst.file = dst.file;
2259 last_rhs_inst->dst.reg = dst.reg;
2260 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2261 last_rhs_inst->dst.reladdr = dst.reladdr;
2262 last_rhs_inst->dst.writemask &= dst.writemask;
2263
2264 return true;
2265 }
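/* A sketch of what the rewrite saves (illustrative, not part of the original
 * source): for "dst.xy = a + b" the RHS visit emits an ADD into a temporary
 * GRF. Rather than following it with "MOV dst.xy, tmp", the ADD's
 * destination is retargeted at dst and its writemask narrowed to dst's, so
 * the copy is never emitted at all.
 */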
2266
2267 void
2268 vec4_visitor::visit(ir_assignment *ir)
2269 {
2270 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2271 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2272
2273 if (!ir->lhs->type->is_scalar() &&
2274 !ir->lhs->type->is_vector()) {
2275 ir->rhs->accept(this);
2276 src_reg src = this->result;
2277
2278 if (ir->condition) {
2279 emit_bool_to_cond_code(ir->condition, &predicate);
2280 }
2281
2282 /* emit_block_move doesn't account for swizzles in the source register.
2283 * This should be ok, since the source register is a structure or an
2284 * array, and those can't be swizzled. But double-check to be sure.
2285 */
2286 assert(src.swizzle ==
2287 (ir->rhs->type->is_matrix()
2288 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2289 : BRW_SWIZZLE_NOOP));
2290
2291 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2292 return;
2293 }
2294
2295 /* Now we're down to just a scalar/vector with writemasks. */
2296 int i;
2297
2298 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2299 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2300
2301 ir->rhs->accept(this);
2302
2303 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2304
2305 int swizzles[4];
2306 int src_chan = 0;
2307
2308 assert(ir->lhs->type->is_vector() ||
2309 ir->lhs->type->is_scalar());
2310 dst.writemask = ir->write_mask;
2311
2312 /* Swizzle a small RHS vector into the channels being written.
2313 *
2314 * glsl ir treats write_mask as dictating how many channels are
2315 * present on the RHS while in our instructions we need to make
2316 * those channels appear in the slots of the vec4 they're written to.
2317 */
2318 for (int i = 0; i < 4; i++)
2319 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2320
2321 src_reg src = swizzle(this->result,
2322 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2323 swizzles[2], swizzles[3]));
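/* A quick example of the remapping above (illustrative only): for
 * "v.yw = someVec2" the write mask is .yw, so swizzles[] becomes
 * {0, 0, 0, 1}; destination channel y then reads the RHS's .x and channel w
 * reads its .y, while the unwritten channels are ignored.
 */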
2324
2325 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2326 return;
2327 }
2328
2329 if (ir->condition) {
2330 emit_bool_to_cond_code(ir->condition, &predicate);
2331 }
2332
2333 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2334 vec4_instruction *inst = emit(MOV(dst, src));
2335 inst->predicate = predicate;
2336
2337 dst.reg_offset++;
2338 src.reg_offset++;
2339 }
2340 }
2341
2342 void
2343 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2344 {
2345 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2346 foreach_in_list(ir_constant, field_value, &ir->components) {
2347 emit_constant_values(dst, field_value);
2348 }
2349 return;
2350 }
2351
2352 if (ir->type->is_array()) {
2353 for (unsigned int i = 0; i < ir->type->length; i++) {
2354 emit_constant_values(dst, ir->array_elements[i]);
2355 }
2356 return;
2357 }
2358
2359 if (ir->type->is_matrix()) {
2360 for (int i = 0; i < ir->type->matrix_columns; i++) {
2361 float *vec = &ir->value.f[i * ir->type->vector_elements];
2362
2363 for (int j = 0; j < ir->type->vector_elements; j++) {
2364 dst->writemask = 1 << j;
2365 dst->type = BRW_REGISTER_TYPE_F;
2366
2367 emit(MOV(*dst, src_reg(vec[j])));
2368 }
2369 dst->reg_offset++;
2370 }
2371 return;
2372 }
2373
2374 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2375
2376 for (int i = 0; i < ir->type->vector_elements; i++) {
2377 if (!(remaining_writemask & (1 << i)))
2378 continue;
2379
2380 dst->writemask = 1 << i;
2381 dst->type = brw_type_for_base_type(ir->type);
2382
2383 /* Find other components that match the one we're about to
2384 * write. Emits fewer instructions for things like vec4(0.5,
2385 * 1.5, 1.5, 1.5).
2386 */
2387 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2388 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2389 if (ir->value.b[i] == ir->value.b[j])
2390 dst->writemask |= (1 << j);
2391 } else {
2392 /* u, i, and f storage all line up, so no need for a
2393 * switch case for comparing each type.
2394 */
2395 if (ir->value.u[i] == ir->value.u[j])
2396 dst->writemask |= (1 << j);
2397 }
2398 }
2399
2400 switch (ir->type->base_type) {
2401 case GLSL_TYPE_FLOAT:
2402 emit(MOV(*dst, src_reg(ir->value.f[i])));
2403 break;
2404 case GLSL_TYPE_INT:
2405 emit(MOV(*dst, src_reg(ir->value.i[i])));
2406 break;
2407 case GLSL_TYPE_UINT:
2408 emit(MOV(*dst, src_reg(ir->value.u[i])));
2409 break;
2410 case GLSL_TYPE_BOOL:
2411 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2412 break;
2413 default:
2414 unreachable("Non-float/uint/int/bool constant");
2415 }
2416
2417 remaining_writemask &= ~dst->writemask;
2418 }
2419 dst->reg_offset++;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_constant *ir)
2424 {
2425 dst_reg dst = dst_reg(this, ir->type);
2426 this->result = src_reg(dst);
2427
2428 emit_constant_values(&dst, ir);
2429 }
2430
2431 void
2432 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2433 {
2434 ir_dereference *deref = static_cast<ir_dereference *>(
2435 ir->actual_parameters.get_head());
2436 ir_variable *location = deref->variable_referenced();
2437 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2438 location->data.binding);
2439
2440 /* Calculate the surface offset */
2441 src_reg offset(this, glsl_type::uint_type);
2442 ir_dereference_array *deref_array = deref->as_dereference_array();
2443 if (deref_array) {
2444 deref_array->array_index->accept(this);
2445
2446 src_reg tmp(this, glsl_type::uint_type);
2447 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2448 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2449 } else {
2450 offset = location->data.atomic.offset;
2451 }
2452
2453 /* Emit the appropriate machine instruction */
2454 const char *callee = ir->callee->function_name();
2455 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2456
2457 if (!strcmp("__intrinsic_atomic_read", callee)) {
2458 emit_untyped_surface_read(surf_index, dst, offset);
2459
2460 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2461 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2462 src_reg(), src_reg());
2463
2464 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2466 src_reg(), src_reg());
2467 }
2468
2469 brw_mark_surface_used(stage_prog_data, surf_index);
2470 }
2471
2472 void
2473 vec4_visitor::visit(ir_call *ir)
2474 {
2475 const char *callee = ir->callee->function_name();
2476
2477 if (!strcmp("__intrinsic_atomic_read", callee) ||
2478 !strcmp("__intrinsic_atomic_increment", callee) ||
2479 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2480 visit_atomic_counter_intrinsic(ir);
2481 } else {
2482 unreachable("Unsupported intrinsic.");
2483 }
2484 }
2485
2486 src_reg
2487 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2488 src_reg coordinate, src_reg sampler)
2489 {
2490 vec4_instruction *inst =
2491 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2492 dst_reg(this, glsl_type::uvec4_type));
2493 inst->base_mrf = 2;
2494 inst->src[1] = sampler;
2495
2496 int param_base;
2497
2498 if (devinfo->gen >= 9) {
2499 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2500 vec4_instruction *header_inst = new(mem_ctx)
2501 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2502 dst_reg(MRF, inst->base_mrf));
2503
2504 emit(header_inst);
2505
2506 inst->mlen = 2;
2507 inst->header_size = 1;
2508 param_base = inst->base_mrf + 1;
2509 } else {
2510 inst->mlen = 1;
2511 param_base = inst->base_mrf;
2512 }
2513
2514 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2515 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2516 int zero_mask = 0xf & ~coord_mask;
2517
2518 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2519 coordinate));
2520
2521 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2522 src_reg(0)));
2523
2524 emit(inst);
2525 return src_reg(inst->dst);
2526 }
2527
2528 bool
2529 vec4_visitor::is_high_sampler(src_reg sampler)
2530 {
2531 if (devinfo->gen < 8 && !devinfo->is_haswell)
2532 return false;
2533
2534 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2535 }
2536
2537 void
2538 vec4_visitor::emit_texture(ir_texture_opcode op,
2539 dst_reg dest,
2540 const glsl_type *dest_type,
2541 src_reg coordinate,
2542 int coord_components,
2543 src_reg shadow_comparitor,
2544 src_reg lod, src_reg lod2,
2545 src_reg sample_index,
2546 uint32_t constant_offset,
2547 src_reg offset_value,
2548 src_reg mcs,
2549 bool is_cube_array,
2550 uint32_t sampler,
2551 src_reg sampler_reg)
2552 {
2553 enum opcode opcode;
2554 switch (op) {
2555 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2556 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2557 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2558 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2559 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2560 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2561 case ir_tg4: opcode = offset_value.file != BAD_FILE
2562 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2563 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2564 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2565 case ir_txb:
2566 unreachable("TXB is not valid for vertex shaders.");
2567 case ir_lod:
2568 unreachable("LOD is not valid for vertex shaders.");
2569 default:
2570 unreachable("Unrecognized tex op");
2571 }
2572
2573 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2574 opcode, dst_reg(this, dest_type));
2575
2576 inst->offset = constant_offset;
2577
2578 /* The message header is necessary for:
2579 * - Gen4 (always)
2580 * - Gen9+ for selecting SIMD4x2
2581 * - Texel offsets
2582 * - Gather channel selection
2583 * - Sampler indices too large to fit in a 4-bit value.
2584 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2585 */
2586 inst->header_size =
2587 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2588 inst->offset != 0 || op == ir_tg4 ||
2589 op == ir_texture_samples ||
2590 is_high_sampler(sampler_reg)) ? 1 : 0;
2591 inst->base_mrf = 2;
2592 inst->mlen = inst->header_size;
2593 inst->dst.writemask = WRITEMASK_XYZW;
2594 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2595
2596 inst->src[1] = sampler_reg;
2597
2598 /* MRF for the first parameter */
2599 int param_base = inst->base_mrf + inst->header_size;
2600
2601 if (op == ir_txs || op == ir_query_levels) {
2602 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2603 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2604 inst->mlen++;
2605 } else if (op == ir_texture_samples) {
2606 inst->dst.writemask = WRITEMASK_X;
2607 } else {
2608 /* Load the coordinate */
2609 /* FINISHME: gl_clamp_mask and saturate */
2610 int coord_mask = (1 << coord_components) - 1;
2611 int zero_mask = 0xf & ~coord_mask;
2612
2613 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2614 coordinate));
2615 inst->mlen++;
2616
2617 if (zero_mask != 0) {
2618 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2619 src_reg(0)));
2620 }
2621 /* Load the shadow comparitor */
2622 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2623 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2624 WRITEMASK_X),
2625 shadow_comparitor));
2626 inst->mlen++;
2627 }
2628
2629 /* Load the LOD info */
2630 if (op == ir_tex || op == ir_txl) {
2631 int mrf, writemask;
2632 if (devinfo->gen >= 5) {
2633 mrf = param_base + 1;
2634 if (shadow_comparitor.file != BAD_FILE) {
2635 writemask = WRITEMASK_Y;
2636 /* mlen already incremented */
2637 } else {
2638 writemask = WRITEMASK_X;
2639 inst->mlen++;
2640 }
2641 } else /* devinfo->gen == 4 */ {
2642 mrf = param_base;
2643 writemask = WRITEMASK_W;
2644 }
2645 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2646 } else if (op == ir_txf) {
2647 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2648 } else if (op == ir_txf_ms) {
2649 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2650 sample_index));
2651 if (devinfo->gen >= 7) {
2652 /* MCS data is in the first channel of `mcs`, but we need to get it into
2653 * the .y channel of the second vec4 of params, so replicate .x across
2654 * the whole vec4 and then mask off everything except .y
2655 */
2656 mcs.swizzle = BRW_SWIZZLE_XXXX;
2657 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2658 mcs));
2659 }
2660 inst->mlen++;
2661 } else if (op == ir_txd) {
2662 const brw_reg_type type = lod.type;
2663
2664 if (devinfo->gen >= 5) {
2665 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2666 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2667 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2669 inst->mlen++;
2670
2671 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2672 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2673 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2674 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2675 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2676 inst->mlen++;
2677
2678 if (shadow_comparitor.file != BAD_FILE) {
2679 emit(MOV(dst_reg(MRF, param_base + 2,
2680 shadow_comparitor.type, WRITEMASK_Z),
2681 shadow_comparitor));
2682 }
2683 }
2684 } else /* devinfo->gen == 4 */ {
2685 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2686 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2687 inst->mlen += 2;
2688 }
2689 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2690 if (shadow_comparitor.file != BAD_FILE) {
2691 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2692 shadow_comparitor));
2693 }
2694
2695 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2696 offset_value));
2697 inst->mlen++;
2698 }
2699 }
2700
2701 emit(inst);
2702
2703 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2704 * spec requires layers.
2705 */
2706 if (op == ir_txs && is_cube_array) {
2707 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2708 writemask(inst->dst, WRITEMASK_Z),
2709 src_reg(inst->dst), src_reg(6));
2710 }
2711
2712 if (devinfo->gen == 6 && op == ir_tg4) {
2713 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2714 }
2715
2716 swizzle_result(op, dest,
2717 src_reg(inst->dst), sampler, dest_type);
2718 }
2719
2720 void
2721 vec4_visitor::visit(ir_texture *ir)
2722 {
2723 uint32_t sampler =
2724 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2725
2726 ir_rvalue *nonconst_sampler_index =
2727 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2728
2729 /* Handle non-constant sampler array indexing */
2730 src_reg sampler_reg;
2731 if (nonconst_sampler_index) {
2732 /* The highest sampler which may be used by this operation is
2733 * the last element of the array. Mark it here, because the generator
2734 * doesn't have enough information to determine the bound.
2735 */
2736 uint32_t array_size = ir->sampler->as_dereference_array()
2737 ->array->type->array_size();
2738
2739 uint32_t max_used = sampler + array_size - 1;
2740 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2741 max_used += prog_data->base.binding_table.gather_texture_start;
2742 } else {
2743 max_used += prog_data->base.binding_table.texture_start;
2744 }
2745
2746 brw_mark_surface_used(&prog_data->base, max_used);
2747
2748 /* Emit code to evaluate the actual indexing expression */
2749 nonconst_sampler_index->accept(this);
2750 src_reg temp(this, glsl_type::uint_type);
2751 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2752 sampler_reg = emit_uniformize(temp);
2753 } else {
2754 /* Single sampler, or constant array index; the indexing expression
2755 * is just an immediate.
2756 */
2757 sampler_reg = src_reg(sampler);
2758 }
2759
2760 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2761 * emitting anything other than setting up the constant result.
2762 */
2763 if (ir->op == ir_tg4) {
2764 ir_constant *chan = ir->lod_info.component->as_constant();
2765 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2766 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2767 dst_reg result(this, ir->type);
2768 this->result = src_reg(result);
2769 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2770 return;
2771 }
2772 }
2773
2774 /* Should be lowered by do_lower_texture_projection */
2775 assert(!ir->projector);
2776
2777 /* Should be lowered */
2778 assert(!ir->offset || !ir->offset->type->is_array());
2779
2780 /* Generate code to compute all the subexpression trees. This has to be
2781 * done before loading any values into MRFs for the sampler message since
2782 * generating these values may involve SEND messages that need the MRFs.
2783 */
2784 src_reg coordinate;
2785 int coord_components = 0;
2786 if (ir->coordinate) {
2787 coord_components = ir->coordinate->type->vector_elements;
2788 ir->coordinate->accept(this);
2789 coordinate = this->result;
2790 }
2791
2792 src_reg shadow_comparitor;
2793 if (ir->shadow_comparitor) {
2794 ir->shadow_comparitor->accept(this);
2795 shadow_comparitor = this->result;
2796 }
2797
2798 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2799 src_reg offset_value;
2800 if (has_nonconstant_offset) {
2801 ir->offset->accept(this);
2802 offset_value = src_reg(this->result);
2803 }
2804
2805 src_reg lod, lod2, sample_index, mcs;
2806 switch (ir->op) {
2807 case ir_tex:
2808 lod = src_reg(0.0f);
2809 break;
2810 case ir_txf:
2811 case ir_txl:
2812 case ir_txs:
2813 ir->lod_info.lod->accept(this);
2814 lod = this->result;
2815 break;
2816 case ir_query_levels:
2817 lod = src_reg(0);
2818 break;
2819 case ir_txf_ms:
2820 ir->lod_info.sample_index->accept(this);
2821 sample_index = this->result;
2822
2823 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2824 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2825 else
2826 mcs = src_reg(0u);
2827 break;
2828 case ir_txd:
2829 ir->lod_info.grad.dPdx->accept(this);
2830 lod = this->result;
2831
2832 ir->lod_info.grad.dPdy->accept(this);
2833 lod2 = this->result;
2834 break;
2835 case ir_txb:
2836 case ir_lod:
2837 case ir_tg4:
2838 case ir_texture_samples:
2839 break;
2840 }
2841
2842 uint32_t constant_offset = 0;
2843 if (ir->offset != NULL && !has_nonconstant_offset) {
2844 constant_offset =
2845 brw_texture_offset(ir->offset->as_constant()->value.i,
2846 ir->offset->type->vector_elements);
2847 }
2848
2849 /* Stuff the channel select bits in the top of the texture offset */
2850 if (ir->op == ir_tg4)
2851 constant_offset |=
2852 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2853 sampler) << 16;
2854
2855 glsl_type const *type = ir->sampler->type;
2856 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2857 type->sampler_array;
2858
2859 this->result = src_reg(this, ir->type);
2860 dst_reg dest = dst_reg(this->result);
2861
2862 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2863 shadow_comparitor,
2864 lod, lod2, sample_index,
2865 constant_offset, offset_value,
2866 mcs, is_cube_array, sampler, sampler_reg);
2867 }
2868
2869 /**
2870 * Apply workarounds for Gen6 gather with UINT/SINT
2871 */
2872 void
2873 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2874 {
2875 if (!wa)
2876 return;
2877
2878 int width = (wa & WA_8BIT) ? 8 : 16;
2879 dst_reg dst_f = dst;
2880 dst_f.type = BRW_REGISTER_TYPE_F;
2881
2882 /* Convert from UNORM to UINT */
2883 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2884 emit(MOV(dst, src_reg(dst_f)));
2885
2886 if (wa & WA_SIGN) {
2887 /* Reinterpret the UINT value as a signed INT value by
2888 * shifting the sign bit into place, then shifting back
2889 * preserving sign.
2890 */
2891 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2892 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2893 }
2894 }
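/* A worked example of the fixup above (illustrative, not part of the
 * original source): for an 8-bit SINT format the gather returns UNORM data,
 * so a raw byte of 0xc8 comes back as 200/255. The MUL/MOV pair converts
 * that back to the integer 200, and the SHL/ASR by 24 then sign-extends it
 * to the intended value -56.
 */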
2895
2896 /**
2897 * Set up the gather channel based on the swizzle, for gather4.
2898 */
2899 uint32_t
2900 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2901 {
2902 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2903 switch (swiz) {
2904 case SWIZZLE_X: return 0;
2905 case SWIZZLE_Y:
2906 /* gather4 sampler is broken for green channel on RG32F --
2907 * we must ask for blue instead.
2908 */
2909 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2910 return 2;
2911 return 1;
2912 case SWIZZLE_Z: return 2;
2913 case SWIZZLE_W: return 3;
2914 default:
2915 unreachable("Not reached"); /* zero, one swizzles handled already */
2916 }
2917 }
2918
2919 void
2920 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2921 src_reg orig_val, uint32_t sampler,
2922 const glsl_type *dest_type)
2923 {
2924 int s = key_tex->swizzles[sampler];
2925
2926 dst_reg swizzled_result = dest;
2927
2928 if (op == ir_query_levels) {
2929 /* # levels is in .w */
2930 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2931 emit(MOV(swizzled_result, orig_val));
2932 return;
2933 }
2934
2935 if (op == ir_txs || dest_type == glsl_type::float_type
2936 || s == SWIZZLE_NOOP || op == ir_tg4) {
2937 emit(MOV(swizzled_result, orig_val));
2938 return;
2939 }
2940
2941
2942 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2943 int swizzle[4] = {0};
2944
2945 for (int i = 0; i < 4; i++) {
2946 switch (GET_SWZ(s, i)) {
2947 case SWIZZLE_ZERO:
2948 zero_mask |= (1 << i);
2949 break;
2950 case SWIZZLE_ONE:
2951 one_mask |= (1 << i);
2952 break;
2953 default:
2954 copy_mask |= (1 << i);
2955 swizzle[i] = GET_SWZ(s, i);
2956 break;
2957 }
2958 }
2959
2960 if (copy_mask) {
2961 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2962 swizzled_result.writemask = copy_mask;
2963 emit(MOV(swizzled_result, orig_val));
2964 }
2965
2966 if (zero_mask) {
2967 swizzled_result.writemask = zero_mask;
2968 emit(MOV(swizzled_result, src_reg(0.0f)));
2969 }
2970
2971 if (one_mask) {
2972 swizzled_result.writemask = one_mask;
2973 emit(MOV(swizzled_result, src_reg(1.0f)));
2974 }
2975 }
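/* Illustrative example (not part of the original source): with a texture
 * swizzle of GREEN/RED/ZERO/ONE, copy_mask covers .xy with the source
 * swizzled YXXX, zero_mask is .z and one_mask is .w, so the swizzled result
 * is assembled from at most three MOVs.
 */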
2976
2977 void
2978 vec4_visitor::visit(ir_return *)
2979 {
2980 unreachable("not reached");
2981 }
2982
2983 void
2984 vec4_visitor::visit(ir_discard *)
2985 {
2986 unreachable("not reached");
2987 }
2988
2989 void
2990 vec4_visitor::visit(ir_if *ir)
2991 {
2992 /* Don't point the annotation at the if statement, because then it plus
2993 * the then and else blocks get printed.
2994 */
2995 this->base_ir = ir->condition;
2996
2997 if (devinfo->gen == 6) {
2998 emit_if_gen6(ir);
2999 } else {
3000 enum brw_predicate predicate;
3001 emit_bool_to_cond_code(ir->condition, &predicate);
3002 emit(IF(predicate));
3003 }
3004
3005 visit_instructions(&ir->then_instructions);
3006
3007 if (!ir->else_instructions.is_empty()) {
3008 this->base_ir = ir->condition;
3009 emit(BRW_OPCODE_ELSE);
3010
3011 visit_instructions(&ir->else_instructions);
3012 }
3013
3014 this->base_ir = ir->condition;
3015 emit(BRW_OPCODE_ENDIF);
3016 }
3017
3018 void
3019 vec4_visitor::gs_emit_vertex(int stream_id)
3020 {
3021 unreachable("not reached");
3022 }
3023
3024 void
3025 vec4_visitor::visit(ir_emit_vertex *)
3026 {
3027 unreachable("not reached");
3028 }
3029
3030 void
3031 vec4_visitor::gs_end_primitive()
3032 {
3033 unreachable("not reached");
3034 }
3035
3036
3037 void
3038 vec4_visitor::visit(ir_end_primitive *)
3039 {
3040 unreachable("not reached");
3041 }
3042
3043 void
3044 vec4_visitor::visit(ir_barrier *)
3045 {
3046 unreachable("not reached");
3047 }
3048
3049 void
3050 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3051 dst_reg dst, src_reg offset,
3052 src_reg src0, src_reg src1)
3053 {
3054 unsigned mlen = 0;
3055
3056 /* Set the atomic operation offset. */
3057 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3058 mlen++;
3059
3060 /* Set the atomic operation arguments. */
3061 if (src0.file != BAD_FILE) {
3062 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3063 mlen++;
3064 }
3065
3066 if (src1.file != BAD_FILE) {
3067 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3068 mlen++;
3069 }
3070
3071 /* Emit the instruction. Note that this maps to the normal SIMD8
3072 * untyped atomic message on Ivy Bridge, but that's OK because
3073 * unused channels will be masked out.
3074 */
3075 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3076 brw_message_reg(0),
3077 src_reg(surf_index), src_reg(atomic_op));
3078 inst->mlen = mlen;
3079 }
3080
3081 void
3082 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3083 src_reg offset)
3084 {
3085 /* Set the surface read offset. */
3086 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3087
3088 /* Emit the instruction. Note that this maps to the normal SIMD8
3089 * untyped surface read message, but that's OK because unused
3090 * channels will be masked out.
3091 */
3092 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3093 brw_message_reg(0),
3094 src_reg(surf_index), src_reg(1));
3095 inst->mlen = 1;
3096 }
3097
3098 void
3099 vec4_visitor::emit_ndc_computation()
3100 {
3101 /* Get the position */
3102 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3103
3104 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3105 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3106 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3107
3108 current_annotation = "NDC";
3109 dst_reg ndc_w = ndc;
3110 ndc_w.writemask = WRITEMASK_W;
3111 src_reg pos_w = pos;
3112 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3113 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3114
3115 dst_reg ndc_xyz = ndc;
3116 ndc_xyz.writemask = WRITEMASK_XYZ;
3117
3118 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3119 }
3120
3121 void
3122 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3123 {
3124 if (devinfo->gen < 6 &&
3125 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3126 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3127 devinfo->has_negative_rhw_bug)) {
3128 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3129 dst_reg header1_w = header1;
3130 header1_w.writemask = WRITEMASK_W;
3131
3132 emit(MOV(header1, 0u));
3133
3134 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3135 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3136
3137 current_annotation = "Point size";
3138 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3139 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3140 }
3141
3142 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3143 current_annotation = "Clipping flags";
3144 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3145 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3146
3147 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3148 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3149 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3150
3151 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3152 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3153 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3154 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3155 }
3156
3157 /* i965 clipping workaround:
3158 * 1) Test for -ve rhw
3159 * 2) If set,
3160 * set ndc = (0,0,0,0)
3161 * set ucp[6] = 1
3162 *
3163 * Later, clipping will detect ucp[6] and ensure the primitive is
3164 * clipped against all fixed planes.
3165 */
3166 if (devinfo->has_negative_rhw_bug) {
3167 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3168 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3169 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3170 vec4_instruction *inst;
3171 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3172 inst->predicate = BRW_PREDICATE_NORMAL;
3173 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3174 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3175 inst->predicate = BRW_PREDICATE_NORMAL;
3176 }
3177
3178 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3179 } else if (devinfo->gen < 6) {
3180 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3181 } else {
3182 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3183 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3184 dst_reg reg_w = reg;
3185 reg_w.writemask = WRITEMASK_W;
3186 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3187 reg_as_src.type = reg_w.type;
3188 reg_as_src.swizzle = brw_swizzle_for_size(1);
3189 emit(MOV(reg_w, reg_as_src));
3190 }
3191 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3192 dst_reg reg_y = reg;
3193 reg_y.writemask = WRITEMASK_Y;
3194 reg_y.type = BRW_REGISTER_TYPE_D;
3195 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3196 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3197 }
3198 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3199 dst_reg reg_z = reg;
3200 reg_z.writemask = WRITEMASK_Z;
3201 reg_z.type = BRW_REGISTER_TYPE_D;
3202 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3203 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3204 }
3205 }
3206 }
3207
3208 vec4_instruction *
3209 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3210 {
3211 assert(varying < VARYING_SLOT_MAX);
3212 assert(output_reg[varying].type == reg.type);
3213 current_annotation = output_reg_annotation[varying];
3214 /* Copy the register, saturating if necessary */
3215 return emit(MOV(reg, src_reg(output_reg[varying])));
3216 }
3217
3218 void
3219 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3220 {
3221 reg.type = BRW_REGISTER_TYPE_F;
3222 output_reg[varying].type = reg.type;
3223
3224 switch (varying) {
3225 case VARYING_SLOT_PSIZ:
3226 {
3227 /* PSIZ is always in slot 0, and is coupled with other flags. */
3228 current_annotation = "indices, point width, clip flags";
3229 emit_psiz_and_flags(reg);
3230 break;
3231 }
3232 case BRW_VARYING_SLOT_NDC:
3233 current_annotation = "NDC";
3234 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3235 break;
3236 case VARYING_SLOT_POS:
3237 current_annotation = "gl_Position";
3238 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3239 break;
3240 case VARYING_SLOT_EDGE:
3241 /* This is present when doing unfilled polygons. We're supposed to copy
3242 * the edge flag from the user-provided vertex array
3243 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3244 * of that attribute (starts as 1.0f). This is then used in clipping to
3245 * determine which edges should be drawn as wireframe.
3246 */
3247 current_annotation = "edge flag";
3248 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3249 glsl_type::float_type, WRITEMASK_XYZW))));
3250 break;
3251 case BRW_VARYING_SLOT_PAD:
3252 /* No need to write to this slot */
3253 break;
3254 default:
3255 emit_generic_urb_slot(reg, varying);
3256 break;
3257 }
3258 }
3259
3260 static int
3261 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3262 {
3263 if (devinfo->gen >= 6) {
3264 /* URB data written (does not include the message header reg) must
3265 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3266 * section 5.4.3.2.2: URB_INTERLEAVED.
3267 *
3268 * URB entries are allocated on a multiple of 1024 bits, so an
3269 * extra 128 bits written here to make the end align to 256 is
3270 * no problem.
3271 */
3272 if ((mlen % 2) != 1)
3273 mlen++;
3274 }
3275
3276 return mlen;
3277 }
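/* A quick sanity check of the rounding above (illustrative only): mlen
 * includes the single message header register, so the URB data itself is
 * mlen - 1 registers. A length of 4 (header plus 3 data registers) is
 * bumped to 5 so that an even number of data registers is written, as
 * URB_INTERLEAVED requires.
 */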
3278
3279
3280 /**
3281 * Generates the VUE payload plus the necessary URB write instructions to
3282 * output it.
3283 *
3284 * The VUE layout is documented in Volume 2a.
3285 */
3286 void
3287 vec4_visitor::emit_vertex()
3288 {
3289 /* MRF 0 is reserved for the debugger, so start with message header
3290 * in MRF 1.
3291 */
3292 int base_mrf = 1;
3293 int mrf = base_mrf;
3294 /* In the process of generating our URB write message contents, we
3295 * may need to unspill a register or load from an array. Those
3296 * reads would use MRFs 14-15.
3297 */
3298 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3299
3300 /* The following assertion verifies that max_usable_mrf causes an
3301 * even-numbered amount of URB write data, which will meet gen6's
3302 * requirements for length alignment.
3303 */
3304 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3305
3306 /* First mrf is the g0-based message header containing URB handles and
3307 * such.
3308 */
3309 emit_urb_write_header(mrf++);
3310
3311 if (devinfo->gen < 6) {
3312 emit_ndc_computation();
3313 }
3314
3315 /* We may need to split this up into several URB writes, so do them in a
3316 * loop.
3317 */
3318 int slot = 0;
3319 bool complete = false;
3320 do {
3321 /* URB offset is in URB row increments, and each of our MRFs is half of
3322 * one of those, since we're doing interleaved writes.
3323 */
3324 int offset = slot / 2;
3325
3326 mrf = base_mrf + 1;
3327 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3328 emit_urb_slot(dst_reg(MRF, mrf++),
3329 prog_data->vue_map.slot_to_varying[slot]);
3330
3331 /* If this was max_usable_mrf, we can't fit anything more into this
3332 * URB WRITE. Same thing if we reached the maximum length available.
3333 */
3334 if (mrf > max_usable_mrf ||
3335 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3336 slot++;
3337 break;
3338 }
3339 }
3340
3341 complete = slot >= prog_data->vue_map.num_slots;
3342 current_annotation = "URB write";
3343 vec4_instruction *inst = emit_urb_write_opcode(complete);
3344 inst->base_mrf = base_mrf;
3345 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3346 inst->offset += offset;
3347 } while(!complete);
3348 }
3349
3350
3351 src_reg
3352 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3353 src_reg *reladdr, int reg_offset)
3354 {
3355 /* Because we store the values to scratch interleaved like our
3356 * vertex data, we need to scale the vec4 index by 2.
3357 */
3358 int message_header_scale = 2;
3359
3360 /* Pre-gen6, the message header uses byte offsets instead of vec4
3361 * (16-byte) offset units.
3362 */
3363 if (devinfo->gen < 6)
3364 message_header_scale *= 16;
3365
3366 if (reladdr) {
3367 src_reg index = src_reg(this, glsl_type::int_type);
3368
3369 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3370 src_reg(reg_offset)));
3371 emit_before(block, inst, MUL(dst_reg(index), index,
3372 src_reg(message_header_scale)));
3373
3374 return index;
3375 } else {
3376 return src_reg(reg_offset * message_header_scale);
3377 }
3378 }
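/* Illustrative example (not part of the original source): on Gen6 and later
 * a constant reg_offset of 3 just yields the immediate 6, i.e. the vec4 row
 * doubled for the interleaved layout, while on Gen4/5 the same offset
 * becomes 3 * 32 = 96 because the message header addresses scratch in bytes.
 */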
3379
3380 src_reg
3381 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3382 src_reg *reladdr, int reg_offset)
3383 {
3384 if (reladdr) {
3385 src_reg index = src_reg(this, glsl_type::int_type);
3386
3387 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3388 src_reg(reg_offset)));
3389
3390 /* Pre-gen6, the message header uses byte offsets instead of vec4
3391 * (16-byte) offset units.
3392 */
3393 if (devinfo->gen < 6) {
3394 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3395 }
3396
3397 return index;
3398 } else if (devinfo->gen >= 8) {
3399 /* Store the offset in a GRF so we can send-from-GRF. */
3400 src_reg offset = src_reg(this, glsl_type::int_type);
3401 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3402 return offset;
3403 } else {
3404 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3405 return src_reg(reg_offset * message_header_scale);
3406 }
3407 }
3408
3409 /**
3410 * Emits an instruction before @inst to load the value named by @orig_src
3411 * from scratch space at @base_offset to @temp.
3412 *
3413 * @base_offset is measured in 32-byte units (the size of a register).
3414 */
3415 void
3416 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3417 dst_reg temp, src_reg orig_src,
3418 int base_offset)
3419 {
3420 int reg_offset = base_offset + orig_src.reg_offset;
3421 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3422 reg_offset);
3423
3424 emit_before(block, inst, SCRATCH_READ(temp, index));
3425 }
3426
3427 /**
3428 * Emits an instruction after @inst to store the value to be written
3429 * to @orig_dst to scratch space at @base_offset, from @temp.
3430 *
3431 * @base_offset is measured in 32-byte units (the size of a register).
3432 */
3433 void
3434 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3435 int base_offset)
3436 {
3437 int reg_offset = base_offset + inst->dst.reg_offset;
3438 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3439 reg_offset);
3440
3441 /* Create a temporary register to store *inst's result in.
3442 *
3443 * We have to be careful in MOVing from our temporary result register in
3444 * the scratch write. If we swizzle from channels of the temporary that
3445 * weren't initialized, it will confuse live interval analysis, which will
3446 * make spilling fail to make progress.
3447 */
3448 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3449 inst->dst.type),
3450 brw_swizzle_for_mask(inst->dst.writemask));
3451 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3452 inst->dst.writemask));
3453 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3454 if (inst->opcode != BRW_OPCODE_SEL)
3455 write->predicate = inst->predicate;
3456 write->ir = inst->ir;
3457 write->annotation = inst->annotation;
3458 inst->insert_after(block, write);
3459
3460 inst->dst.file = temp.file;
3461 inst->dst.reg = temp.reg;
3462 inst->dst.reg_offset = temp.reg_offset;
3463 inst->dst.reladdr = NULL;
3464 }
3465
3466 /**
3467 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3468 * adds the scratch read(s) before \p inst. The function also checks for
3469 * recursive reladdr scratch accesses, issuing the corresponding scratch
3470 * loads and rewriting reladdr references accordingly.
3471 *
3472 * \return \p src if it did not require a scratch load, otherwise, the
3473 * register holding the result of the scratch load that the caller should
3474 * use to rewrite src.
3475 */
3476 src_reg
3477 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3478 vec4_instruction *inst, src_reg src)
3479 {
3480 /* Resolve recursive reladdr scratch access by calling ourselves
3481 * with src.reladdr
3482 */
3483 if (src.reladdr)
3484 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3485 *src.reladdr);
3486
3487 /* Now handle scratch access on src */
3488 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3489 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3490 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3491 src.reg = temp.reg;
3492 src.reg_offset = temp.reg_offset;
3493 src.reladdr = NULL;
3494 }
3495
3496 return src;
3497 }
3498
3499 /**
3500 * We can't generally support array access in GRF space, because a
3501 * single instruction's destination can only span 2 contiguous
3502 * registers. So, we send all GRF arrays that get variable index
3503 * access to scratch space.
3504 */
3505 void
3506 vec4_visitor::move_grf_array_access_to_scratch()
3507 {
3508 int scratch_loc[this->alloc.count];
3509 memset(scratch_loc, -1, sizeof(scratch_loc));
3510
3511 /* First, calculate the set of virtual GRFs that need to be punted
3512 * to scratch due to having any array access on them, and where in
3513 * scratch.
3514 */
3515 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3516 if (inst->dst.file == GRF && inst->dst.reladdr) {
3517 if (scratch_loc[inst->dst.reg] == -1) {
3518 scratch_loc[inst->dst.reg] = last_scratch;
3519 last_scratch += this->alloc.sizes[inst->dst.reg];
3520 }
3521
3522 for (src_reg *iter = inst->dst.reladdr;
3523 iter->reladdr;
3524 iter = iter->reladdr) {
3525 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3526 scratch_loc[iter->reg] = last_scratch;
3527 last_scratch += this->alloc.sizes[iter->reg];
3528 }
3529 }
3530 }
3531
3532 for (int i = 0 ; i < 3; i++) {
3533 for (src_reg *iter = &inst->src[i];
3534 iter->reladdr;
3535 iter = iter->reladdr) {
3536 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3537 scratch_loc[iter->reg] = last_scratch;
3538 last_scratch += this->alloc.sizes[iter->reg];
3539 }
3540 }
3541 }
3542 }
3543
3544 /* Now, for anything that will be accessed through scratch, rewrite
3545 * it to load/store. Note that this is a _safe list walk, because
3546 * we may generate a new scratch_write instruction after the one
3547 * we're processing.
3548 */
3549 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3550 /* Set up the annotation tracking for new generated instructions. */
3551 base_ir = inst->ir;
3552 current_annotation = inst->annotation;
3553
3554 /* First handle scratch access on the dst. Notice we have to handle
3555 * the case where the dst's reladdr also points to scratch space.
3556 */
3557 if (inst->dst.reladdr)
3558 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3559 *inst->dst.reladdr);
3560
3561 /* Now that we have handled any (possibly recursive) reladdr scratch
3562 * accesses for dst we can safely do the scratch write for dst itself
3563 */
3564 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3565 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3566
3567 /* Now handle scratch access on any src. In this case, since inst->src[i]
3568 * already is a src_reg, we can just call emit_resolve_reladdr with
3569 * inst->src[i] and it will take care of handling scratch loads for
3570 * both src and src.reladdr (recursively).
3571 */
3572 for (int i = 0 ; i < 3; i++) {
3573 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3574 inst->src[i]);
3575 }
3576 }
3577 }
3578
3579 /**
3580 * Emits an instruction before @inst to load the value named by @orig_src
3581 * from the pull constant buffer (surface) at @base_offset to @temp.
3582 */
3583 void
3584 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3585 dst_reg temp, src_reg orig_src,
3586 int base_offset)
3587 {
3588 int reg_offset = base_offset + orig_src.reg_offset;
3589 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3590 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3591 reg_offset);
3592
3593 emit_pull_constant_load_reg(temp,
3594 index,
3595 offset,
3596 block, inst);
3597 }
3598
3599 /**
3600 * Implements array access of uniforms by inserting a
3601 * PULL_CONSTANT_LOAD instruction.
3602 *
3603 * Unlike temporary GRF array access (where we don't support it due to
3604 * the difficulty of doing relative addressing on instruction
3605 * destinations), we could potentially do array access of uniforms
3606 * that were loaded in GRF space as push constants. In real-world
3607 * usage we've seen, though, the arrays being used are always larger
3608 * than we could load as push constants, so just always move all
3609 * uniform array access out to a pull constant buffer.
3610 */
3611 void
3612 vec4_visitor::move_uniform_array_access_to_pull_constants()
3613 {
3614 int pull_constant_loc[this->uniforms];
3615 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3616 bool nested_reladdr;
3617
3618 /* Walk through and find array access of uniforms. Put a copy of that
3619 * uniform in the pull constant buffer.
3620 *
3621 * Note that we don't move constant-indexed accesses to arrays. No
3622 * testing has been done of the performance impact of this choice.
3623 */
3624 do {
3625 nested_reladdr = false;
3626
3627 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3628 for (int i = 0 ; i < 3; i++) {
3629 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3630 continue;
3631
3632 int uniform = inst->src[i].reg;
3633
3634 if (inst->src[i].reladdr->reladdr)
3635 nested_reladdr = true; /* will need another pass */
3636
3637 /* If this array isn't already present in the pull constant buffer,
3638 * add it.
3639 */
3640 if (pull_constant_loc[uniform] == -1) {
3641 const gl_constant_value **values =
3642 &stage_prog_data->param[uniform * 4];
3643
3644 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3645
3646 assert(uniform < uniform_array_size);
3647 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3648 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3649 = values[j];
3650 }
3651 }
3652
3653 /* Set up the annotation tracking for new generated instructions. */
3654 base_ir = inst->ir;
3655 current_annotation = inst->annotation;
3656
3657 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3658
3659 emit_pull_constant_load(block, inst, temp, inst->src[i],
3660 pull_constant_loc[uniform]);
3661
3662 inst->src[i].file = temp.file;
3663 inst->src[i].reg = temp.reg;
3664 inst->src[i].reg_offset = temp.reg_offset;
3665 inst->src[i].reladdr = NULL;
3666 }
3667 }
3668 } while (nested_reladdr);
3669
3670 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3671 * there is no need to track them as larger-than-vec4 objects. This is
3672 * relied on when cutting unused uniform vectors out of the push
3673 * constants.
3674 */
3675 split_uniform_registers();
3676 }
3677
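/**
 * Resolves a negation modifier on an unsigned (UD) source.
 *
 * Rather than leaving the negate flag on a UD-typed register for later
 * instructions to deal with, apply it now by copying the source through a
 * MOV into a fresh unsigned temporary and using that temporary instead.
 */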
3678 void
3679 vec4_visitor::resolve_ud_negate(src_reg *reg)
3680 {
3681 if (reg->type != BRW_REGISTER_TYPE_UD ||
3682 !reg->negate)
3683 return;
3684
3685 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3686 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3687 *reg = temp;
3688 }
3689
3690 /**
3691 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3692 *
3693 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3694 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3695 */
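/* For example, if the CMP left 0x7f3a0001 in the register, the AND with 1
 * produces 1 and the negated MOV turns that into -1 (i.e. ~0); a result
 * whose LSB is 0 ends up as 0.
 */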
3696 void
3697 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3698 {
3699 assert(devinfo->gen <= 5);
3700
3701 if (!rvalue->type->is_boolean())
3702 return;
3703
3704 src_reg and_result = src_reg(this, rvalue->type);
3705 src_reg neg_result = src_reg(this, rvalue->type);
3706 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3707 emit(MOV(dst_reg(neg_result), negate(and_result)));
3708 *reg = neg_result;
3709 }
3710
3711 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3712 void *log_data,
3713 struct gl_program *prog,
3714 const struct brw_sampler_prog_key_data *key_tex,
3715 struct brw_vue_prog_data *prog_data,
3716 struct gl_shader_program *shader_prog,
3717 gl_shader_stage stage,
3718 void *mem_ctx,
3719 bool no_spills,
3720 int shader_time_index)
3721 : backend_shader(compiler, log_data, mem_ctx,
3722 shader_prog, prog, &prog_data->base, stage),
3723 key_tex(key_tex),
3724 prog_data(prog_data),
3725 sanity_param_count(0),
3726 fail_msg(NULL),
3727 first_non_payload_grf(0),
3728 need_all_constants_in_pull_buffer(false),
3729 no_spills(no_spills),
3730 shader_time_index(shader_time_index),
3731 last_scratch(0)
3732 {
3733 this->failed = false;
3734
3735 this->base_ir = NULL;
3736 this->current_annotation = NULL;
3737 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3738
3739 this->variable_ht = hash_table_ctor(0,
3740 hash_table_pointer_hash,
3741 hash_table_pointer_compare);
3742
3743 this->virtual_grf_start = NULL;
3744 this->virtual_grf_end = NULL;
3745 this->live_intervals = NULL;
3746
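/* Gen7+ has no real MRF registers; message payloads are built in the top
 * of the GRF file starting at GEN7_MRF_HACK_START instead, so cap register
 * allocation below that point.
 */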
3747 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3748
3749 this->uniforms = 0;
3750
3751 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3752 * at least one. See setup_uniforms() in brw_vec4.cpp.
3753 */
3754 this->uniform_array_size = 1;
3755 if (prog_data) {
3756 this->uniform_array_size =
3757 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3758 }
3759
3760 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3761 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3762 }
3763
3764 vec4_visitor::~vec4_visitor()
3765 {
3766 hash_table_dtor(this->variable_ht);
3767 }
3768
3769
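/**
 * Marks the compile as failed and records a diagnostic message.
 *
 * Only the first failure is kept; later calls return immediately so the
 * original message is preserved.
 */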
3770 void
3771 vec4_visitor::fail(const char *format, ...)
3772 {
3773 va_list va;
3774 char *msg;
3775
3776 if (failed)
3777 return;
3778
3779 failed = true;
3780
3781 va_start(va, format);
3782 msg = ralloc_vasprintf(mem_ctx, format, va);
3783 va_end(va);
3784 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3785
3786 this->fail_msg = msg;
3787
3788 if (debug_enabled) {
3789 fprintf(stderr, "%s", msg);
3790 }
3791 }
3792
3793 } /* namespace brw */