1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
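/* Factory-method generators: each macro below stamps out a
 * vec4_visitor::<OP>() helper that just allocates a vec4_instruction with
 * the matching BRW_OPCODE_* (ALU2_ACC additionally marks it as writing the
 * accumulator).  The instruction is not emitted; callers hand it to emit()
 * and may tweak its flags first.
 */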
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
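/* Emit a dot product of 2, 3 or 4 components by selecting DP2/DP3/DP4. */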
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
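/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below turns them into
 * the integer shift counts <0, 8, 16, 24>.
 */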
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
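/* packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to
 * the nearest even integer, and pack the low byte of each channel into a
 * single UD result.
 */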
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
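/* Size of a value of the given GLSL type, measured in vec4 slots.  Scalars
 * and vectors each take a full slot, matrices take one slot per column,
 * arrays and structs are the sum of their members, and opaque types
 * (samplers, atomic counters) take none.
 */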
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 case GLSL_TYPE_FUNCTION:
619 unreachable("not reached");
620 }
621
622 return 0;
623 }
624
625 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
626 {
627 init();
628
629 this->file = GRF;
630 this->reg = v->alloc.allocate(type_size(type));
631
632 if (type->is_array() || type->is_record()) {
633 this->swizzle = BRW_SWIZZLE_NOOP;
634 } else {
635 this->swizzle = brw_swizzle_for_size(type->vector_elements);
636 }
637
638 this->type = brw_type_for_base_type(type);
639 }
640
641 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
642 {
643 assert(size > 0);
644
645 init();
646
647 this->file = GRF;
648 this->reg = v->alloc.allocate(type_size(type) * size);
649
650 this->swizzle = BRW_SWIZZLE_NOOP;
651
652 this->type = brw_type_for_base_type(type);
653 }
654
655 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
656 {
657 init();
658
659 this->file = GRF;
660 this->reg = v->alloc.allocate(type_size(type));
661
662 if (type->is_array() || type->is_record()) {
663 this->writemask = WRITEMASK_XYZW;
664 } else {
665 this->writemask = (1 << type->vector_elements) - 1;
666 }
667
668 this->type = brw_type_for_base_type(type);
669 }
670
671 /* Our support for uniforms is piggy-backed on the struct
672 * gl_fragment_program, because that's where the values actually
673 * get stored, rather than in some global gl_shader_program uniform
674 * store.
675 */
676 void
677 vec4_visitor::setup_uniform_values(ir_variable *ir)
678 {
679 int namelen = strlen(ir->name);
680
681 /* The data for our (non-builtin) uniforms is stored in a series of
682 * gl_uniform_driver_storage structs for each subcomponent that
683 * glGetUniformLocation() could name. We know it's been set up in the same
684 * order we'd walk the type, so walk the list of storage and find anything
685 * with our name, or the prefix of a component that starts with our name.
686 */
687 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
688 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
689
690 if (storage->builtin)
691 continue;
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
722
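/* Set up one vec4 uniform per user clip plane, pointing the param entries
 * at the clip plane coefficients stored in the GL context.
 */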
723 void
724 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
725 {
726 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
727 assert(this->uniforms < uniform_array_size);
728 this->uniform_vector_size[this->uniforms] = 4;
729 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
730 this->userplane[i].type = BRW_REGISTER_TYPE_F;
731 for (int j = 0; j < 4; ++j) {
732 stage_prog_data->param[this->uniforms * 4 + j] =
733 (gl_constant_value *) &clip_planes[i][j];
734 }
735 ++this->uniforms;
736 }
737 }
738
739 /* Our support for builtin uniforms is even scarier than non-builtin.
740 * It sits on top of the PROG_STATE_VAR parameters that are
741 * automatically updated from GL context state.
742 */
743 void
744 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
745 {
746 const ir_state_slot *const slots = ir->get_state_slots();
747 assert(slots != NULL);
748
749 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
750 /* This state reference has already been setup by ir_to_mesa,
751 * but we'll get the same index back here. We can reference
752 * ParameterValues directly, since unlike brw_fs.cpp, we never
753 * add new state references during compile.
754 */
755 int index = _mesa_add_state_reference(this->prog->Parameters,
756 (gl_state_index *)slots[i].tokens);
757 gl_constant_value *values =
758 &this->prog->Parameters->ParameterValues[index][0];
759
760 assert(this->uniforms < uniform_array_size);
761
762 for (unsigned j = 0; j < 4; j++)
763 stage_prog_data->param[this->uniforms * 4 + j] =
764 &values[GET_SWZ(slots[i].swizzle, j)];
765
766 this->uniform_vector_size[this->uniforms] =
767 (ir->type->is_scalar() || ir->type->is_vector() ||
768 ir->type->is_matrix() ? ir->type->vector_elements : 4);
769
770 this->uniforms++;
771 }
772 }
773
774 dst_reg *
775 vec4_visitor::variable_storage(ir_variable *var)
776 {
777 return (dst_reg *)hash_table_find(this->variable_ht, var);
778 }
779
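/* Evaluate a boolean rvalue and load its result into the flag register,
 * returning in *predicate the predicate (NORMAL, or an ALIGN16 ANY4H/ALL4H
 * variant) that a following predicated instruction should use.
 */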
780 void
781 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
782 enum brw_predicate *predicate)
783 {
784 ir_expression *expr = ir->as_expression();
785
786 *predicate = BRW_PREDICATE_NORMAL;
787
788 if (expr && expr->operation != ir_binop_ubo_load) {
789 src_reg op[3];
790 vec4_instruction *inst;
791
792 assert(expr->get_num_operands() <= 3);
793 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
794 expr->operands[i]->accept(this);
795 op[i] = this->result;
796
797 resolve_ud_negate(&op[i]);
798 }
799
800 switch (expr->operation) {
801 case ir_unop_logic_not:
802 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_Z;
804 break;
805
806 case ir_binop_logic_xor:
807 if (devinfo->gen <= 5) {
808 src_reg temp = src_reg(this, ir->type);
809 emit(XOR(dst_reg(temp), op[0], op[1]));
810 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
811 } else {
812 inst = emit(XOR(dst_null_d(), op[0], op[1]));
813 }
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 break;
816
817 case ir_binop_logic_or:
818 if (devinfo->gen <= 5) {
819 src_reg temp = src_reg(this, ir->type);
820 emit(OR(dst_reg(temp), op[0], op[1]));
821 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
822 } else {
823 inst = emit(OR(dst_null_d(), op[0], op[1]));
824 }
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_binop_logic_and:
829 if (devinfo->gen <= 5) {
830 src_reg temp = src_reg(this, ir->type);
831 emit(AND(dst_reg(temp), op[0], op[1]));
832 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
833 } else {
834 inst = emit(AND(dst_null_d(), op[0], op[1]));
835 }
836 inst->conditional_mod = BRW_CONDITIONAL_NZ;
837 break;
838
839 case ir_unop_f2b:
840 if (devinfo->gen >= 6) {
841 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
842 } else {
843 inst = emit(MOV(dst_null_f(), op[0]));
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 }
846 break;
847
848 case ir_unop_i2b:
849 if (devinfo->gen >= 6) {
850 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
851 } else {
852 inst = emit(MOV(dst_null_d(), op[0]));
853 inst->conditional_mod = BRW_CONDITIONAL_NZ;
854 }
855 break;
856
857 case ir_binop_all_equal:
858 if (devinfo->gen <= 5) {
859 resolve_bool_comparison(expr->operands[0], &op[0]);
860 resolve_bool_comparison(expr->operands[1], &op[1]);
861 }
862 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
863 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
864 break;
865
866 case ir_binop_any_nequal:
867 if (devinfo->gen <= 5) {
868 resolve_bool_comparison(expr->operands[0], &op[0]);
869 resolve_bool_comparison(expr->operands[1], &op[1]);
870 }
871 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
872 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
873 break;
874
875 case ir_unop_any:
876 if (devinfo->gen <= 5) {
877 resolve_bool_comparison(expr->operands[0], &op[0]);
878 }
879 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
881 break;
882
883 case ir_binop_greater:
884 case ir_binop_gequal:
885 case ir_binop_less:
886 case ir_binop_lequal:
887 case ir_binop_equal:
888 case ir_binop_nequal:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 resolve_bool_comparison(expr->operands[1], &op[1]);
892 }
893 emit(CMP(dst_null_d(), op[0], op[1],
894 brw_conditional_for_comparison(expr->operation)));
895 break;
896
897 case ir_triop_csel: {
898 /* Expand the boolean condition into the flag register. */
899 inst = emit(MOV(dst_null_d(), op[0]));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901
902 /* Select which boolean to return. */
903 dst_reg temp(this, expr->operands[1]->type);
904 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
905 inst->predicate = BRW_PREDICATE_NORMAL;
906
907 /* Expand the result to a condition code. */
908 inst = emit(MOV(dst_null_d(), src_reg(temp)));
909 inst->conditional_mod = BRW_CONDITIONAL_NZ;
910 break;
911 }
912
913 default:
914 unreachable("not reached");
915 }
916 return;
917 }
918
919 ir->accept(this);
920
921 resolve_ud_negate(&this->result);
922
923 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 }
926
927 /**
928 * Emit a gen6 IF statement with the comparison folded into the IF
929 * instruction.
930 */
931 void
932 vec4_visitor::emit_if_gen6(ir_if *ir)
933 {
934 ir_expression *expr = ir->condition->as_expression();
935
936 if (expr && expr->operation != ir_binop_ubo_load) {
937 src_reg op[3];
938 dst_reg temp;
939
940 assert(expr->get_num_operands() <= 3);
941 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
942 expr->operands[i]->accept(this);
943 op[i] = this->result;
944 }
945
946 switch (expr->operation) {
947 case ir_unop_logic_not:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
949 return;
950
951 case ir_binop_logic_xor:
952 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_logic_or:
956 temp = dst_reg(this, glsl_type::bool_type);
957 emit(OR(temp, op[0], op[1]));
958 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
959 return;
960
961 case ir_binop_logic_and:
962 temp = dst_reg(this, glsl_type::bool_type);
963 emit(AND(temp, op[0], op[1]));
964 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
965 return;
966
967 case ir_unop_f2b:
968 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
969 return;
970
971 case ir_unop_i2b:
972 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
973 return;
974
975 case ir_binop_greater:
976 case ir_binop_gequal:
977 case ir_binop_less:
978 case ir_binop_lequal:
979 case ir_binop_equal:
980 case ir_binop_nequal:
981 emit(IF(op[0], op[1],
982 brw_conditional_for_comparison(expr->operation)));
983 return;
984
985 case ir_binop_all_equal:
986 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
987 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
988 return;
989
990 case ir_binop_any_nequal:
991 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
992 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
993 return;
994
995 case ir_unop_any:
996 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
997 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
998 return;
999
1000 case ir_triop_csel: {
1001 /* Expand the boolean condition into the flag register. */
1002 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1003 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1004
1005 /* Select which boolean to return. */
1006 dst_reg temp(this, expr->operands[1]->type);
1007 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1008 inst->predicate = BRW_PREDICATE_NORMAL;
1009
1010 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1011 return;
1012 }
1013
1014 default:
1015 unreachable("not reached");
1016 }
1017 return;
1018 }
1019
1020 ir->condition->accept(this);
1021
1022 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1023 }
1024
1025 void
1026 vec4_visitor::visit(ir_variable *ir)
1027 {
1028 dst_reg *reg = NULL;
1029
1030 if (variable_storage(ir))
1031 return;
1032
1033 switch (ir->data.mode) {
1034 case ir_var_shader_in:
1035 assert(ir->data.location != -1);
1036 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1037 break;
1038
1039 case ir_var_shader_out:
1040 assert(ir->data.location != -1);
1041 reg = new(mem_ctx) dst_reg(this, ir->type);
1042
1043 for (int i = 0; i < type_size(ir->type); i++) {
1044 output_reg[ir->data.location + i] = *reg;
1045 output_reg[ir->data.location + i].reg_offset = i;
1046 output_reg[ir->data.location + i].type =
1047 brw_type_for_base_type(ir->type->get_scalar_type());
1048 output_reg_annotation[ir->data.location + i] = ir->name;
1049 }
1050 break;
1051
1052 case ir_var_auto:
1053 case ir_var_temporary:
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055 break;
1056
1057 case ir_var_uniform:
1058 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1059
1060 /* Thanks to the lower_ubo_reference pass, we will see only
1061 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1062 * variables, so no need for them to be in variable_ht.
1063 *
1064 * Some uniforms, such as samplers and atomic counters, have no actual
1065 * storage, so we should ignore them.
1066 */
1067 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1068 return;
1069
1070 /* Track how big the whole uniform variable is, in case we need to put a
1071 * copy of its data into pull constants for array access.
1072 */
1073 assert(this->uniforms < uniform_array_size);
1074 this->uniform_size[this->uniforms] = type_size(ir->type);
1075
1076 if (!strncmp(ir->name, "gl_", 3)) {
1077 setup_builtin_uniform_values(ir);
1078 } else {
1079 setup_uniform_values(ir);
1080 }
1081 break;
1082
1083 case ir_var_system_value:
1084 reg = make_reg_for_system_value(ir);
1085 break;
1086
1087 default:
1088 unreachable("not reached");
1089 }
1090
1091 reg->type = brw_type_for_base_type(ir->type);
1092 hash_table_insert(this->variable_ht, reg, ir);
1093 }
1094
1095 void
1096 vec4_visitor::visit(ir_loop *ir)
1097 {
1098 /* We don't want debugging output to print the whole body of the
1099 * loop as the annotation.
1100 */
1101 this->base_ir = NULL;
1102
1103 emit(BRW_OPCODE_DO);
1104
1105 visit_instructions(&ir->body_instructions);
1106
1107 emit(BRW_OPCODE_WHILE);
1108 }
1109
1110 void
1111 vec4_visitor::visit(ir_loop_jump *ir)
1112 {
1113 switch (ir->mode) {
1114 case ir_loop_jump::jump_break:
1115 emit(BRW_OPCODE_BREAK);
1116 break;
1117 case ir_loop_jump::jump_continue:
1118 emit(BRW_OPCODE_CONTINUE);
1119 break;
1120 }
1121 }
1122
1123
1124 void
1125 vec4_visitor::visit(ir_function_signature *)
1126 {
1127 unreachable("not reached");
1128 }
1129
1130 void
1131 vec4_visitor::visit(ir_function *ir)
1132 {
1133 /* Ignore function bodies other than main() -- we shouldn't see calls to
1134 * them since they should all be inlined.
1135 */
1136 if (strcmp(ir->name, "main") == 0) {
1137 const ir_function_signature *sig;
1138 exec_list empty;
1139
1140 sig = ir->matching_signature(NULL, &empty, false);
1141
1142 assert(sig);
1143
1144 visit_instructions(&sig->body);
1145 }
1146 }
1147
1148 bool
1149 vec4_visitor::try_emit_mad(ir_expression *ir)
1150 {
1151 /* 3-src instructions were introduced in gen6. */
1152 if (devinfo->gen < 6)
1153 return false;
1154
1155 /* MAD can only handle floating-point data. */
1156 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1157 return false;
1158
1159 ir_rvalue *nonmul;
1160 ir_expression *mul;
1161 bool mul_negate, mul_abs;
1162
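/* Look for a multiply (possibly wrapped in a negate or abs) in either
 * operand of the add; the other operand becomes the MAD's addend.
 */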
1163 for (int i = 0; i < 2; i++) {
1164 mul_negate = false;
1165 mul_abs = false;
1166
1167 mul = ir->operands[i]->as_expression();
1168 nonmul = ir->operands[1 - i];
1169
1170 if (mul && mul->operation == ir_unop_abs) {
1171 mul = mul->operands[0]->as_expression();
1172 mul_abs = true;
1173 } else if (mul && mul->operation == ir_unop_neg) {
1174 mul = mul->operands[0]->as_expression();
1175 mul_negate = true;
1176 }
1177
1178 if (mul && mul->operation == ir_binop_mul)
1179 break;
1180 }
1181
1182 if (!mul || mul->operation != ir_binop_mul)
1183 return false;
1184
1185 nonmul->accept(this);
1186 src_reg src0 = fix_3src_operand(this->result);
1187
1188 mul->operands[0]->accept(this);
1189 src_reg src1 = fix_3src_operand(this->result);
1190 src1.negate ^= mul_negate;
1191 src1.abs = mul_abs;
1192 if (mul_abs)
1193 src1.negate = false;
1194
1195 mul->operands[1]->accept(this);
1196 src_reg src2 = fix_3src_operand(this->result);
1197 src2.abs = mul_abs;
1198 if (mul_abs)
1199 src2.negate = false;
1200
1201 this->result = src_reg(this, ir->type);
1202 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1203
1204 return true;
1205 }
1206
1207 bool
1208 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1209 {
1210 /* This optimization relies on CMP setting the destination to 0 when
1211 * false. Early hardware only sets the least significant bit, and
1212 * leaves the other bits undefined. So we can't use it.
1213 */
1214 if (devinfo->gen < 6)
1215 return false;
1216
1217 ir_expression *const cmp = ir->operands[0]->as_expression();
1218
1219 if (cmp == NULL)
1220 return false;
1221
1222 switch (cmp->operation) {
1223 case ir_binop_less:
1224 case ir_binop_greater:
1225 case ir_binop_lequal:
1226 case ir_binop_gequal:
1227 case ir_binop_equal:
1228 case ir_binop_nequal:
1229 break;
1230
1231 default:
1232 return false;
1233 }
1234
1235 cmp->operands[0]->accept(this);
1236 const src_reg cmp_src0 = this->result;
1237
1238 cmp->operands[1]->accept(this);
1239 const src_reg cmp_src1 = this->result;
1240
1241 this->result = src_reg(this, ir->type);
1242
1243 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1244 brw_conditional_for_comparison(cmp->operation)));
1245
1246 /* If the comparison is false, this->result will just happen to be zero.
1247 */
1248 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1249 this->result, src_reg(1.0f));
1250 inst->predicate = BRW_PREDICATE_NORMAL;
1251 inst->predicate_inverse = true;
1252
1253 return true;
1254 }
1255
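/* Emit a MIN or MAX: BRW_CONDITIONAL_L selects the minimum and
 * BRW_CONDITIONAL_GE the maximum.  On gen6+ a single SEL with a conditional
 * modifier suffices; earlier parts use an explicit CMP followed by a
 * predicated SEL.
 */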
1256 void
1257 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1258 src_reg src0, src_reg src1)
1259 {
1260 vec4_instruction *inst;
1261
1262 if (devinfo->gen >= 6) {
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->conditional_mod = conditionalmod;
1265 } else {
1266 emit(CMP(dst, src0, src1, conditionalmod));
1267
1268 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270 }
1271 }
1272
1273 void
1274 vec4_visitor::emit_lrp(const dst_reg &dst,
1275 const src_reg &x, const src_reg &y, const src_reg &a)
1276 {
1277 if (devinfo->gen >= 6) {
1278 /* Note that the instruction's argument order is reversed from GLSL
1279 * and the IR.
1280 */
1281 emit(LRP(dst,
1282 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1283 } else {
1284 /* Earlier generations don't support three source operations, so we
1285 * need to emit x*(1-a) + y*a.
1286 */
1287 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1290 y_times_a.writemask = dst.writemask;
1291 one_minus_a.writemask = dst.writemask;
1292 x_times_one_minus_a.writemask = dst.writemask;
1293
1294 emit(MUL(y_times_a, y, a));
1295 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1296 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1297 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1298 }
1299 }
1300
1301 /**
1302 * Emits the instructions needed to perform a pull constant load. before_block
1303 * and before_inst can be NULL, in which case the instructions will be
1304 * appended to the end of the instruction list.
1305 */
1306 void
1307 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1308 src_reg surf_index,
1309 src_reg offset_reg,
1310 bblock_t *before_block,
1311 vec4_instruction *before_inst)
1312 {
1313 assert((before_inst == NULL && before_block == NULL) ||
1314 (before_inst && before_block));
1315
1316 vec4_instruction *pull;
1317
1318 if (devinfo->gen >= 9) {
1319 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1320 src_reg header(this, glsl_type::uvec4_type, 2);
1321
1322 pull = new(mem_ctx)
1323 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1324 dst_reg(header));
1325
1326 if (before_inst)
1327 emit_before(before_block, before_inst, pull);
1328 else
1329 emit(pull);
1330
1331 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1332 offset_reg.type);
1333 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1334
1335 if (before_inst)
1336 emit_before(before_block, before_inst, pull);
1337 else
1338 emit(pull);
1339
1340 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1341 dst,
1342 surf_index,
1343 header);
1344 pull->mlen = 2;
1345 pull->header_size = 1;
1346 } else if (devinfo->gen >= 7) {
1347 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1348
1349 grf_offset.type = offset_reg.type;
1350
1351 pull = MOV(grf_offset, offset_reg);
1352
1353 if (before_inst)
1354 emit_before(before_block, before_inst, pull);
1355 else
1356 emit(pull);
1357
1358 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1359 dst,
1360 surf_index,
1361 src_reg(grf_offset));
1362 pull->mlen = 1;
1363 } else {
1364 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1365 dst,
1366 surf_index,
1367 offset_reg);
1368 pull->base_mrf = 14;
1369 pull->mlen = 1;
1370 }
1371
1372 if (before_inst)
1373 emit_before(before_block, before_inst, pull);
1374 else
1375 emit(pull);
1376 }
1377
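/* Copy the value of an arbitrary live channel of src into every channel of
 * dst (FIND_LIVE_CHANNEL + BROADCAST), e.g. to turn the per-channel UBO
 * block index computed below into a single surface index.
 */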
1378 void
1379 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1380 {
1381 const src_reg chan_index(this, glsl_type::uint_type);
1382
1383 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1384 ->force_writemask_all = true;
1385 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1386 ->force_writemask_all = true;
1387 }
1388
1389 void
1390 vec4_visitor::visit(ir_expression *ir)
1391 {
1392 unsigned int operand;
1393 src_reg op[ARRAY_SIZE(ir->operands)];
1394 vec4_instruction *inst;
1395
1396 if (ir->operation == ir_binop_add) {
1397 if (try_emit_mad(ir))
1398 return;
1399 }
1400
1401 if (ir->operation == ir_unop_b2f) {
1402 if (try_emit_b2f_of_compare(ir))
1403 return;
1404 }
1405
1406 /* Storage for our result. Ideally for an assignment we'd be using
1407 * the actual storage for the result here, instead.
1408 */
1409 dst_reg result_dst(this, ir->type);
1410 src_reg result_src(result_dst);
1411
1412 if (ir->operation == ir_triop_csel) {
1413 ir->operands[1]->accept(this);
1414 op[1] = this->result;
1415 ir->operands[2]->accept(this);
1416 op[2] = this->result;
1417
1418 enum brw_predicate predicate;
1419 emit_bool_to_cond_code(ir->operands[0], &predicate);
1420 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1421 inst->predicate = predicate;
1422 this->result = result_src;
1423 return;
1424 }
1425
1426 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1427 this->result.file = BAD_FILE;
1428 ir->operands[operand]->accept(this);
1429 if (this->result.file == BAD_FILE) {
1430 fprintf(stderr, "Failed to get tree for expression operand:\n");
1431 ir->operands[operand]->fprint(stderr);
1432 exit(1);
1433 }
1434 op[operand] = this->result;
1435
1436 /* Matrix expression operands should have been broken down to vector
1437 * operations already.
1438 */
1439 assert(!ir->operands[operand]->type->is_matrix());
1440 }
1441
1442 /* If nothing special happens, this is the result. */
1443 this->result = result_src;
1444
1445 switch (ir->operation) {
1446 case ir_unop_logic_not:
1447 emit(NOT(result_dst, op[0]));
1448 break;
1449 case ir_unop_neg:
1450 op[0].negate = !op[0].negate;
1451 emit(MOV(result_dst, op[0]));
1452 break;
1453 case ir_unop_abs:
1454 op[0].abs = true;
1455 op[0].negate = false;
1456 emit(MOV(result_dst, op[0]));
1457 break;
1458
1459 case ir_unop_sign:
1460 if (ir->type->is_float()) {
1461 /* AND(val, 0x80000000) gives the sign bit.
1462 *
1463 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1464 * zero.
1465 */
1466 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1467
1468 op[0].type = BRW_REGISTER_TYPE_UD;
1469 result_dst.type = BRW_REGISTER_TYPE_UD;
1470 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1471
1472 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1473 inst->predicate = BRW_PREDICATE_NORMAL;
1474
1475 this->result.type = BRW_REGISTER_TYPE_F;
1476 } else {
1477 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1478 * -> non-negative val generates 0x00000000.
1479 * Predicated OR sets 1 if val is positive.
1480 */
1481 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1482
1483 emit(ASR(result_dst, op[0], src_reg(31)));
1484
1485 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1486 inst->predicate = BRW_PREDICATE_NORMAL;
1487 }
1488 break;
1489
1490 case ir_unop_rcp:
1491 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1492 break;
1493
1494 case ir_unop_exp2:
1495 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1496 break;
1497 case ir_unop_log2:
1498 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1499 break;
1500 case ir_unop_exp:
1501 case ir_unop_log:
1502 unreachable("not reached: should be handled by ir_explog_to_explog2");
1503 case ir_unop_sin:
1504 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1505 break;
1506 case ir_unop_cos:
1507 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1508 break;
1509
1510 case ir_unop_dFdx:
1511 case ir_unop_dFdx_coarse:
1512 case ir_unop_dFdx_fine:
1513 case ir_unop_dFdy:
1514 case ir_unop_dFdy_coarse:
1515 case ir_unop_dFdy_fine:
1516 unreachable("derivatives not valid in vertex shader");
1517
1518 case ir_unop_bitfield_reverse:
1519 emit(BFREV(result_dst, op[0]));
1520 break;
1521 case ir_unop_bit_count:
1522 emit(CBIT(result_dst, op[0]));
1523 break;
1524 case ir_unop_find_msb: {
1525 src_reg temp = src_reg(this, glsl_type::uint_type);
1526
1527 inst = emit(FBH(dst_reg(temp), op[0]));
1528 inst->dst.writemask = WRITEMASK_XYZW;
1529
1530 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1531 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1532 * subtract the result from 31 to convert the MSB count into an LSB count.
1533 */
1534
1535 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1536 temp.swizzle = BRW_SWIZZLE_NOOP;
1537 emit(MOV(result_dst, temp));
1538
1539 src_reg src_tmp = src_reg(result_dst);
1540 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1541
1542 src_tmp.negate = true;
1543 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1544 inst->predicate = BRW_PREDICATE_NORMAL;
1545 break;
1546 }
1547 case ir_unop_find_lsb:
1548 emit(FBL(result_dst, op[0]));
1549 break;
1550 case ir_unop_saturate:
1551 inst = emit(MOV(result_dst, op[0]));
1552 inst->saturate = true;
1553 break;
1554
1555 case ir_unop_noise:
1556 unreachable("not reached: should be handled by lower_noise");
1557
1558 case ir_binop_add:
1559 emit(ADD(result_dst, op[0], op[1]));
1560 break;
1561 case ir_binop_sub:
1562 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1563
1564 case ir_binop_mul:
1565 if (devinfo->gen < 8 && ir->type->is_integer()) {
1566 /* For integer multiplication, the MUL uses the low 16 bits of one of
1567 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1568 * accumulates in the contribution of the upper 16 bits of that
1569 * operand. If we can determine that one of the args is in the low
1570 * 16 bits, though, we can just emit a single MUL.
1571 */
1572 if (ir->operands[0]->is_uint16_constant()) {
1573 if (devinfo->gen < 7)
1574 emit(MUL(result_dst, op[0], op[1]));
1575 else
1576 emit(MUL(result_dst, op[1], op[0]));
1577 } else if (ir->operands[1]->is_uint16_constant()) {
1578 if (devinfo->gen < 7)
1579 emit(MUL(result_dst, op[1], op[0]));
1580 else
1581 emit(MUL(result_dst, op[0], op[1]));
1582 } else {
1583 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1584
1585 emit(MUL(acc, op[0], op[1]));
1586 emit(MACH(dst_null_d(), op[0], op[1]));
1587 emit(MOV(result_dst, src_reg(acc)));
1588 }
1589 } else {
1590 emit(MUL(result_dst, op[0], op[1]));
1591 }
1592 break;
1593 case ir_binop_imul_high: {
1594 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1595
1596 emit(MUL(acc, op[0], op[1]));
1597 emit(MACH(result_dst, op[0], op[1]));
1598 break;
1599 }
1600 case ir_binop_div:
1601 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1602 assert(ir->type->is_integer());
1603 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1604 break;
1605 case ir_binop_carry: {
1606 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1607
1608 emit(ADDC(dst_null_ud(), op[0], op[1]));
1609 emit(MOV(result_dst, src_reg(acc)));
1610 break;
1611 }
1612 case ir_binop_borrow: {
1613 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1614
1615 emit(SUBB(dst_null_ud(), op[0], op[1]));
1616 emit(MOV(result_dst, src_reg(acc)));
1617 break;
1618 }
1619 case ir_binop_mod:
1620 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1621 assert(ir->type->is_integer());
1622 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1623 break;
1624
1625 case ir_binop_less:
1626 case ir_binop_greater:
1627 case ir_binop_lequal:
1628 case ir_binop_gequal:
1629 case ir_binop_equal:
1630 case ir_binop_nequal: {
1631 if (devinfo->gen <= 5) {
1632 resolve_bool_comparison(ir->operands[0], &op[0]);
1633 resolve_bool_comparison(ir->operands[1], &op[1]);
1634 }
1635 emit(CMP(result_dst, op[0], op[1],
1636 brw_conditional_for_comparison(ir->operation)));
1637 break;
1638 }
1639
1640 case ir_binop_all_equal:
1641 if (devinfo->gen <= 5) {
1642 resolve_bool_comparison(ir->operands[0], &op[0]);
1643 resolve_bool_comparison(ir->operands[1], &op[1]);
1644 }
1645
1646 /* "==" operator producing a scalar boolean. */
1647 if (ir->operands[0]->type->is_vector() ||
1648 ir->operands[1]->type->is_vector()) {
1649 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1650 emit(MOV(result_dst, src_reg(0)));
1651 inst = emit(MOV(result_dst, src_reg(~0)));
1652 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1653 } else {
1654 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1655 }
1656 break;
1657 case ir_binop_any_nequal:
1658 if (devinfo->gen <= 5) {
1659 resolve_bool_comparison(ir->operands[0], &op[0]);
1660 resolve_bool_comparison(ir->operands[1], &op[1]);
1661 }
1662
1663 /* "!=" operator producing a scalar boolean. */
1664 if (ir->operands[0]->type->is_vector() ||
1665 ir->operands[1]->type->is_vector()) {
1666 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1667
1668 emit(MOV(result_dst, src_reg(0)));
1669 inst = emit(MOV(result_dst, src_reg(~0)));
1670 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1671 } else {
1672 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1673 }
1674 break;
1675
1676 case ir_unop_any:
1677 if (devinfo->gen <= 5) {
1678 resolve_bool_comparison(ir->operands[0], &op[0]);
1679 }
1680 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1681 emit(MOV(result_dst, src_reg(0)));
1682
1683 inst = emit(MOV(result_dst, src_reg(~0)));
1684 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1685 break;
1686
1687 case ir_binop_logic_xor:
1688 emit(XOR(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_logic_or:
1692 emit(OR(result_dst, op[0], op[1]));
1693 break;
1694
1695 case ir_binop_logic_and:
1696 emit(AND(result_dst, op[0], op[1]));
1697 break;
1698
1699 case ir_binop_dot:
1700 assert(ir->operands[0]->type->is_vector());
1701 assert(ir->operands[0]->type == ir->operands[1]->type);
1702 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1703 break;
1704
1705 case ir_unop_sqrt:
1706 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1707 break;
1708 case ir_unop_rsq:
1709 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1710 break;
1711
1712 case ir_unop_bitcast_i2f:
1713 case ir_unop_bitcast_u2f:
1714 this->result = op[0];
1715 this->result.type = BRW_REGISTER_TYPE_F;
1716 break;
1717
1718 case ir_unop_bitcast_f2i:
1719 this->result = op[0];
1720 this->result.type = BRW_REGISTER_TYPE_D;
1721 break;
1722
1723 case ir_unop_bitcast_f2u:
1724 this->result = op[0];
1725 this->result.type = BRW_REGISTER_TYPE_UD;
1726 break;
1727
1728 case ir_unop_i2f:
1729 case ir_unop_i2u:
1730 case ir_unop_u2i:
1731 case ir_unop_u2f:
1732 case ir_unop_f2i:
1733 case ir_unop_f2u:
1734 emit(MOV(result_dst, op[0]));
1735 break;
1736 case ir_unop_b2i:
1737 emit(AND(result_dst, op[0], src_reg(1)));
1738 break;
1739 case ir_unop_b2f:
1740 if (devinfo->gen <= 5) {
1741 resolve_bool_comparison(ir->operands[0], &op[0]);
1742 }
1743 op[0].type = BRW_REGISTER_TYPE_D;
1744 result_dst.type = BRW_REGISTER_TYPE_D;
1745 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1746 result_dst.type = BRW_REGISTER_TYPE_F;
1747 break;
1748 case ir_unop_f2b:
1749 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1750 break;
1751 case ir_unop_i2b:
1752 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1753 break;
1754
1755 case ir_unop_trunc:
1756 emit(RNDZ(result_dst, op[0]));
1757 break;
1758 case ir_unop_ceil: {
1759 src_reg tmp = src_reg(this, ir->type);
1760 op[0].negate = !op[0].negate;
1761 emit(RNDD(dst_reg(tmp), op[0]));
1762 tmp.negate = true;
1763 emit(MOV(result_dst, tmp));
1764 }
1765 break;
1766 case ir_unop_floor:
1767 inst = emit(RNDD(result_dst, op[0]));
1768 break;
1769 case ir_unop_fract:
1770 inst = emit(FRC(result_dst, op[0]));
1771 break;
1772 case ir_unop_round_even:
1773 emit(RNDE(result_dst, op[0]));
1774 break;
1775
1776 case ir_binop_min:
1777 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1778 break;
1779 case ir_binop_max:
1780 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1781 break;
1782
1783 case ir_binop_pow:
1784 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1785 break;
1786
1787 case ir_unop_bit_not:
1788 inst = emit(NOT(result_dst, op[0]));
1789 break;
1790 case ir_binop_bit_and:
1791 inst = emit(AND(result_dst, op[0], op[1]));
1792 break;
1793 case ir_binop_bit_xor:
1794 inst = emit(XOR(result_dst, op[0], op[1]));
1795 break;
1796 case ir_binop_bit_or:
1797 inst = emit(OR(result_dst, op[0], op[1]));
1798 break;
1799
1800 case ir_binop_lshift:
1801 inst = emit(SHL(result_dst, op[0], op[1]));
1802 break;
1803
1804 case ir_binop_rshift:
1805 if (ir->type->base_type == GLSL_TYPE_INT)
1806 inst = emit(ASR(result_dst, op[0], op[1]));
1807 else
1808 inst = emit(SHR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_bfm:
1812 emit(BFI1(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_ubo_load: {
1816 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1817 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1818 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1819 src_reg offset;
1820
1821 /* Now, load the vector from that offset. */
1822 assert(ir->type->is_vector() || ir->type->is_scalar());
1823
1824 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1825 packed_consts.type = result.type;
1826 src_reg surf_index;
1827
1828 if (const_uniform_block) {
1829 /* The block index is a constant, so just emit the binding table entry
1830 * as an immediate.
1831 */
1832 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1833 const_uniform_block->value.u[0]);
1834 } else {
1835 /* The block index is not a constant. Evaluate the index expression
1836 * per-channel and add the base UBO index; we have to select a value
1837 * from any live channel.
1838 */
1839 surf_index = src_reg(this, glsl_type::uint_type);
1840 emit(ADD(dst_reg(surf_index), op[0],
1841 src_reg(prog_data->base.binding_table.ubo_start)));
1842 emit_uniformize(dst_reg(surf_index), surf_index);
1843
1844 /* Assume this may touch any UBO. It would be nice to provide
1845 * a tighter bound, but the array information is already lowered away.
1846 */
1847 brw_mark_surface_used(&prog_data->base,
1848 prog_data->base.binding_table.ubo_start +
1849 shader_prog->NumUniformBlocks - 1);
1850 }
1851
1852 if (const_offset_ir) {
1853 if (devinfo->gen >= 8) {
1854 /* Store the offset in a GRF so we can send-from-GRF. */
1855 offset = src_reg(this, glsl_type::int_type);
1856 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1857 } else {
1858 /* Immediates are fine on older generations since they'll be moved
1859 * to a (potentially fake) MRF at the generator level.
1860 */
1861 offset = src_reg(const_offset / 16);
1862 }
1863 } else {
1864 offset = src_reg(this, glsl_type::uint_type);
1865 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1866 }
1867
1868 emit_pull_constant_load_reg(dst_reg(packed_consts),
1869 surf_index,
1870 offset,
1871 NULL, NULL /* before_block/inst */);
1872
1873 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1874 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1875 const_offset % 16 / 4,
1876 const_offset % 16 / 4,
1877 const_offset % 16 / 4);
1878
1879 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1880 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1881 emit(CMP(result_dst, packed_consts, src_reg(0u),
1882 BRW_CONDITIONAL_NZ));
1883 } else {
1884 emit(MOV(result_dst, packed_consts));
1885 }
1886 break;
1887 }
1888
1889 case ir_binop_vector_extract:
1890 unreachable("should have been lowered by vec_index_to_cond_assign");
1891
1892 case ir_triop_fma:
1893 op[0] = fix_3src_operand(op[0]);
1894 op[1] = fix_3src_operand(op[1]);
1895 op[2] = fix_3src_operand(op[2]);
1896 /* Note that the instruction's argument order is reversed from GLSL
1897 * and the IR.
1898 */
1899 emit(MAD(result_dst, op[2], op[1], op[0]));
1900 break;
1901
1902 case ir_triop_lrp:
1903 emit_lrp(result_dst, op[0], op[1], op[2]);
1904 break;
1905
1906 case ir_triop_csel:
1907 unreachable("already handled above");
1908 break;
1909
1910 case ir_triop_bfi:
1911 op[0] = fix_3src_operand(op[0]);
1912 op[1] = fix_3src_operand(op[1]);
1913 op[2] = fix_3src_operand(op[2]);
1914 emit(BFI2(result_dst, op[0], op[1], op[2]));
1915 break;
1916
1917 case ir_triop_bitfield_extract:
1918 op[0] = fix_3src_operand(op[0]);
1919 op[1] = fix_3src_operand(op[1]);
1920 op[2] = fix_3src_operand(op[2]);
1921 /* Note that the instruction's argument order is reversed from GLSL
1922 * and the IR.
1923 */
1924 emit(BFE(result_dst, op[2], op[1], op[0]));
1925 break;
1926
1927 case ir_triop_vector_insert:
1928 unreachable("should have been lowered by lower_vector_insert");
1929
1930 case ir_quadop_bitfield_insert:
1931 unreachable("not reached: should be handled by "
1932 "bitfield_insert_to_bfm_bfi\n");
1933
1934 case ir_quadop_vector:
1935 unreachable("not reached: should be handled by lower_quadop_vector");
1936
1937 case ir_unop_pack_half_2x16:
1938 emit_pack_half_2x16(result_dst, op[0]);
1939 break;
1940 case ir_unop_unpack_half_2x16:
1941 emit_unpack_half_2x16(result_dst, op[0]);
1942 break;
1943 case ir_unop_unpack_unorm_4x8:
1944 emit_unpack_unorm_4x8(result_dst, op[0]);
1945 break;
1946 case ir_unop_unpack_snorm_4x8:
1947 emit_unpack_snorm_4x8(result_dst, op[0]);
1948 break;
1949 case ir_unop_pack_unorm_4x8:
1950 emit_pack_unorm_4x8(result_dst, op[0]);
1951 break;
1952 case ir_unop_pack_snorm_4x8:
1953 emit_pack_snorm_4x8(result_dst, op[0]);
1954 break;
1955 case ir_unop_pack_snorm_2x16:
1956 case ir_unop_pack_unorm_2x16:
1957 case ir_unop_unpack_snorm_2x16:
1958 case ir_unop_unpack_unorm_2x16:
1959 unreachable("not reached: should be handled by lower_packing_builtins");
1960 case ir_unop_unpack_half_2x16_split_x:
1961 case ir_unop_unpack_half_2x16_split_y:
1962 case ir_binop_pack_half_2x16_split:
1963 case ir_unop_interpolate_at_centroid:
1964 case ir_binop_interpolate_at_sample:
1965 case ir_binop_interpolate_at_offset:
1966 unreachable("not reached: should not occur in vertex shader");
1967 case ir_binop_ldexp:
1968 unreachable("not reached: should be handled by ldexp_to_arith()");
1969 case ir_unop_d2f:
1970 case ir_unop_f2d:
1971 case ir_unop_d2i:
1972 case ir_unop_i2d:
1973 case ir_unop_d2u:
1974 case ir_unop_u2d:
1975 case ir_unop_d2b:
1976 case ir_unop_pack_double_2x32:
1977 case ir_unop_unpack_double_2x32:
1978 case ir_unop_frexp_sig:
1979 case ir_unop_frexp_exp:
1980 unreachable("fp64 todo");
1981 }
1982 }
1983
1984
1985 void
1986 vec4_visitor::visit(ir_swizzle *ir)
1987 {
1988 /* Note that this is only swizzles in expressions, not those on the left
1989 * hand side of an assignment, which do write masking. See ir_assignment
1990 * for that.
1991 */
1992 const unsigned swz = brw_compose_swizzle(
1993 brw_swizzle_for_size(ir->type->vector_elements),
1994 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1995
1996 ir->val->accept(this);
1997 this->result = swizzle(this->result, swz);
1998 }
1999
2000 void
2001 vec4_visitor::visit(ir_dereference_variable *ir)
2002 {
2003 const struct glsl_type *type = ir->type;
2004 dst_reg *reg = variable_storage(ir->var);
2005
2006 if (!reg) {
2007 fail("Failed to find variable storage for %s\n", ir->var->name);
2008 this->result = src_reg(brw_null_reg());
2009 return;
2010 }
2011
2012 this->result = src_reg(*reg);
2013
2014 /* System values get their swizzle from the dst_reg writemask */
2015 if (ir->var->data.mode == ir_var_system_value)
2016 return;
2017
2018 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2019 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2020 }
2021
2022
2023 int
2024 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2025 {
2026 /* Under normal circumstances array elements are stored consecutively, so
2027 * the stride is equal to the size of the array element.
2028 */
2029 return type_size(ir->type);
2030 }
2031
2032
2033 void
2034 vec4_visitor::visit(ir_dereference_array *ir)
2035 {
2036 ir_constant *constant_index;
2037 src_reg src;
2038 int array_stride = compute_array_stride(ir);
2039
2040 constant_index = ir->array_index->constant_expression_value();
2041
2042 ir->array->accept(this);
2043 src = this->result;
2044
2045 if (constant_index) {
2046 src.reg_offset += constant_index->value.i[0] * array_stride;
2047 } else {
2048 /* Variable index array dereference. It eats the "vec4" of the
2049 * base of the array and an index that offsets the Mesa register
2050 * index.
2051 */
2052 ir->array_index->accept(this);
2053
2054 src_reg index_reg;
2055
2056 if (array_stride == 1) {
2057 index_reg = this->result;
2058 } else {
2059 index_reg = src_reg(this, glsl_type::int_type);
2060
2061 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2062 }
2063
2064 if (src.reladdr) {
2065 src_reg temp = src_reg(this, glsl_type::int_type);
2066
2067 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2068
2069 index_reg = temp;
2070 }
2071
2072 src.reladdr = ralloc(mem_ctx, src_reg);
2073 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2074 }
2075
2076 /* If the type is smaller than a vec4, replicate the last channel out. */
2077 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2078 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2079 else
2080 src.swizzle = BRW_SWIZZLE_NOOP;
2081 src.type = brw_type_for_base_type(ir->type);
2082
2083 this->result = src;
2084 }
2085
2086 void
2087 vec4_visitor::visit(ir_dereference_record *ir)
2088 {
2089 unsigned int i;
2090 const glsl_type *struct_type = ir->record->type;
2091 int offset = 0;
2092
2093 ir->record->accept(this);
2094
2095 for (i = 0; i < struct_type->length; i++) {
2096 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2097 break;
2098 offset += type_size(struct_type->fields.structure[i].type);
2099 }
2100
2101 /* If the type is smaller than a vec4, replicate the last channel out. */
2102 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2103 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2104 else
2105 this->result.swizzle = BRW_SWIZZLE_NOOP;
2106 this->result.type = brw_type_for_base_type(ir->type);
2107
2108 this->result.reg_offset += offset;
2109 }
2110
2111 /**
2112 * We want to be careful in assignment setup to hit the actual storage
2113 * instead of potentially using a temporary like we might with the
2114 * ir_dereference handler.
2115 */
2116 static dst_reg
2117 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2118 {
2119 /* The LHS must be a dereference. If the LHS is a variable indexed array
2120 * access of a vector, it must be separated into a series of conditional moves
2121 * before reaching this point (see ir_vec_index_to_cond_assign).
2122 */
2123 assert(ir->as_dereference());
2124 ir_dereference_array *deref_array = ir->as_dereference_array();
2125 if (deref_array) {
2126 assert(!deref_array->array->type->is_vector());
2127 }
2128
2129 /* Use the rvalue deref handler for the most part. We'll ignore
2130 * any swizzles in it, though, and express write swizzles using the writemask.
2131 */
2132 ir->accept(v);
2133 return dst_reg(v->result);
2134 }
2135
2136 void
2137 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2138 const struct glsl_type *type,
2139 enum brw_predicate predicate)
2140 {
2141 if (type->base_type == GLSL_TYPE_STRUCT) {
2142 for (unsigned int i = 0; i < type->length; i++) {
2143 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2144 }
2145 return;
2146 }
2147
2148 if (type->is_array()) {
2149 for (unsigned int i = 0; i < type->length; i++) {
2150 emit_block_move(dst, src, type->fields.array, predicate);
2151 }
2152 return;
2153 }
2154
2155 if (type->is_matrix()) {
2156 const struct glsl_type *vec_type;
2157
2158 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2159 type->vector_elements, 1);
2160
2161 for (int i = 0; i < type->matrix_columns; i++) {
2162 emit_block_move(dst, src, vec_type, predicate);
2163 }
2164 return;
2165 }
2166
2167 assert(type->is_scalar() || type->is_vector());
2168
2169 dst->type = brw_type_for_base_type(type);
2170 src->type = dst->type;
2171
2172 dst->writemask = (1 << type->vector_elements) - 1;
2173
2174 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2175
2176 vec4_instruction *inst = emit(MOV(*dst, *src));
2177 inst->predicate = predicate;
2178
2179 dst->reg_offset++;
2180 src->reg_offset++;
2181 }
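/* Illustrative note (not part of the upstream code): for a hypothetical mat2
 * source, emit_block_move() recurses into two vec2 column moves; each
 * iteration emits a MOV with writemask .xy and source swizzle .xyyy
 * (predicated if the assignment had a condition), then bumps reg_offset on
 * both dst and src so the next column lands in the next vec4.
 */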
2182
2183
2184 /* If the RHS processing resulted in an instruction generating a
2185 * temporary value, and it would be easy to rewrite the instruction to
2186 * generate its result right into the LHS instead, do so. This ends
2187 * up reliably removing instructions where it can be tricky to do so
2188 * later without real use-def (UD) chain information.
2189 */
2190 bool
2191 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2192 dst_reg dst,
2193 src_reg src,
2194 vec4_instruction *pre_rhs_inst,
2195 vec4_instruction *last_rhs_inst)
2196 {
2197 /* This could be supported, but it would take more smarts. */
2198 if (ir->condition)
2199 return false;
2200
2201 if (pre_rhs_inst == last_rhs_inst)
2202 return false; /* No instructions generated to work with. */
2203
2204 /* Make sure the last instruction generated our source reg. */
2205 if (src.file != GRF ||
2206 src.file != last_rhs_inst->dst.file ||
2207 src.reg != last_rhs_inst->dst.reg ||
2208 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2209 src.reladdr ||
2210 src.abs ||
2211 src.negate ||
2212 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2213 return false;
2214
2215 /* Check that the last instruction fully initialized the channels
2216 * we want to use, in the order we want to use them. We could
2217 * potentially reswizzle the operands of many instructions so that
2218 * we could handle out of order channels, but don't yet.
2219 */
2220
2221 for (unsigned i = 0; i < 4; i++) {
2222 if (dst.writemask & (1 << i)) {
2223 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2224 return false;
2225
2226 if (BRW_GET_SWZ(src.swizzle, i) != i)
2227 return false;
2228 }
2229 }
2230
2231 /* Success! Rewrite the instruction. */
2232 last_rhs_inst->dst.file = dst.file;
2233 last_rhs_inst->dst.reg = dst.reg;
2234 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2235 last_rhs_inst->dst.reladdr = dst.reladdr;
2236 last_rhs_inst->dst.writemask &= dst.writemask;
2237
2238 return true;
2239 }
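/* Illustrative example (not part of the upstream code): for an assignment
 * like "a = b + c", the RHS visitor emits "ADD tmp, b, c" and the assignment
 * would normally add "MOV a, tmp".  When the checks above pass, the ADD is
 * rewritten to write straight into a's storage and the copy is never emitted.
 */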
2240
2241 void
2242 vec4_visitor::visit(ir_assignment *ir)
2243 {
2244 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2245 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2246
2247 if (!ir->lhs->type->is_scalar() &&
2248 !ir->lhs->type->is_vector()) {
2249 ir->rhs->accept(this);
2250 src_reg src = this->result;
2251
2252 if (ir->condition) {
2253 emit_bool_to_cond_code(ir->condition, &predicate);
2254 }
2255
2256 /* emit_block_move doesn't account for swizzles in the source register.
2257 * This should be ok, since the source register is a structure or an
2258 * array, and those can't be swizzled. But double-check to be sure.
2259 */
2260 assert(src.swizzle ==
2261 (ir->rhs->type->is_matrix()
2262 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2263 : BRW_SWIZZLE_NOOP));
2264
2265 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2266 return;
2267 }
2268
2269 /* Now we're down to just a scalar/vector with writemasks. */
2270 int i;
2271
2272 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2273 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2274
2275 ir->rhs->accept(this);
2276
2277 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2278
2279 int swizzles[4];
2280 int src_chan = 0;
2281
2282 assert(ir->lhs->type->is_vector() ||
2283 ir->lhs->type->is_scalar());
2284 dst.writemask = ir->write_mask;
2285
2286 /* Swizzle a small RHS vector into the channels being written.
2287 *
2288 * GLSL IR treats write_mask as dictating how many channels are
2289 * present on the RHS, while in our instructions we need to make
2290 * those channels appear in the slots of the vec4 they're written to.
2291 */
2292 for (int i = 0; i < 4; i++)
2293 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2294
2295 src_reg src = swizzle(this->result,
2296 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2297 swizzles[2], swizzles[3]));
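/* Illustrative example (not part of the upstream code): for a hypothetical
 * write such as "v.yw = u.xy", ir->write_mask is 0xa (.yw), so the loop above
 * builds swizzles[] = { 0, 0, 0, 1 } and src ends up swizzled as .xxxy: RHS
 * channel x lands in dst.y and RHS channel y lands in dst.w.
 */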
2298
2299 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2300 return;
2301 }
2302
2303 if (ir->condition) {
2304 emit_bool_to_cond_code(ir->condition, &predicate);
2305 }
2306
2307 for (i = 0; i < type_size(ir->lhs->type); i++) {
2308 vec4_instruction *inst = emit(MOV(dst, src));
2309 inst->predicate = predicate;
2310
2311 dst.reg_offset++;
2312 src.reg_offset++;
2313 }
2314 }
2315
2316 void
2317 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2318 {
2319 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2320 foreach_in_list(ir_constant, field_value, &ir->components) {
2321 emit_constant_values(dst, field_value);
2322 }
2323 return;
2324 }
2325
2326 if (ir->type->is_array()) {
2327 for (unsigned int i = 0; i < ir->type->length; i++) {
2328 emit_constant_values(dst, ir->array_elements[i]);
2329 }
2330 return;
2331 }
2332
2333 if (ir->type->is_matrix()) {
2334 for (int i = 0; i < ir->type->matrix_columns; i++) {
2335 float *vec = &ir->value.f[i * ir->type->vector_elements];
2336
2337 for (int j = 0; j < ir->type->vector_elements; j++) {
2338 dst->writemask = 1 << j;
2339 dst->type = BRW_REGISTER_TYPE_F;
2340
2341 emit(MOV(*dst, src_reg(vec[j])));
2342 }
2343 dst->reg_offset++;
2344 }
2345 return;
2346 }
2347
2348 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2349
2350 for (int i = 0; i < ir->type->vector_elements; i++) {
2351 if (!(remaining_writemask & (1 << i)))
2352 continue;
2353
2354 dst->writemask = 1 << i;
2355 dst->type = brw_type_for_base_type(ir->type);
2356
2357 /* Find other components that match the one we're about to
2358 * write. Emits fewer instructions for things like vec4(0.5,
2359 * 1.5, 1.5, 1.5).
2360 */
2361 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2362 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2363 if (ir->value.b[i] == ir->value.b[j])
2364 dst->writemask |= (1 << j);
2365 } else {
2366 /* u, i, and f storage all line up, so no need for a
2367 * switch case for comparing each type.
2368 */
2369 if (ir->value.u[i] == ir->value.u[j])
2370 dst->writemask |= (1 << j);
2371 }
2372 }
2373
2374 switch (ir->type->base_type) {
2375 case GLSL_TYPE_FLOAT:
2376 emit(MOV(*dst, src_reg(ir->value.f[i])));
2377 break;
2378 case GLSL_TYPE_INT:
2379 emit(MOV(*dst, src_reg(ir->value.i[i])));
2380 break;
2381 case GLSL_TYPE_UINT:
2382 emit(MOV(*dst, src_reg(ir->value.u[i])));
2383 break;
2384 case GLSL_TYPE_BOOL:
2385 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2386 break;
2387 default:
2388 unreachable("Non-float/uint/int/bool constant");
2389 }
2390
2391 remaining_writemask &= ~dst->writemask;
2392 }
2393 dst->reg_offset++;
2394 }
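/* Illustrative note (not part of the upstream code): for the vec4(0.5, 1.5,
 * 1.5, 1.5) case mentioned above, the first pass writes 0.5 with writemask .x,
 * and the second pass notices that components y, z and w all match, so a
 * single MOV with writemask .yzw covers the rest.
 */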
2395
2396 void
2397 vec4_visitor::visit(ir_constant *ir)
2398 {
2399 dst_reg dst = dst_reg(this, ir->type);
2400 this->result = src_reg(dst);
2401
2402 emit_constant_values(&dst, ir);
2403 }
2404
2405 void
2406 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2407 {
2408 ir_dereference *deref = static_cast<ir_dereference *>(
2409 ir->actual_parameters.get_head());
2410 ir_variable *location = deref->variable_referenced();
2411 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2412 location->data.binding);
2413
2414 /* Calculate the surface offset */
2415 src_reg offset(this, glsl_type::uint_type);
2416 ir_dereference_array *deref_array = deref->as_dereference_array();
2417 if (deref_array) {
2418 deref_array->array_index->accept(this);
2419
2420 src_reg tmp(this, glsl_type::uint_type);
2421 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2422 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2423 } else {
2424 offset = location->data.atomic.offset;
2425 }
2426
2427 /* Emit the appropriate machine instruction */
2428 const char *callee = ir->callee->function_name();
2429 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2430
2431 if (!strcmp("__intrinsic_atomic_read", callee)) {
2432 emit_untyped_surface_read(surf_index, dst, offset);
2433
2434 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2435 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2436 src_reg(), src_reg());
2437
2438 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2439 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2440 src_reg(), src_reg());
2441 }
2442 }
2443
2444 void
2445 vec4_visitor::visit(ir_call *ir)
2446 {
2447 const char *callee = ir->callee->function_name();
2448
2449 if (!strcmp("__intrinsic_atomic_read", callee) ||
2450 !strcmp("__intrinsic_atomic_increment", callee) ||
2451 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2452 visit_atomic_counter_intrinsic(ir);
2453 } else {
2454 unreachable("Unsupported intrinsic.");
2455 }
2456 }
2457
2458 src_reg
2459 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2460 {
2461 vec4_instruction *inst =
2462 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2463 dst_reg(this, glsl_type::uvec4_type));
2464 inst->base_mrf = 2;
2465 inst->src[1] = sampler;
2466
2467 int param_base;
2468
2469 if (devinfo->gen >= 9) {
2470 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2471 vec4_instruction *header_inst = new(mem_ctx)
2472 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2473 dst_reg(MRF, inst->base_mrf));
2474
2475 emit(header_inst);
2476
2477 inst->mlen = 2;
2478 inst->header_size = 1;
2479 param_base = inst->base_mrf + 1;
2480 } else {
2481 inst->mlen = 1;
2482 param_base = inst->base_mrf;
2483 }
2484
2485 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2486 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2487 int zero_mask = 0xf & ~coord_mask;
2488
2489 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2490 coordinate));
2491
2492 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2493 src_reg(0)));
2494
2495 emit(inst);
2496 return src_reg(inst->dst);
2497 }
2498
2499 static bool
2500 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2501 {
2502 if (devinfo->gen < 8 && !devinfo->is_haswell)
2503 return false;
2504
2505 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2506 }
2507
2508 void
2509 vec4_visitor::visit(ir_texture *ir)
2510 {
2511 uint32_t sampler =
2512 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2513
2514 ir_rvalue *nonconst_sampler_index =
2515 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2516
2517 /* Handle non-constant sampler array indexing */
2518 src_reg sampler_reg;
2519 if (nonconst_sampler_index) {
2520 /* The highest sampler which may be used by this operation is
2521 * the last element of the array. Mark it here, because the generator
2522 * doesn't have enough information to determine the bound.
2523 */
2524 uint32_t array_size = ir->sampler->as_dereference_array()
2525 ->array->type->array_size();
2526
2527 uint32_t max_used = sampler + array_size - 1;
2528 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2529 max_used += prog_data->base.binding_table.gather_texture_start;
2530 } else {
2531 max_used += prog_data->base.binding_table.texture_start;
2532 }
2533
2534 brw_mark_surface_used(&prog_data->base, max_used);
2535
2536 /* Emit code to evaluate the actual indexing expression */
2537 nonconst_sampler_index->accept(this);
2538 dst_reg temp(this, glsl_type::uint_type);
2539 emit(ADD(temp, this->result, src_reg(sampler)));
2540 emit_uniformize(temp, src_reg(temp));
2541
2542 sampler_reg = src_reg(temp);
2543 } else {
2544 /* Single sampler, or constant array index; the indexing expression
2545 * is just an immediate.
2546 */
2547 sampler_reg = src_reg(sampler);
2548 }
2549
2550 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2551 * emitting anything other than setting up the constant result.
2552 */
2553 if (ir->op == ir_tg4) {
2554 ir_constant *chan = ir->lod_info.component->as_constant();
2555 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2556 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2557 dst_reg result(this, ir->type);
2558 this->result = src_reg(result);
2559 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2560 return;
2561 }
2562 }
2563
2564 /* Should be lowered by do_lower_texture_projection */
2565 assert(!ir->projector);
2566
2567 /* Should be lowered */
2568 assert(!ir->offset || !ir->offset->type->is_array());
2569
2570 /* Generate code to compute all the subexpression trees. This has to be
2571 * done before loading any values into MRFs for the sampler message since
2572 * generating these values may involve SEND messages that need the MRFs.
2573 */
2574 src_reg coordinate;
2575 if (ir->coordinate) {
2576 ir->coordinate->accept(this);
2577 coordinate = this->result;
2578 }
2579
2580 src_reg shadow_comparitor;
2581 if (ir->shadow_comparitor) {
2582 ir->shadow_comparitor->accept(this);
2583 shadow_comparitor = this->result;
2584 }
2585
2586 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2587 src_reg offset_value;
2588 if (has_nonconstant_offset) {
2589 ir->offset->accept(this);
2590 offset_value = src_reg(this->result);
2591 }
2592
2593 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2594 src_reg lod, dPdx, dPdy, sample_index, mcs;
2595 switch (ir->op) {
2596 case ir_tex:
2597 lod = src_reg(0.0f);
2598 lod_type = glsl_type::float_type;
2599 break;
2600 case ir_txf:
2601 case ir_txl:
2602 case ir_txs:
2603 ir->lod_info.lod->accept(this);
2604 lod = this->result;
2605 lod_type = ir->lod_info.lod->type;
2606 break;
2607 case ir_query_levels:
2608 lod = src_reg(0);
2609 lod_type = glsl_type::int_type;
2610 break;
2611 case ir_txf_ms:
2612 ir->lod_info.sample_index->accept(this);
2613 sample_index = this->result;
2614 sample_index_type = ir->lod_info.sample_index->type;
2615
2616 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2617 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2618 else
2619 mcs = src_reg(0u);
2620 break;
2621 case ir_txd:
2622 ir->lod_info.grad.dPdx->accept(this);
2623 dPdx = this->result;
2624
2625 ir->lod_info.grad.dPdy->accept(this);
2626 dPdy = this->result;
2627
2628 lod_type = ir->lod_info.grad.dPdx->type;
2629 break;
2630 case ir_txb:
2631 case ir_lod:
2632 case ir_tg4:
2633 break;
2634 }
2635
2636 enum opcode opcode;
2637 switch (ir->op) {
2638 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2639 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2640 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2641 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2642 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2643 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2644 case ir_tg4: opcode = has_nonconstant_offset
2645 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2646 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2647 case ir_txb:
2648 unreachable("TXB is not valid for vertex shaders.");
2649 case ir_lod:
2650 unreachable("LOD is not valid for vertex shaders.");
2651 default:
2652 unreachable("Unrecognized tex op");
2653 }
2654
2655 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2656 opcode, dst_reg(this, ir->type));
2657
2658 if (ir->offset != NULL && !has_nonconstant_offset) {
2659 inst->offset =
2660 brw_texture_offset(ir->offset->as_constant()->value.i,
2661 ir->offset->type->vector_elements);
2662 }
2663
2664 /* Stuff the channel select bits in the top of the texture offset */
2665 if (ir->op == ir_tg4)
2666 inst->offset |= gather_channel(ir, sampler) << 16;
2667
2668 /* The message header is necessary for:
2669 * - Gen4 (always)
2670 * - Gen9+ for selecting SIMD4x2
2671 * - Texel offsets
2672 * - Gather channel selection
2673 * - Sampler indices too large to fit in a 4-bit value.
2674 */
2675 inst->header_size =
2676 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2677 inst->offset != 0 || ir->op == ir_tg4 ||
2678 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2679 inst->base_mrf = 2;
2680 inst->mlen = inst->header_size + 1; /* always at least one */
2681 inst->dst.writemask = WRITEMASK_XYZW;
2682 inst->shadow_compare = ir->shadow_comparitor != NULL;
2683
2684 inst->src[1] = sampler_reg;
2685
2686 /* MRF for the first parameter */
2687 int param_base = inst->base_mrf + inst->header_size;
2688
2689 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2690 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2691 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2692 } else {
2693 /* Load the coordinate */
2694 /* FINISHME: gl_clamp_mask and saturate */
2695 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2696 int zero_mask = 0xf & ~coord_mask;
2697
2698 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2699 coordinate));
2700
2701 if (zero_mask != 0) {
2702 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2703 src_reg(0)));
2704 }
2705 /* Load the shadow comparitor */
2706 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2707 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2708 WRITEMASK_X),
2709 shadow_comparitor));
2710 inst->mlen++;
2711 }
2712
2713 /* Load the LOD info */
2714 if (ir->op == ir_tex || ir->op == ir_txl) {
2715 int mrf, writemask;
2716 if (devinfo->gen >= 5) {
2717 mrf = param_base + 1;
2718 if (ir->shadow_comparitor) {
2719 writemask = WRITEMASK_Y;
2720 /* mlen already incremented */
2721 } else {
2722 writemask = WRITEMASK_X;
2723 inst->mlen++;
2724 }
2725 } else /* devinfo->gen == 4 */ {
2726 mrf = param_base;
2727 writemask = WRITEMASK_W;
2728 }
2729 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2730 } else if (ir->op == ir_txf) {
2731 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2732 } else if (ir->op == ir_txf_ms) {
2733 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2734 sample_index));
2735 if (devinfo->gen >= 7) {
2736 /* MCS data is in the first channel of `mcs`, but we need to get it into
2737 * the .y channel of the second vec4 of params, so replicate .x across
2738 * the whole vec4 and then mask off everything except .y
2739 */
2740 mcs.swizzle = BRW_SWIZZLE_XXXX;
2741 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2742 mcs));
2743 }
2744 inst->mlen++;
2745 } else if (ir->op == ir_txd) {
2746 const glsl_type *type = lod_type;
2747
2748 if (devinfo->gen >= 5) {
2749 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2750 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2751 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2752 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2753 inst->mlen++;
2754
2755 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2756 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2757 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2758 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2759 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2760 inst->mlen++;
2761
2762 if (ir->shadow_comparitor) {
2763 emit(MOV(dst_reg(MRF, param_base + 2,
2764 ir->shadow_comparitor->type, WRITEMASK_Z),
2765 shadow_comparitor));
2766 }
2767 }
2768 } else /* devinfo->gen == 4 */ {
2769 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2770 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2771 inst->mlen += 2;
2772 }
2773 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2774 if (ir->shadow_comparitor) {
2775 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2776 shadow_comparitor));
2777 }
2778
2779 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2780 offset_value));
2781 inst->mlen++;
2782 }
2783 }
2784
2785 emit(inst);
2786
2787 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2788 * faces * layers, but the spec requires layers.
2789 */
2790 if (ir->op == ir_txs) {
2791 glsl_type const *type = ir->sampler->type;
2792 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2793 type->sampler_array) {
2794 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2795 writemask(inst->dst, WRITEMASK_Z),
2796 src_reg(inst->dst), src_reg(6));
2797 }
2798 }
2799
2800 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2801 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2802 }
2803
2804 swizzle_result(ir, src_reg(inst->dst), sampler);
2805 }
2806
2807 /**
2808 * Apply workarounds for Gen6 gather with UINT/SINT
2809 */
2810 void
2811 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2812 {
2813 if (!wa)
2814 return;
2815
2816 int width = (wa & WA_8BIT) ? 8 : 16;
2817 dst_reg dst_f = dst;
2818 dst_f.type = BRW_REGISTER_TYPE_F;
2819
2820 /* Convert from UNORM to UINT */
2821 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2822 emit(MOV(dst, src_reg(dst_f)));
2823
2824 if (wa & WA_SIGN) {
2825 /* Reinterpret the UINT value as a signed INT value by
2826 * shifting the sign bit into place, then shifting back
2827 * preserving sign.
2828 */
2829 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2830 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2831 }
2832 }
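/* Illustrative note (not part of the upstream code): with WA_8BIT | WA_SIGN,
 * the UNORM value is first scaled by 255 and converted to an integer; a raw
 * byte of 0xff then becomes 0xff000000 after the SHL by 24 and 0xffffffff
 * (i.e. -1) after the arithmetic ASR, recovering the signed interpretation.
 */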
2833
2834 /**
2835 * Set up the gather channel based on the swizzle, for gather4.
2836 */
2837 uint32_t
2838 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2839 {
2840 ir_constant *chan = ir->lod_info.component->as_constant();
2841 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2842 switch (swiz) {
2843 case SWIZZLE_X: return 0;
2844 case SWIZZLE_Y:
2845 /* gather4 sampler is broken for green channel on RG32F --
2846 * we must ask for blue instead.
2847 */
2848 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2849 return 2;
2850 return 1;
2851 case SWIZZLE_Z: return 2;
2852 case SWIZZLE_W: return 3;
2853 default:
2854 unreachable("Not reached"); /* zero, one swizzles handled already */
2855 }
2856 }
2857
2858 void
2859 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2860 {
2861 int s = key->tex.swizzles[sampler];
2862
2863 this->result = src_reg(this, ir->type);
2864 dst_reg swizzled_result(this->result);
2865
2866 if (ir->op == ir_query_levels) {
2867 /* # levels is in .w */
2868 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2869 emit(MOV(swizzled_result, orig_val));
2870 return;
2871 }
2872
2873 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2874 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2875 emit(MOV(swizzled_result, orig_val));
2876 return;
2877 }
2878
2879
2880 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2881 int swizzle[4] = {0};
2882
2883 for (int i = 0; i < 4; i++) {
2884 switch (GET_SWZ(s, i)) {
2885 case SWIZZLE_ZERO:
2886 zero_mask |= (1 << i);
2887 break;
2888 case SWIZZLE_ONE:
2889 one_mask |= (1 << i);
2890 break;
2891 default:
2892 copy_mask |= (1 << i);
2893 swizzle[i] = GET_SWZ(s, i);
2894 break;
2895 }
2896 }
2897
2898 if (copy_mask) {
2899 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2900 swizzled_result.writemask = copy_mask;
2901 emit(MOV(swizzled_result, orig_val));
2902 }
2903
2904 if (zero_mask) {
2905 swizzled_result.writemask = zero_mask;
2906 emit(MOV(swizzled_result, src_reg(0.0f)));
2907 }
2908
2909 if (one_mask) {
2910 swizzled_result.writemask = one_mask;
2911 emit(MOV(swizzled_result, src_reg(1.0f)));
2912 }
2913 }
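/* Illustrative example (not part of the upstream code): for a hypothetical
 * texture swizzle of (R, G, ZERO, ONE), copy_mask ends up as .xy with the
 * identity swizzle, zero_mask as .z and one_mask as .w, so the result is
 * assembled from one swizzled MOV plus one MOV of 0.0f and one MOV of 1.0f.
 */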
2914
2915 void
2916 vec4_visitor::visit(ir_return *)
2917 {
2918 unreachable("not reached");
2919 }
2920
2921 void
2922 vec4_visitor::visit(ir_discard *)
2923 {
2924 unreachable("not reached");
2925 }
2926
2927 void
2928 vec4_visitor::visit(ir_if *ir)
2929 {
2930 /* Don't point the annotation at the if statement, because then it would
2931 * get printed along with the then and else blocks.
2932 */
2933 this->base_ir = ir->condition;
2934
2935 if (devinfo->gen == 6) {
2936 emit_if_gen6(ir);
2937 } else {
2938 enum brw_predicate predicate;
2939 emit_bool_to_cond_code(ir->condition, &predicate);
2940 emit(IF(predicate));
2941 }
2942
2943 visit_instructions(&ir->then_instructions);
2944
2945 if (!ir->else_instructions.is_empty()) {
2946 this->base_ir = ir->condition;
2947 emit(BRW_OPCODE_ELSE);
2948
2949 visit_instructions(&ir->else_instructions);
2950 }
2951
2952 this->base_ir = ir->condition;
2953 emit(BRW_OPCODE_ENDIF);
2954 }
2955
2956 void
2957 vec4_visitor::visit(ir_emit_vertex *)
2958 {
2959 unreachable("not reached");
2960 }
2961
2962 void
2963 vec4_visitor::visit(ir_end_primitive *)
2964 {
2965 unreachable("not reached");
2966 }
2967
2968 void
2969 vec4_visitor::visit(ir_barrier *)
2970 {
2971 unreachable("not reached");
2972 }
2973
2974 void
2975 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2976 dst_reg dst, src_reg offset,
2977 src_reg src0, src_reg src1)
2978 {
2979 unsigned mlen = 0;
2980
2981 /* Set the atomic operation offset. */
2982 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2983 mlen++;
2984
2985 /* Set the atomic operation arguments. */
2986 if (src0.file != BAD_FILE) {
2987 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2988 mlen++;
2989 }
2990
2991 if (src1.file != BAD_FILE) {
2992 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2993 mlen++;
2994 }
2995
2996 /* Emit the instruction. Note that this maps to the normal SIMD8
2997 * untyped atomic message on Ivy Bridge, but that's OK because
2998 * unused channels will be masked out.
2999 */
3000 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3001 brw_message_reg(0),
3002 src_reg(surf_index), src_reg(atomic_op));
3003 inst->mlen = mlen;
3004 }
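/* Illustrative note (not part of the upstream code): for the atomic counter
 * increment/predecrement intrinsics handled earlier, src0 and src1 are left
 * as BAD_FILE, so only the offset is loaded and the message goes out with
 * mlen == 1.
 */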
3005
3006 void
3007 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3008 src_reg offset)
3009 {
3010 /* Set the surface read offset. */
3011 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3012
3013 /* Emit the instruction. Note that this maps to the normal SIMD8
3014 * untyped surface read message, but that's OK because unused
3015 * channels will be masked out.
3016 */
3017 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3018 brw_message_reg(0),
3019 src_reg(surf_index), src_reg(1));
3020 inst->mlen = 1;
3021 }
3022
3023 void
3024 vec4_visitor::emit_ndc_computation()
3025 {
3026 /* Get the position */
3027 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3028
3029 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3030 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3031 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3032
3033 current_annotation = "NDC";
3034 dst_reg ndc_w = ndc;
3035 ndc_w.writemask = WRITEMASK_W;
3036 src_reg pos_w = pos;
3037 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3038 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3039
3040 dst_reg ndc_xyz = ndc;
3041 ndc_xyz.writemask = WRITEMASK_XYZ;
3042
3043 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3044 }
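/* Illustrative note (not part of the upstream code): for a hypothetical clip
 * position of (2, 4, 6, 2), the RCP writes 1/w == 0.5 into ndc.w and the MUL
 * produces ndc.xyz == (1, 2, 3), i.e. the (x/w, y/w, z/w, 1/w) layout noted
 * in the function.
 */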
3045
3046 void
3047 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3048 {
3049 if (devinfo->gen < 6 &&
3050 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3051 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3052 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3053 dst_reg header1_w = header1;
3054 header1_w.writemask = WRITEMASK_W;
3055
3056 emit(MOV(header1, 0u));
3057
3058 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3059 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3060
3061 current_annotation = "Point size";
3062 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3063 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3064 }
3065
3066 if (key->userclip_active) {
3067 current_annotation = "Clipping flags";
3068 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3069 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3070
3071 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3072 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3073 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3074
3075 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3076 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3077 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3078 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3079 }
3080
3081 /* i965 clipping workaround:
3082 * 1) Test for negative rhw
3083 * 2) If set,
3084 * set ndc = (0,0,0,0)
3085 * set ucp[6] = 1
3086 *
3087 * Later, clipping will detect ucp[6] and ensure the primitive is
3088 * clipped against all fixed planes.
3089 */
3090 if (devinfo->has_negative_rhw_bug) {
3091 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3092 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3093 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3094 vec4_instruction *inst;
3095 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3096 inst->predicate = BRW_PREDICATE_NORMAL;
3097 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3098 inst->predicate = BRW_PREDICATE_NORMAL;
3099 }
3100
3101 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3102 } else if (devinfo->gen < 6) {
3103 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3104 } else {
3105 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3106 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3107 dst_reg reg_w = reg;
3108 reg_w.writemask = WRITEMASK_W;
3109 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3110 }
3111 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3112 dst_reg reg_y = reg;
3113 reg_y.writemask = WRITEMASK_Y;
3114 reg_y.type = BRW_REGISTER_TYPE_D;
3115 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3116 }
3117 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3118 dst_reg reg_z = reg;
3119 reg_z.writemask = WRITEMASK_Z;
3120 reg_z.type = BRW_REGISTER_TYPE_D;
3121 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3122 }
3123 }
3124 }
3125
3126 void
3127 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3128 {
3129 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3130 *
3131 * "If a linked set of shaders forming the vertex stage contains no
3132 * static write to gl_ClipVertex or gl_ClipDistance, but the
3133 * application has requested clipping against user clip planes through
3134 * the API, then the coordinate written to gl_Position is used for
3135 * comparison against the user clip planes."
3136 *
3137 * This function is only called if the shader didn't write to
3138 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3139 * if the user wrote to it; otherwise we use gl_Position.
3140 */
3141 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3142 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3143 clip_vertex = VARYING_SLOT_POS;
3144 }
3145
3146 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3147 ++i) {
3148 reg.writemask = 1 << i;
3149 emit(DP4(reg,
3150 src_reg(output_reg[clip_vertex]),
3151 src_reg(this->userplane[i + offset])));
3152 }
3153 }
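/* Illustrative note (not part of the upstream code): each DP4 above writes a
 * single clip distance into one component of reg, so the call with offset 0
 * fills distances 0-3 and the call with offset 4 fills distances 4-7, subject
 * to key->nr_userclip_plane_consts.
 */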
3154
3155 vec4_instruction *
3156 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3157 {
3158 assert (varying < VARYING_SLOT_MAX);
3159 reg.type = output_reg[varying].type;
3160 current_annotation = output_reg_annotation[varying];
3161 /* Copy the register, saturating if necessary */
3162 return emit(MOV(reg, src_reg(output_reg[varying])));
3163 }
3164
3165 void
3166 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3167 {
3168 reg.type = BRW_REGISTER_TYPE_F;
3169
3170 switch (varying) {
3171 case VARYING_SLOT_PSIZ:
3172 {
3173 /* PSIZ is always in slot 0, and is coupled with other flags. */
3174 current_annotation = "indices, point width, clip flags";
3175 emit_psiz_and_flags(reg);
3176 break;
3177 }
3178 case BRW_VARYING_SLOT_NDC:
3179 current_annotation = "NDC";
3180 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3181 break;
3182 case VARYING_SLOT_POS:
3183 current_annotation = "gl_Position";
3184 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3185 break;
3186 case VARYING_SLOT_EDGE:
3187 /* This is present when doing unfilled polygons. We're supposed to copy
3188 * the edge flag from the user-provided vertex array
3189 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3190 * of that attribute (starts as 1.0f). This is then used in clipping to
3191 * determine which edges should be drawn as wireframe.
3192 */
3193 current_annotation = "edge flag";
3194 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3195 glsl_type::float_type, WRITEMASK_XYZW))));
3196 break;
3197 case BRW_VARYING_SLOT_PAD:
3198 /* No need to write to this slot */
3199 break;
3200 case VARYING_SLOT_COL0:
3201 case VARYING_SLOT_COL1:
3202 case VARYING_SLOT_BFC0:
3203 case VARYING_SLOT_BFC1: {
3204 /* These built-in varyings are only supported in compatibility mode,
3205 * and we only support GS in core profile. So, this must be a vertex
3206 * shader.
3207 */
3208 assert(stage == MESA_SHADER_VERTEX);
3209 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3210 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3211 inst->saturate = true;
3212 break;
3213 }
3214
3215 default:
3216 emit_generic_urb_slot(reg, varying);
3217 break;
3218 }
3219 }
3220
3221 static int
3222 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3223 {
3224 if (devinfo->gen >= 6) {
3225 /* URB data written (does not include the message header reg) must
3226 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3227 * section 5.4.3.2.2: URB_INTERLEAVED.
3228 *
3229 * URB entries are allocated on a multiple of 1024 bits, so an
3230 * extra 128 bits written here to make the end align to 256 is
3231 * no problem.
3232 */
3233 if ((mlen % 2) != 1)
3234 mlen++;
3235 }
3236
3237 return mlen;
3238 }
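/* Illustrative note (not part of the upstream code): on Gen6+, an mlen of 4
 * (1 header + 3 data registers) becomes 5 so that the data portion is a
 * multiple of two registers (256 bits); an mlen of 3 is already aligned and
 * is left alone.
 */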
3239
3240
3241 /**
3242 * Generates the VUE payload plus the necessary URB write instructions to
3243 * output it.
3244 *
3245 * The VUE layout is documented in Volume 2a.
3246 */
3247 void
3248 vec4_visitor::emit_vertex()
3249 {
3250 /* MRF 0 is reserved for the debugger, so start with message header
3251 * in MRF 1.
3252 */
3253 int base_mrf = 1;
3254 int mrf = base_mrf;
3255 /* In the process of generating our URB write message contents, we
3256 * may need to unspill a register or load from an array. Those
3257 * reads would use MRFs 14-15.
3258 */
3259 int max_usable_mrf = 13;
3260
3261 /* The following assertion verifies that max_usable_mrf causes an
3262 * even-numbered amount of URB write data, which will meet gen6's
3263 * requirements for length alignment.
3264 */
3265 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3266
3267 /* First mrf is the g0-based message header containing URB handles and
3268 * such.
3269 */
3270 emit_urb_write_header(mrf++);
3271
3272 if (devinfo->gen < 6) {
3273 emit_ndc_computation();
3274 }
3275
3276 /* Lower legacy ff and ClipVertex clipping to clip distances */
3277 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3278 current_annotation = "user clip distances";
3279
3280 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3281 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3282
3283 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3284 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3285 }
3286
3287 /* We may need to split this up into several URB writes, so do them in a
3288 * loop.
3289 */
3290 int slot = 0;
3291 bool complete = false;
3292 do {
3293 /* URB offset is in URB row increments, and each of our MRFs is half of
3294 * one of those, since we're doing interleaved writes.
3295 */
3296 int offset = slot / 2;
3297
3298 mrf = base_mrf + 1;
3299 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3300 emit_urb_slot(dst_reg(MRF, mrf++),
3301 prog_data->vue_map.slot_to_varying[slot]);
3302
3303 /* If this was max_usable_mrf, we can't fit anything more into this
3304 * URB WRITE.
3305 */
3306 if (mrf > max_usable_mrf) {
3307 slot++;
3308 break;
3309 }
3310 }
3311
3312 complete = slot >= prog_data->vue_map.num_slots;
3313 current_annotation = "URB write";
3314 vec4_instruction *inst = emit_urb_write_opcode(complete);
3315 inst->base_mrf = base_mrf;
3316 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3317 inst->offset += offset;
3318 } while(!complete);
3319 }
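/* Illustrative note (not part of the upstream code): with base_mrf == 1 and
 * max_usable_mrf == 13, each pass through the loop above packs up to 12 VUE
 * slots into MRFs 2..13.  A hypothetical VUE map with 18 slots therefore
 * produces two URB writes: slots 0-11 at offset 0, then slots 12-17 with the
 * second write's offset bumped by 12 / 2 == 6 rows.
 */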
3320
3321
3322 src_reg
3323 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3324 src_reg *reladdr, int reg_offset)
3325 {
3326 /* Because we store the values to scratch interleaved like our
3327 * vertex data, we need to scale the vec4 index by 2.
3328 */
3329 int message_header_scale = 2;
3330
3331 /* Pre-gen6, the message header uses byte offsets instead of vec4
3332 * (16-byte) offset units.
3333 */
3334 if (devinfo->gen < 6)
3335 message_header_scale *= 16;
3336
3337 if (reladdr) {
3338 src_reg index = src_reg(this, glsl_type::int_type);
3339
3340 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3341 src_reg(reg_offset)));
3342 emit_before(block, inst, MUL(dst_reg(index), index,
3343 src_reg(message_header_scale)));
3344
3345 return index;
3346 } else {
3347 return src_reg(reg_offset * message_header_scale);
3348 }
3349 }
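/* Illustrative note (not part of the upstream code): with a constant
 * reg_offset of 3 and no reladdr, this returns an immediate of 6 on Gen6+
 * (vec4 rows are interleaved, hence the scale of 2) and 96 on Gen4-5, where
 * the message header wants byte offsets (3 * 2 * 16).
 */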
3350
3351 src_reg
3352 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3353 src_reg *reladdr, int reg_offset)
3354 {
3355 if (reladdr) {
3356 src_reg index = src_reg(this, glsl_type::int_type);
3357
3358 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3359 src_reg(reg_offset)));
3360
3361 /* Pre-gen6, the message header uses byte offsets instead of vec4
3362 * (16-byte) offset units.
3363 */
3364 if (devinfo->gen < 6) {
3365 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3366 }
3367
3368 return index;
3369 } else if (devinfo->gen >= 8) {
3370 /* Store the offset in a GRF so we can send-from-GRF. */
3371 src_reg offset = src_reg(this, glsl_type::int_type);
3372 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3373 return offset;
3374 } else {
3375 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3376 return src_reg(reg_offset * message_header_scale);
3377 }
3378 }
3379
3380 /**
3381 * Emits an instruction before @inst to load the value named by @orig_src
3382 * from scratch space at @base_offset to @temp.
3383 *
3384 * @base_offset is measured in 32-byte units (the size of a register).
3385 */
3386 void
3387 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3388 dst_reg temp, src_reg orig_src,
3389 int base_offset)
3390 {
3391 int reg_offset = base_offset + orig_src.reg_offset;
3392 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3393 reg_offset);
3394
3395 emit_before(block, inst, SCRATCH_READ(temp, index));
3396 }
3397
3398 /**
3399 * Emits an instruction after @inst to store the value to be written
3400 * to @orig_dst to scratch space at @base_offset, from @temp.
3401 *
3402 * @base_offset is measured in 32-byte units (the size of a register).
3403 */
3404 void
3405 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3406 int base_offset)
3407 {
3408 int reg_offset = base_offset + inst->dst.reg_offset;
3409 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3410 reg_offset);
3411
3412 /* Create a temporary register to store *inst's result in.
3413 *
3414 * We have to be careful in MOVing from our temporary result register in
3415 * the scratch write. If we swizzle from channels of the temporary that
3416 * weren't initialized, it will confuse live interval analysis, which will
3417 * make spilling fail to make progress.
3418 */
3419 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3420 inst->dst.type),
3421 brw_swizzle_for_mask(inst->dst.writemask));
3422 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3423 inst->dst.writemask));
3424 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3425 write->predicate = inst->predicate;
3426 write->ir = inst->ir;
3427 write->annotation = inst->annotation;
3428 inst->insert_after(block, write);
3429
3430 inst->dst.file = temp.file;
3431 inst->dst.reg = temp.reg;
3432 inst->dst.reg_offset = temp.reg_offset;
3433 inst->dst.reladdr = NULL;
3434 }
3435
3436 /**
3437 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3438 * adds the scratch read(s) before \p inst. The function also checks for
3439 * recursive reladdr scratch accesses, issuing the corresponding scratch
3440 * loads and rewriting reladdr references accordingly.
3441 *
3442 * \return \p src if it did not require a scratch load, otherwise, the
3443 * register holding the result of the scratch load that the caller should
3444 * use to rewrite src.
3445 */
3446 src_reg
3447 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3448 vec4_instruction *inst, src_reg src)
3449 {
3450 /* Resolve recursive reladdr scratch access by calling ourselves
3451 * with src.reladdr
3452 */
3453 if (src.reladdr)
3454 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3455 *src.reladdr);
3456
3457 /* Now handle scratch access on src */
3458 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3459 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3460 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3461 src.reg = temp.reg;
3462 src.reg_offset = temp.reg_offset;
3463 src.reladdr = NULL;
3464 }
3465
3466 return src;
3467 }
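/* Illustrative example (not part of the upstream code): for a source like
 * a[b[i]] where both a and b were sent to scratch, the recursive call first
 * emits a scratch read for b[i] and rewrites the reladdr to point at that
 * temporary; the outer invocation then emits the scratch read for a itself
 * and returns the GRF holding the loaded value.
 */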
3468
3469 /**
3470 * We can't generally support array access in GRF space, because a
3471 * single instruction's destination can only span 2 contiguous
3472 * registers. So, we send all GRF arrays that get variable index
3473 * access to scratch space.
3474 */
3475 void
3476 vec4_visitor::move_grf_array_access_to_scratch()
3477 {
3478 int scratch_loc[this->alloc.count];
3479 memset(scratch_loc, -1, sizeof(scratch_loc));
3480
3481 /* First, calculate the set of virtual GRFs that need to be punted
3482 * to scratch due to having any array access on them, and where in
3483 * scratch.
3484 */
3485 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3486 if (inst->dst.file == GRF && inst->dst.reladdr) {
3487 if (scratch_loc[inst->dst.reg] == -1) {
3488 scratch_loc[inst->dst.reg] = c->last_scratch;
3489 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3490 }
3491
3492 for (src_reg *iter = inst->dst.reladdr;
3493 iter->reladdr;
3494 iter = iter->reladdr) {
3495 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3496 scratch_loc[iter->reg] = c->last_scratch;
3497 c->last_scratch += this->alloc.sizes[iter->reg];
3498 }
3499 }
3500 }
3501
3502 for (int i = 0 ; i < 3; i++) {
3503 for (src_reg *iter = &inst->src[i];
3504 iter->reladdr;
3505 iter = iter->reladdr) {
3506 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3507 scratch_loc[iter->reg] = c->last_scratch;
3508 c->last_scratch += this->alloc.sizes[iter->reg];
3509 }
3510 }
3511 }
3512 }
3513
3514 /* Now, for anything that will be accessed through scratch, rewrite
3515 * it to load/store. Note that this is a _safe list walk, because
3516 * we may generate a new scratch_write instruction after the one
3517 * we're processing.
3518 */
3519 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3520 /* Set up the annotation tracking for new generated instructions. */
3521 base_ir = inst->ir;
3522 current_annotation = inst->annotation;
3523
3524 /* First handle scratch access on the dst. Notice we have to handle
3525 * the case where the dst's reladdr also points to scratch space.
3526 */
3527 if (inst->dst.reladdr)
3528 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3529 *inst->dst.reladdr);
3530
3531 /* Now that we have handled any (possibly recursive) reladdr scratch
3532 * accesses for dst we can safely do the scratch write for dst itself
3533 */
3534 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3535 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3536
3537 /* Now handle scratch access on any src. In this case, since inst->src[i]
3538 * already is a src_reg, we can just call emit_resolve_reladdr with
3539 * inst->src[i] and it will take care of handling scratch loads for
3540 * both src and src.reladdr (recursively).
3541 */
3542 for (int i = 0 ; i < 3; i++) {
3543 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3544 inst->src[i]);
3545 }
3546 }
3547 }
3548
3549 /**
3550 * Emits an instruction before @inst to load the value named by @orig_src
3551 * from the pull constant buffer (surface) at @base_offset to @temp.
3552 */
3553 void
3554 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3555 dst_reg temp, src_reg orig_src,
3556 int base_offset)
3557 {
3558 int reg_offset = base_offset + orig_src.reg_offset;
3559 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3560 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3561 reg_offset);
3562
3563 emit_pull_constant_load_reg(temp,
3564 index,
3565 offset,
3566 block, inst);
3567 }
3568
3569 /**
3570 * Implements array access of uniforms by inserting a
3571 * PULL_CONSTANT_LOAD instruction.
3572 *
3573 * Unlike temporary GRF array access (where we don't support it due to
3574 * the difficulty of doing relative addressing on instruction
3575 * destinations), we could potentially do array access of uniforms
3576 * that were loaded in GRF space as push constants. In real-world
3577 * usage we've seen, though, the arrays being used are always larger
3578 * than we could load as push constants, so just always move all
3579 * uniform array access out to a pull constant buffer.
3580 */
3581 void
3582 vec4_visitor::move_uniform_array_access_to_pull_constants()
3583 {
3584 int pull_constant_loc[this->uniforms];
3585 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3586 bool nested_reladdr;
3587
3588 /* Walk through and find array access of uniforms. Put a copy of that
3589 * uniform in the pull constant buffer.
3590 *
3591 * Note that we don't move constant-indexed accesses to arrays. No
3592 * testing has been done of the performance impact of this choice.
3593 */
3594 do {
3595 nested_reladdr = false;
3596
3597 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3598 for (int i = 0 ; i < 3; i++) {
3599 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3600 continue;
3601
3602 int uniform = inst->src[i].reg;
3603
3604 if (inst->src[i].reladdr->reladdr)
3605 nested_reladdr = true; /* will need another pass */
3606
3607 /* If this array isn't already present in the pull constant buffer,
3608 * add it.
3609 */
3610 if (pull_constant_loc[uniform] == -1) {
3611 const gl_constant_value **values =
3612 &stage_prog_data->param[uniform * 4];
3613
3614 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3615
3616 assert(uniform < uniform_array_size);
3617 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3618 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3619 = values[j];
3620 }
3621 }
3622
3623 /* Set up the annotation tracking for new generated instructions. */
3624 base_ir = inst->ir;
3625 current_annotation = inst->annotation;
3626
3627 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3628
3629 emit_pull_constant_load(block, inst, temp, inst->src[i],
3630 pull_constant_loc[uniform]);
3631
3632 inst->src[i].file = temp.file;
3633 inst->src[i].reg = temp.reg;
3634 inst->src[i].reg_offset = temp.reg_offset;
3635 inst->src[i].reladdr = NULL;
3636 }
3637 }
3638 } while (nested_reladdr);
3639
3640 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3641 * no need to track them as larger-than-vec4 objects. This will be
3642 * relied on in cutting out unused uniform vectors from push
3643 * constants.
3644 */
3645 split_uniform_registers();
3646 }
3647
3648 void
3649 vec4_visitor::resolve_ud_negate(src_reg *reg)
3650 {
3651 if (reg->type != BRW_REGISTER_TYPE_UD ||
3652 !reg->negate)
3653 return;
3654
3655 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3656 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3657 *reg = temp;
3658 }
3659
3660 /**
3661 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3662 *
3663 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3664 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3665 */
3666 void
3667 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3668 {
3669 assert(devinfo->gen <= 5);
3670
3671 if (!rvalue->type->is_boolean())
3672 return;
3673
3674 src_reg and_result = src_reg(this, rvalue->type);
3675 src_reg neg_result = src_reg(this, rvalue->type);
3676 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3677 emit(MOV(dst_reg(neg_result), negate(and_result)));
3678 *reg = neg_result;
3679 }
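/* Illustrative note (not part of the upstream code): a Gen4-5 CMP result with
 * only its LSB defined, say 0x00000001, is first masked down to 1 by the AND
 * and then turned into ~0 (integer -1) by the MOV of the negated value, which
 * is the canonical "true" the rest of the backend expects.
 */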
3680
3681 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3682 struct brw_vec4_compile *c,
3683 struct gl_program *prog,
3684 const struct brw_vue_prog_key *key,
3685 struct brw_vue_prog_data *prog_data,
3686 struct gl_shader_program *shader_prog,
3687 gl_shader_stage stage,
3688 void *mem_ctx,
3689 bool no_spills,
3690 int shader_time_index)
3691 : backend_shader(compiler, NULL, mem_ctx,
3692 shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 shader_time_index(shader_time_index)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */