i965: Use a single index per shader for shader_time.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
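/* For illustration, ALU2(ADD) above expands to roughly the following builder.
 * Note that these builders only allocate the instruction; callers still hand
 * the result to emit(), e.g. emit(ADD(dst, a, b)).
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */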
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result of the
220 * comparison, leaves the upper bits undefined, and updates the flag
221 * register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
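/* A sketch of the effect: a MAD whose second source is a vec4 uniform would,
 * after this fixup, read a freshly-written temporary instead, roughly
 *
 *    unpack_uniform tmp, u0        (VEC4_OPCODE_UNPACK_UNIFORM)
 *    mad            dst, a, tmp, b
 *
 * so the three-source instruction only ever sees an ordinary GRF source.
 */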
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
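/* As a concrete example of the gen6 special case above: emitting EXP2 into
 * dst.xy becomes roughly
 *
 *    math.exp2 tmp.xyzw, src
 *    mov       dst.xy,   tmp
 *
 * so the align1 MATH never needs a partial writemask, while on gen4/5 the
 * operation goes out as a message and only base_mrf/mlen are filled in.
 */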
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * The upper word of each write-channel must be 0 for the following
417 * bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
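/* Worked example of the sequence above, assuming src0 = vec2(1.0, 2.0):
 * f32to16 leaves tmp.x = 0x00003c00 and tmp.y = 0x00004000 (the half-float
 * encodings, with the upper words cleared as described above). The SHL of
 * tmp.yyyy by 16 gives 0x40000000, and the final OR with tmp.xxxx yields
 * 0x40003c00 in each write-channel of dst, matching
 * packHalf2x16(vec2(1.0, 2.0)).
 */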
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
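/* The constant above is a packed vector-float (VF) immediate: each byte is a
 * restricted 8-bit float (1 sign bit, 3 exponent bits with a bias of 3, and
 * 4 mantissa bits), so 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0
 * and 24.0. The type-converting MOV turns those into the per-channel shift
 * counts <0, 8, 16, 24> consumed by the SHR.
 */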
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
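/* Worked example for one channel of the snorm path above: an input of 0.5 is
 * clamped to [-1, 1], scaled to 63.5, rounded-to-even to 64, and packed as
 * the byte 0x40; an input of -2.0 clamps to -1.0 and packs as -127 (0x81).
 * The unorm variant behaves the same way with a [0, 1] saturate and a scale
 * of 255.
 */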
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
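/* A few examples of the unit returned here (one unit == one vec4 register
 * slot): float, vec2 and vec4 all return 1; mat3 returns 3 and mat4 returns 4
 * (one slot per column); vec4[8] returns 8; struct { vec3 a; float b; }
 * returns 2; samplers and atomic counters return 0 since they occupy no
 * register space.
 */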
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
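/* For example, a "uniform vec3 u" consumes a single uniform vector slot here:
 * uniform_vector_size[] records 3, the first three param[] entries of the
 * slot point at the three stored components, and the fourth points at the
 * shared zero constant so every slot is fully populated.
 */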
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values()
724 {
725 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
726
727 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
728 assert(this->uniforms < uniform_array_size);
729 this->uniform_vector_size[this->uniforms] = 4;
730 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
731 this->userplane[i].type = BRW_REGISTER_TYPE_F;
732 for (int j = 0; j < 4; ++j) {
733 stage_prog_data->param[this->uniforms * 4 + j] =
734 (gl_constant_value *) &clip_planes[i][j];
735 }
736 ++this->uniforms;
737 }
738 }
739
740 /* Our support for builtin uniforms is even scarier than non-builtin.
741 * It sits on top of the PROG_STATE_VAR parameters that are
742 * automatically updated from GL context state.
743 */
744 void
745 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
746 {
747 const ir_state_slot *const slots = ir->get_state_slots();
748 assert(slots != NULL);
749
750 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
751 /* This state reference has already been set up by ir_to_mesa,
752 * but we'll get the same index back here. We can reference
753 * ParameterValues directly, since unlike brw_fs.cpp, we never
754 * add new state references during compile.
755 */
756 int index = _mesa_add_state_reference(this->prog->Parameters,
757 (gl_state_index *)slots[i].tokens);
758 gl_constant_value *values =
759 &this->prog->Parameters->ParameterValues[index][0];
760
761 assert(this->uniforms < uniform_array_size);
762
763 for (unsigned j = 0; j < 4; j++)
764 stage_prog_data->param[this->uniforms * 4 + j] =
765 &values[GET_SWZ(slots[i].swizzle, j)];
766
767 this->uniform_vector_size[this->uniforms] =
768 (ir->type->is_scalar() || ir->type->is_vector() ||
769 ir->type->is_matrix() ? ir->type->vector_elements : 4);
770
771 this->uniforms++;
772 }
773 }
774
775 dst_reg *
776 vec4_visitor::variable_storage(ir_variable *var)
777 {
778 return (dst_reg *)hash_table_find(this->variable_ht, var);
779 }
780
781 void
782 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
783 enum brw_predicate *predicate)
784 {
785 ir_expression *expr = ir->as_expression();
786
787 *predicate = BRW_PREDICATE_NORMAL;
788
789 if (expr && expr->operation != ir_binop_ubo_load) {
790 src_reg op[3];
791 vec4_instruction *inst;
792
793 assert(expr->get_num_operands() <= 3);
794 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
795 expr->operands[i]->accept(this);
796 op[i] = this->result;
797
798 resolve_ud_negate(&op[i]);
799 }
800
801 switch (expr->operation) {
802 case ir_unop_logic_not:
803 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
804 inst->conditional_mod = BRW_CONDITIONAL_Z;
805 break;
806
807 case ir_binop_logic_xor:
808 if (devinfo->gen <= 5) {
809 src_reg temp = src_reg(this, ir->type);
810 emit(XOR(dst_reg(temp), op[0], op[1]));
811 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
812 } else {
813 inst = emit(XOR(dst_null_d(), op[0], op[1]));
814 }
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 if (devinfo->gen <= 5) {
820 src_reg temp = src_reg(this, ir->type);
821 emit(OR(dst_reg(temp), op[0], op[1]));
822 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
823 } else {
824 inst = emit(OR(dst_null_d(), op[0], op[1]));
825 }
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 break;
828
829 case ir_binop_logic_and:
830 if (devinfo->gen <= 5) {
831 src_reg temp = src_reg(this, ir->type);
832 emit(AND(dst_reg(temp), op[0], op[1]));
833 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
834 } else {
835 inst = emit(AND(dst_null_d(), op[0], op[1]));
836 }
837 inst->conditional_mod = BRW_CONDITIONAL_NZ;
838 break;
839
840 case ir_unop_f2b:
841 if (devinfo->gen >= 6) {
842 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
843 } else {
844 inst = emit(MOV(dst_null_f(), op[0]));
845 inst->conditional_mod = BRW_CONDITIONAL_NZ;
846 }
847 break;
848
849 case ir_unop_i2b:
850 if (devinfo->gen >= 6) {
851 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 } else {
853 inst = emit(MOV(dst_null_d(), op[0]));
854 inst->conditional_mod = BRW_CONDITIONAL_NZ;
855 }
856 break;
857
858 case ir_binop_all_equal:
859 if (devinfo->gen <= 5) {
860 resolve_bool_comparison(expr->operands[0], &op[0]);
861 resolve_bool_comparison(expr->operands[1], &op[1]);
862 }
863 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
864 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
865 break;
866
867 case ir_binop_any_nequal:
868 if (devinfo->gen <= 5) {
869 resolve_bool_comparison(expr->operands[0], &op[0]);
870 resolve_bool_comparison(expr->operands[1], &op[1]);
871 }
872 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
873 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
874 break;
875
876 case ir_unop_any:
877 if (devinfo->gen <= 5) {
878 resolve_bool_comparison(expr->operands[0], &op[0]);
879 }
880 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
881 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
882 break;
883
884 case ir_binop_greater:
885 case ir_binop_gequal:
886 case ir_binop_less:
887 case ir_binop_lequal:
888 case ir_binop_equal:
889 case ir_binop_nequal:
890 if (devinfo->gen <= 5) {
891 resolve_bool_comparison(expr->operands[0], &op[0]);
892 resolve_bool_comparison(expr->operands[1], &op[1]);
893 }
894 emit(CMP(dst_null_d(), op[0], op[1],
895 brw_conditional_for_comparison(expr->operation)));
896 break;
897
898 case ir_triop_csel: {
899 /* Expand the boolean condition into the flag register. */
900 inst = emit(MOV(dst_null_d(), op[0]));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902
903 /* Select which boolean to return. */
904 dst_reg temp(this, expr->operands[1]->type);
905 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
906 inst->predicate = BRW_PREDICATE_NORMAL;
907
908 /* Expand the result to a condition code. */
909 inst = emit(MOV(dst_null_d(), src_reg(temp)));
910 inst->conditional_mod = BRW_CONDITIONAL_NZ;
911 break;
912 }
913
914 default:
915 unreachable("not reached");
916 }
917 return;
918 }
919
920 ir->accept(this);
921
922 resolve_ud_negate(&this->result);
923
924 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
925 inst->conditional_mod = BRW_CONDITIONAL_NZ;
926 }
927
928 /**
929 * Emit a gen6 IF statement with the comparison folded into the IF
930 * instruction.
931 */
932 void
933 vec4_visitor::emit_if_gen6(ir_if *ir)
934 {
935 ir_expression *expr = ir->condition->as_expression();
936
937 if (expr && expr->operation != ir_binop_ubo_load) {
938 src_reg op[3];
939 dst_reg temp;
940
941 assert(expr->get_num_operands() <= 3);
942 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
943 expr->operands[i]->accept(this);
944 op[i] = this->result;
945 }
946
947 switch (expr->operation) {
948 case ir_unop_logic_not:
949 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
950 return;
951
952 case ir_binop_logic_xor:
953 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
954 return;
955
956 case ir_binop_logic_or:
957 temp = dst_reg(this, glsl_type::bool_type);
958 emit(OR(temp, op[0], op[1]));
959 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
960 return;
961
962 case ir_binop_logic_and:
963 temp = dst_reg(this, glsl_type::bool_type);
964 emit(AND(temp, op[0], op[1]));
965 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_unop_f2b:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
970 return;
971
972 case ir_unop_i2b:
973 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_greater:
977 case ir_binop_gequal:
978 case ir_binop_less:
979 case ir_binop_lequal:
980 case ir_binop_equal:
981 case ir_binop_nequal:
982 emit(IF(op[0], op[1],
983 brw_conditional_for_comparison(expr->operation)));
984 return;
985
986 case ir_binop_all_equal:
987 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
988 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
989 return;
990
991 case ir_binop_any_nequal:
992 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
993 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
994 return;
995
996 case ir_unop_any:
997 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
998 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
999 return;
1000
1001 case ir_triop_csel: {
1002 /* Expand the boolean condition into the flag register. */
1003 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1004 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1005
1006 /* Select which boolean to return. */
1007 dst_reg temp(this, expr->operands[1]->type);
1008 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1009 inst->predicate = BRW_PREDICATE_NORMAL;
1010
1011 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1012 return;
1013 }
1014
1015 default:
1016 unreachable("not reached");
1017 }
1018 return;
1019 }
1020
1021 ir->condition->accept(this);
1022
1023 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_variable *ir)
1028 {
1029 dst_reg *reg = NULL;
1030
1031 if (variable_storage(ir))
1032 return;
1033
1034 switch (ir->data.mode) {
1035 case ir_var_shader_in:
1036 assert(ir->data.location != -1);
1037 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1038 break;
1039
1040 case ir_var_shader_out:
1041 assert(ir->data.location != -1);
1042 reg = new(mem_ctx) dst_reg(this, ir->type);
1043
1044 for (int i = 0; i < type_size(ir->type); i++) {
1045 output_reg[ir->data.location + i] = *reg;
1046 output_reg[ir->data.location + i].reg_offset = i;
1047 output_reg[ir->data.location + i].type =
1048 brw_type_for_base_type(ir->type->get_scalar_type());
1049 output_reg_annotation[ir->data.location + i] = ir->name;
1050 }
1051 break;
1052
1053 case ir_var_auto:
1054 case ir_var_temporary:
1055 reg = new(mem_ctx) dst_reg(this, ir->type);
1056 break;
1057
1058 case ir_var_uniform:
1059 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1060
1061 /* Thanks to the lower_ubo_reference pass, we will see only
1062 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1063 * variables, so no need for them to be in variable_ht.
1064 *
1065 * Some uniforms, such as samplers and atomic counters, have no actual
1066 * storage, so we should ignore them.
1067 */
1068 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1069 return;
1070
1071 /* Track how big the whole uniform variable is, in case we need to put a
1072 * copy of its data into pull constants for array access.
1073 */
1074 assert(this->uniforms < uniform_array_size);
1075 this->uniform_size[this->uniforms] = type_size(ir->type);
1076
1077 if (!strncmp(ir->name, "gl_", 3)) {
1078 setup_builtin_uniform_values(ir);
1079 } else {
1080 setup_uniform_values(ir);
1081 }
1082 break;
1083
1084 case ir_var_system_value:
1085 reg = make_reg_for_system_value(ir);
1086 break;
1087
1088 default:
1089 unreachable("not reached");
1090 }
1091
1092 reg->type = brw_type_for_base_type(ir->type);
1093 hash_table_insert(this->variable_ht, reg, ir);
1094 }
1095
1096 void
1097 vec4_visitor::visit(ir_loop *ir)
1098 {
1099 /* We don't want debugging output to print the whole body of the
1100 * loop as the annotation.
1101 */
1102 this->base_ir = NULL;
1103
1104 emit(BRW_OPCODE_DO);
1105
1106 visit_instructions(&ir->body_instructions);
1107
1108 emit(BRW_OPCODE_WHILE);
1109 }
1110
1111 void
1112 vec4_visitor::visit(ir_loop_jump *ir)
1113 {
1114 switch (ir->mode) {
1115 case ir_loop_jump::jump_break:
1116 emit(BRW_OPCODE_BREAK);
1117 break;
1118 case ir_loop_jump::jump_continue:
1119 emit(BRW_OPCODE_CONTINUE);
1120 break;
1121 }
1122 }
1123
1124
1125 void
1126 vec4_visitor::visit(ir_function_signature *)
1127 {
1128 unreachable("not reached");
1129 }
1130
1131 void
1132 vec4_visitor::visit(ir_function *ir)
1133 {
1134 /* Ignore function bodies other than main() -- we shouldn't see calls to
1135 * them since they should all be inlined.
1136 */
1137 if (strcmp(ir->name, "main") == 0) {
1138 const ir_function_signature *sig;
1139 exec_list empty;
1140
1141 sig = ir->matching_signature(NULL, &empty, false);
1142
1143 assert(sig);
1144
1145 visit_instructions(&sig->body);
1146 }
1147 }
1148
1149 bool
1150 vec4_visitor::try_emit_mad(ir_expression *ir)
1151 {
1152 /* 3-src instructions were introduced in gen6. */
1153 if (devinfo->gen < 6)
1154 return false;
1155
1156 /* MAD can only handle floating-point data. */
1157 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1158 return false;
1159
1160 ir_rvalue *nonmul;
1161 ir_expression *mul;
1162 bool mul_negate, mul_abs;
1163
1164 for (int i = 0; i < 2; i++) {
1165 mul_negate = false;
1166 mul_abs = false;
1167
1168 mul = ir->operands[i]->as_expression();
1169 nonmul = ir->operands[1 - i];
1170
1171 if (mul && mul->operation == ir_unop_abs) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_abs = true;
1174 } else if (mul && mul->operation == ir_unop_neg) {
1175 mul = mul->operands[0]->as_expression();
1176 mul_negate = true;
1177 }
1178
1179 if (mul && mul->operation == ir_binop_mul)
1180 break;
1181 }
1182
1183 if (!mul || mul->operation != ir_binop_mul)
1184 return false;
1185
1186 nonmul->accept(this);
1187 src_reg src0 = fix_3src_operand(this->result);
1188
1189 mul->operands[0]->accept(this);
1190 src_reg src1 = fix_3src_operand(this->result);
1191 src1.negate ^= mul_negate;
1192 src1.abs = mul_abs;
1193 if (mul_abs)
1194 src1.negate = false;
1195
1196 mul->operands[1]->accept(this);
1197 src_reg src2 = fix_3src_operand(this->result);
1198 src2.abs = mul_abs;
1199 if (mul_abs)
1200 src2.negate = false;
1201
1202 this->result = src_reg(this, ir->type);
1203 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1204
1205 return true;
1206 }
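/* In GLSL terms this matches float expressions of the form a * b + c (in
 * either operand order), including cases where the multiply is wrapped in a
 * negate or abs -- e.g. c - a * b reaches this point as add(c, neg(mul(a, b)))
 * and still becomes a single MAD with the negate folded into one source.
 */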
1207
1208 bool
1209 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1210 {
1211 /* This optimization relies on CMP setting the destination to 0 when
1212 * false. Early hardware only sets the least significant bit, and
1213 * leaves the other bits undefined. So we can't use it.
1214 */
1215 if (devinfo->gen < 6)
1216 return false;
1217
1218 ir_expression *const cmp = ir->operands[0]->as_expression();
1219
1220 if (cmp == NULL)
1221 return false;
1222
1223 switch (cmp->operation) {
1224 case ir_binop_less:
1225 case ir_binop_greater:
1226 case ir_binop_lequal:
1227 case ir_binop_gequal:
1228 case ir_binop_equal:
1229 case ir_binop_nequal:
1230 break;
1231
1232 default:
1233 return false;
1234 }
1235
1236 cmp->operands[0]->accept(this);
1237 const src_reg cmp_src0 = this->result;
1238
1239 cmp->operands[1]->accept(this);
1240 const src_reg cmp_src1 = this->result;
1241
1242 this->result = src_reg(this, ir->type);
1243
1244 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1245 brw_conditional_for_comparison(cmp->operation)));
1246
1247 /* If the comparison is false, this->result will just happen to be zero.
1248 */
1249 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1250 this->result, src_reg(1.0f));
1251 inst->predicate = BRW_PREDICATE_NORMAL;
1252 inst->predicate_inverse = true;
1253
1254 return true;
1255 }
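/* The emitted sequence for something like float(a < b) is therefore roughly
 *
 *    cmp.l.f0  result, a, b        (writes 0 / ~0 and sets the flag)
 *    (-f0) sel result, result, 1.0F
 *
 * Channels where the comparison failed keep the 0.0 the CMP wrote; channels
 * where it passed take the 1.0F source, so the boolean-to-float conversion
 * comes essentially for free.
 */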
1256
1257 void
1258 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1259 src_reg src0, src_reg src1)
1260 {
1261 vec4_instruction *inst;
1262
1263 if (devinfo->gen >= 6) {
1264 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1265 inst->conditional_mod = conditionalmod;
1266 } else {
1267 emit(CMP(dst, src0, src1, conditionalmod));
1268
1269 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1270 inst->predicate = BRW_PREDICATE_NORMAL;
1271 }
1272 }
1273
1274 void
1275 vec4_visitor::emit_lrp(const dst_reg &dst,
1276 const src_reg &x, const src_reg &y, const src_reg &a)
1277 {
1278 if (devinfo->gen >= 6) {
1279 /* Note that the instruction's argument order is reversed from GLSL
1280 * and the IR.
1281 */
1282 emit(LRP(dst,
1283 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1284 } else {
1285 /* Earlier generations don't support three source operations, so we
1286 * need to emit x*(1-a) + y*a.
1287 */
1288 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1289 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1290 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1291 y_times_a.writemask = dst.writemask;
1292 one_minus_a.writemask = dst.writemask;
1293 x_times_one_minus_a.writemask = dst.writemask;
1294
1295 emit(MUL(y_times_a, y, a));
1296 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1297 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1298 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1299 }
1300 }
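/* Numeric check of the pre-gen6 expansion: lrp(x = 2.0, y = 10.0, a = 0.25)
 * computes y*a = 2.5, 1-a = 0.75, x*(1-a) = 1.5, and the final ADD gives 4.0,
 * the same value the gen6+ LRP instruction (with its reversed argument order)
 * produces directly.
 */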
1301
1302 /**
1303 * Emits the instructions needed to perform a pull constant load. before_block
1304 * and before_inst can be NULL, in which case the instructions will be appended
1305 * to the end of the instruction list.
1306 */
1307 void
1308 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1309 src_reg surf_index,
1310 src_reg offset_reg,
1311 bblock_t *before_block,
1312 vec4_instruction *before_inst)
1313 {
1314 assert((before_inst == NULL && before_block == NULL) ||
1315 (before_inst && before_block));
1316
1317 vec4_instruction *pull;
1318
1319 if (devinfo->gen >= 9) {
1320 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1321 src_reg header(this, glsl_type::uvec4_type, 2);
1322
1323 pull = new(mem_ctx)
1324 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1325 dst_reg(header));
1326
1327 if (before_inst)
1328 emit_before(before_block, before_inst, pull);
1329 else
1330 emit(pull);
1331
1332 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1333 offset_reg.type);
1334 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1335
1336 if (before_inst)
1337 emit_before(before_block, before_inst, pull);
1338 else
1339 emit(pull);
1340
1341 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1342 dst,
1343 surf_index,
1344 header);
1345 pull->mlen = 2;
1346 pull->header_size = 1;
1347 } else if (devinfo->gen >= 7) {
1348 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1349
1350 grf_offset.type = offset_reg.type;
1351
1352 pull = MOV(grf_offset, offset_reg);
1353
1354 if (before_inst)
1355 emit_before(before_block, before_inst, pull);
1356 else
1357 emit(pull);
1358
1359 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1360 dst,
1361 surf_index,
1362 src_reg(grf_offset));
1363 pull->mlen = 1;
1364 } else {
1365 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1366 dst,
1367 surf_index,
1368 offset_reg);
1369 pull->base_mrf = 14;
1370 pull->mlen = 1;
1371 }
1372
1373 if (before_inst)
1374 emit_before(before_block, before_inst, pull);
1375 else
1376 emit(pull);
1377 }
1378
1379 void
1380 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1381 {
1382 const src_reg chan_index(this, glsl_type::uint_type);
1383
1384 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1385 ->force_writemask_all = true;
1386 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1387 ->force_writemask_all = true;
1388 }
1389
1390 void
1391 vec4_visitor::visit(ir_expression *ir)
1392 {
1393 unsigned int operand;
1394 src_reg op[ARRAY_SIZE(ir->operands)];
1395 vec4_instruction *inst;
1396
1397 if (ir->operation == ir_binop_add) {
1398 if (try_emit_mad(ir))
1399 return;
1400 }
1401
1402 if (ir->operation == ir_unop_b2f) {
1403 if (try_emit_b2f_of_compare(ir))
1404 return;
1405 }
1406
1407 /* Storage for our result. Ideally for an assignment we'd be using
1408 * the actual storage for the result here, instead.
1409 */
1410 dst_reg result_dst(this, ir->type);
1411 src_reg result_src(result_dst);
1412
1413 if (ir->operation == ir_triop_csel) {
1414 ir->operands[1]->accept(this);
1415 op[1] = this->result;
1416 ir->operands[2]->accept(this);
1417 op[2] = this->result;
1418
1419 enum brw_predicate predicate;
1420 emit_bool_to_cond_code(ir->operands[0], &predicate);
1421 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1422 inst->predicate = predicate;
1423 this->result = result_src;
1424 return;
1425 }
1426
1427 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1428 this->result.file = BAD_FILE;
1429 ir->operands[operand]->accept(this);
1430 if (this->result.file == BAD_FILE) {
1431 fprintf(stderr, "Failed to get tree for expression operand:\n");
1432 ir->operands[operand]->fprint(stderr);
1433 exit(1);
1434 }
1435 op[operand] = this->result;
1436
1437 /* Matrix expression operands should have been broken down to vector
1438 * operations already.
1439 */
1440 assert(!ir->operands[operand]->type->is_matrix());
1441 }
1442
1443 /* If nothing special happens, this is the result. */
1444 this->result = result_src;
1445
1446 switch (ir->operation) {
1447 case ir_unop_logic_not:
1448 emit(NOT(result_dst, op[0]));
1449 break;
1450 case ir_unop_neg:
1451 op[0].negate = !op[0].negate;
1452 emit(MOV(result_dst, op[0]));
1453 break;
1454 case ir_unop_abs:
1455 op[0].abs = true;
1456 op[0].negate = false;
1457 emit(MOV(result_dst, op[0]));
1458 break;
1459
1460 case ir_unop_sign:
1461 if (ir->type->is_float()) {
1462 /* AND(val, 0x80000000) gives the sign bit.
1463 *
1464 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1465 * zero.
1466 */
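         /* Walking one value through this: for op[0] = -3.5f the CMP sets the
          * flag (the value is nonzero), the AND leaves 0x80000000 in the
          * result, and the predicated OR with 0x3f800000 produces 0xbf800000,
          * i.e. -1.0f. For op[0] = 0.0f the flag stays clear and the result
          * remains 0.
          */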
1467 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1468
1469 op[0].type = BRW_REGISTER_TYPE_UD;
1470 result_dst.type = BRW_REGISTER_TYPE_UD;
1471 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1472
1473 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1474 inst->predicate = BRW_PREDICATE_NORMAL;
1475
1476 this->result.type = BRW_REGISTER_TYPE_F;
1477 } else {
1478 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1479 * -> non-negative val generates 0x00000000.
1480 * Predicated OR sets 1 if val is positive.
1481 */
1482 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1483
1484 emit(ASR(result_dst, op[0], src_reg(31)));
1485
1486 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1487 inst->predicate = BRW_PREDICATE_NORMAL;
1488 }
1489 break;
1490
1491 case ir_unop_rcp:
1492 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1493 break;
1494
1495 case ir_unop_exp2:
1496 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1497 break;
1498 case ir_unop_log2:
1499 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1500 break;
1501 case ir_unop_exp:
1502 case ir_unop_log:
1503 unreachable("not reached: should be handled by ir_explog_to_explog2");
1504 case ir_unop_sin:
1505 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1506 break;
1507 case ir_unop_cos:
1508 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1509 break;
1510
1511 case ir_unop_dFdx:
1512 case ir_unop_dFdx_coarse:
1513 case ir_unop_dFdx_fine:
1514 case ir_unop_dFdy:
1515 case ir_unop_dFdy_coarse:
1516 case ir_unop_dFdy_fine:
1517 unreachable("derivatives not valid in vertex shader");
1518
1519 case ir_unop_bitfield_reverse:
1520 emit(BFREV(result_dst, op[0]));
1521 break;
1522 case ir_unop_bit_count:
1523 emit(CBIT(result_dst, op[0]));
1524 break;
1525 case ir_unop_find_msb: {
1526 src_reg temp = src_reg(this, glsl_type::uint_type);
1527
1528 inst = emit(FBH(dst_reg(temp), op[0]));
1529 inst->dst.writemask = WRITEMASK_XYZW;
1530
1531 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1532 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1533 * subtract the result from 31 to convert the MSB count into an LSB count.
1534 */
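      /* E.g. findMSB(0x00000100): FBH returns 23 (counted down from bit 31),
       * the CMP below confirms that is not the 0xFFFFFFFF "no bit found"
       * value, and the predicated 31 - 23 gives the expected answer 8.
       */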
1535
1536 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1537 temp.swizzle = BRW_SWIZZLE_NOOP;
1538 emit(MOV(result_dst, temp));
1539
1540 src_reg src_tmp = src_reg(result_dst);
1541 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1542
1543 src_tmp.negate = true;
1544 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1545 inst->predicate = BRW_PREDICATE_NORMAL;
1546 break;
1547 }
1548 case ir_unop_find_lsb:
1549 emit(FBL(result_dst, op[0]));
1550 break;
1551 case ir_unop_saturate:
1552 inst = emit(MOV(result_dst, op[0]));
1553 inst->saturate = true;
1554 break;
1555
1556 case ir_unop_noise:
1557 unreachable("not reached: should be handled by lower_noise");
1558
1559 case ir_binop_add:
1560 emit(ADD(result_dst, op[0], op[1]));
1561 break;
1562 case ir_binop_sub:
1563 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1564
1565 case ir_binop_mul:
1566 if (devinfo->gen < 8 && ir->type->is_integer()) {
1567 /* For integer multiplication, the MUL uses the low 16 bits of one of
1568 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1569 * accumulates the contribution of the upper 16 bits of that
1570 * operand. If we can determine that one of the args is in the low
1571 * 16 bits, though, we can just emit a single MUL.
1572 */
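         /* For example, with "x * 42" the constant fits in 16 bits, so it is
          * placed in the partial-precision source slot (src0 before gen7,
          * src1 on gen7) and a single MUL produces the full 32-bit result;
          * the general case below needs the MUL/MACH/MOV trio through the
          * accumulator.
          */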
1573 if (ir->operands[0]->is_uint16_constant()) {
1574 if (devinfo->gen < 7)
1575 emit(MUL(result_dst, op[0], op[1]));
1576 else
1577 emit(MUL(result_dst, op[1], op[0]));
1578 } else if (ir->operands[1]->is_uint16_constant()) {
1579 if (devinfo->gen < 7)
1580 emit(MUL(result_dst, op[1], op[0]));
1581 else
1582 emit(MUL(result_dst, op[0], op[1]));
1583 } else {
1584 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1585
1586 emit(MUL(acc, op[0], op[1]));
1587 emit(MACH(dst_null_d(), op[0], op[1]));
1588 emit(MOV(result_dst, src_reg(acc)));
1589 }
1590 } else {
1591 emit(MUL(result_dst, op[0], op[1]));
1592 }
1593 break;
1594 case ir_binop_imul_high: {
1595 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1596
1597 emit(MUL(acc, op[0], op[1]));
1598 emit(MACH(result_dst, op[0], op[1]));
1599 break;
1600 }
1601 case ir_binop_div:
1602 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1603 assert(ir->type->is_integer());
1604 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1605 break;
1606 case ir_binop_carry: {
1607 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1608
1609 emit(ADDC(dst_null_ud(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 break;
1612 }
1613 case ir_binop_borrow: {
1614 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1615
1616 emit(SUBB(dst_null_ud(), op[0], op[1]));
1617 emit(MOV(result_dst, src_reg(acc)));
1618 break;
1619 }
1620 case ir_binop_mod:
1621 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1622 assert(ir->type->is_integer());
1623 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1624 break;
1625
1626 case ir_binop_less:
1627 case ir_binop_greater:
1628 case ir_binop_lequal:
1629 case ir_binop_gequal:
1630 case ir_binop_equal:
1631 case ir_binop_nequal: {
1632 if (devinfo->gen <= 5) {
1633 resolve_bool_comparison(ir->operands[0], &op[0]);
1634 resolve_bool_comparison(ir->operands[1], &op[1]);
1635 }
1636 emit(CMP(result_dst, op[0], op[1],
1637 brw_conditional_for_comparison(ir->operation)));
1638 break;
1639 }
1640
1641 case ir_binop_all_equal:
1642 if (devinfo->gen <= 5) {
1643 resolve_bool_comparison(ir->operands[0], &op[0]);
1644 resolve_bool_comparison(ir->operands[1], &op[1]);
1645 }
1646
1647 /* "==" operator producing a scalar boolean. */
1648 if (ir->operands[0]->type->is_vector() ||
1649 ir->operands[1]->type->is_vector()) {
1650 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1651 emit(MOV(result_dst, src_reg(0)));
1652 inst = emit(MOV(result_dst, src_reg(~0)));
1653 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1654 } else {
1655 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1656 }
1657 break;
1658 case ir_binop_any_nequal:
1659 if (devinfo->gen <= 5) {
1660 resolve_bool_comparison(ir->operands[0], &op[0]);
1661 resolve_bool_comparison(ir->operands[1], &op[1]);
1662 }
1663
1664 /* "!=" operator producing a scalar boolean. */
1665 if (ir->operands[0]->type->is_vector() ||
1666 ir->operands[1]->type->is_vector()) {
1667 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1668
1669 emit(MOV(result_dst, src_reg(0)));
1670 inst = emit(MOV(result_dst, src_reg(~0)));
1671 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1672 } else {
1673 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1674 }
1675 break;
1676
1677 case ir_unop_any:
1678 if (devinfo->gen <= 5) {
1679 resolve_bool_comparison(ir->operands[0], &op[0]);
1680 }
1681 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1682 emit(MOV(result_dst, src_reg(0)));
1683
1684 inst = emit(MOV(result_dst, src_reg(~0)));
1685 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1686 break;
1687
1688 case ir_binop_logic_xor:
1689 emit(XOR(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_logic_or:
1693 emit(OR(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_logic_and:
1697 emit(AND(result_dst, op[0], op[1]));
1698 break;
1699
1700 case ir_binop_dot:
1701 assert(ir->operands[0]->type->is_vector());
1702 assert(ir->operands[0]->type == ir->operands[1]->type);
1703 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1704 break;
1705
1706 case ir_unop_sqrt:
1707 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1708 break;
1709 case ir_unop_rsq:
1710 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1711 break;
1712
1713 case ir_unop_bitcast_i2f:
1714 case ir_unop_bitcast_u2f:
1715 this->result = op[0];
1716 this->result.type = BRW_REGISTER_TYPE_F;
1717 break;
1718
1719 case ir_unop_bitcast_f2i:
1720 this->result = op[0];
1721 this->result.type = BRW_REGISTER_TYPE_D;
1722 break;
1723
1724 case ir_unop_bitcast_f2u:
1725 this->result = op[0];
1726 this->result.type = BRW_REGISTER_TYPE_UD;
1727 break;
1728
1729 case ir_unop_i2f:
1730 case ir_unop_i2u:
1731 case ir_unop_u2i:
1732 case ir_unop_u2f:
1733 case ir_unop_f2i:
1734 case ir_unop_f2u:
1735 emit(MOV(result_dst, op[0]));
1736 break;
1737 case ir_unop_b2i:
1738 emit(AND(result_dst, op[0], src_reg(1)));
1739 break;
1740 case ir_unop_b2f:
1741 if (devinfo->gen <= 5) {
1742 resolve_bool_comparison(ir->operands[0], &op[0]);
1743 }
1744 op[0].type = BRW_REGISTER_TYPE_D;
1745 result_dst.type = BRW_REGISTER_TYPE_D;
1746 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1747 result_dst.type = BRW_REGISTER_TYPE_F;
1748 break;
1749 case ir_unop_f2b:
1750 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1751 break;
1752 case ir_unop_i2b:
1753 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1754 break;
1755
1756 case ir_unop_trunc:
1757 emit(RNDZ(result_dst, op[0]));
1758 break;
1759 case ir_unop_ceil: {
1760 src_reg tmp = src_reg(this, ir->type);
1761 op[0].negate = !op[0].negate;
1762 emit(RNDD(dst_reg(tmp), op[0]));
1763 tmp.negate = true;
1764 emit(MOV(result_dst, tmp));
1765 }
1766 break;
1767 case ir_unop_floor:
1768 inst = emit(RNDD(result_dst, op[0]));
1769 break;
1770 case ir_unop_fract:
1771 inst = emit(FRC(result_dst, op[0]));
1772 break;
1773 case ir_unop_round_even:
1774 emit(RNDE(result_dst, op[0]));
1775 break;
1776
1777 case ir_binop_min:
1778 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1779 break;
1780 case ir_binop_max:
1781 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1782 break;
1783
1784 case ir_binop_pow:
1785 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1786 break;
1787
1788 case ir_unop_bit_not:
1789 inst = emit(NOT(result_dst, op[0]));
1790 break;
1791 case ir_binop_bit_and:
1792 inst = emit(AND(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_xor:
1795 inst = emit(XOR(result_dst, op[0], op[1]));
1796 break;
1797 case ir_binop_bit_or:
1798 inst = emit(OR(result_dst, op[0], op[1]));
1799 break;
1800
1801 case ir_binop_lshift:
1802 inst = emit(SHL(result_dst, op[0], op[1]));
1803 break;
1804
1805 case ir_binop_rshift:
1806 if (ir->type->base_type == GLSL_TYPE_INT)
1807 inst = emit(ASR(result_dst, op[0], op[1]));
1808 else
1809 inst = emit(SHR(result_dst, op[0], op[1]));
1810 break;
1811
1812 case ir_binop_bfm:
1813 emit(BFI1(result_dst, op[0], op[1]));
1814 break;
1815
1816 case ir_binop_ubo_load: {
1817 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1818 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1819 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1820 src_reg offset;
1821
1822 /* Now, load the vector from that offset. */
1823 assert(ir->type->is_vector() || ir->type->is_scalar());
1824
1825 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1826 packed_consts.type = result.type;
1827 src_reg surf_index;
1828
1829 if (const_uniform_block) {
1830 /* The block index is a constant, so just emit the binding table entry
1831 * as an immediate.
1832 */
1833 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1834 const_uniform_block->value.u[0]);
1835 } else {
1836 /* The block index is not a constant. Evaluate the index expression
1837 * per-channel and add the base UBO index; we have to select a value
1838 * from any live channel.
1839 */
1840 surf_index = src_reg(this, glsl_type::uint_type);
1841 emit(ADD(dst_reg(surf_index), op[0],
1842 src_reg(prog_data->base.binding_table.ubo_start)));
1843 emit_uniformize(dst_reg(surf_index), surf_index);
1844
1845 /* Assume this may touch any UBO. It would be nice to provide
1846 * a tighter bound, but the array information is already lowered away.
1847 */
1848 brw_mark_surface_used(&prog_data->base,
1849 prog_data->base.binding_table.ubo_start +
1850 shader_prog->NumUniformBlocks - 1);
1851 }
1852
1853 if (const_offset_ir) {
1854 if (devinfo->gen >= 8) {
1855 /* Store the offset in a GRF so we can send-from-GRF. */
1856 offset = src_reg(this, glsl_type::int_type);
1857 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1858 } else {
1859 /* Immediates are fine on older generations since they'll be moved
1860 * to a (potentially fake) MRF at the generator level.
1861 */
1862 offset = src_reg(const_offset / 16);
1863 }
1864 } else {
1865 offset = src_reg(this, glsl_type::uint_type);
1866 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1867 }
1868
1869 emit_pull_constant_load_reg(dst_reg(packed_consts),
1870 surf_index,
1871 offset,
1872 NULL, NULL /* before_block/inst */);
1873
1874 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1875 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1876 const_offset % 16 / 4,
1877 const_offset % 16 / 4,
1878 const_offset % 16 / 4);
1879
1880 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1881 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1882 emit(CMP(result_dst, packed_consts, src_reg(0u),
1883 BRW_CONDITIONAL_NZ));
1884 } else {
1885 emit(MOV(result_dst, packed_consts));
1886 }
1887 break;
1888 }
1889
1890 case ir_binop_vector_extract:
1891 unreachable("should have been lowered by vec_index_to_cond_assign");
1892
1893 case ir_triop_fma:
1894 op[0] = fix_3src_operand(op[0]);
1895 op[1] = fix_3src_operand(op[1]);
1896 op[2] = fix_3src_operand(op[2]);
1897 /* Note that the instruction's argument order is reversed from GLSL
1898 * and the IR.
1899 */
1900 emit(MAD(result_dst, op[2], op[1], op[0]));
1901 break;
1902
1903 case ir_triop_lrp:
1904 emit_lrp(result_dst, op[0], op[1], op[2]);
1905 break;
1906
1907 case ir_triop_csel:
1908 unreachable("already handled above");
1909 break;
1910
1911 case ir_triop_bfi:
1912 op[0] = fix_3src_operand(op[0]);
1913 op[1] = fix_3src_operand(op[1]);
1914 op[2] = fix_3src_operand(op[2]);
1915 emit(BFI2(result_dst, op[0], op[1], op[2]));
1916 break;
1917
1918 case ir_triop_bitfield_extract:
1919 op[0] = fix_3src_operand(op[0]);
1920 op[1] = fix_3src_operand(op[1]);
1921 op[2] = fix_3src_operand(op[2]);
1922 /* Note that the instruction's argument order is reversed from GLSL
1923 * and the IR.
1924 */
1925 emit(BFE(result_dst, op[2], op[1], op[0]));
1926 break;
1927
1928 case ir_triop_vector_insert:
1929 unreachable("should have been lowered by lower_vector_insert");
1930
1931 case ir_quadop_bitfield_insert:
1932 unreachable("not reached: should be handled by "
1933 "bitfield_insert_to_bfm_bfi\n");
1934
1935 case ir_quadop_vector:
1936 unreachable("not reached: should be handled by lower_quadop_vector");
1937
1938 case ir_unop_pack_half_2x16:
1939 emit_pack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_half_2x16:
1942 emit_unpack_half_2x16(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_unorm_4x8:
1945 emit_unpack_unorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_unpack_snorm_4x8:
1948 emit_unpack_snorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_unorm_4x8:
1951 emit_pack_unorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_4x8:
1954 emit_pack_snorm_4x8(result_dst, op[0]);
1955 break;
1956 case ir_unop_pack_snorm_2x16:
1957 case ir_unop_pack_unorm_2x16:
1958 case ir_unop_unpack_snorm_2x16:
1959 case ir_unop_unpack_unorm_2x16:
1960 unreachable("not reached: should be handled by lower_packing_builtins");
1961 case ir_unop_unpack_half_2x16_split_x:
1962 case ir_unop_unpack_half_2x16_split_y:
1963 case ir_binop_pack_half_2x16_split:
1964 case ir_unop_interpolate_at_centroid:
1965 case ir_binop_interpolate_at_sample:
1966 case ir_binop_interpolate_at_offset:
1967 unreachable("not reached: should not occur in vertex shader");
1968 case ir_binop_ldexp:
1969 unreachable("not reached: should be handled by ldexp_to_arith()");
1970 case ir_unop_d2f:
1971 case ir_unop_f2d:
1972 case ir_unop_d2i:
1973 case ir_unop_i2d:
1974 case ir_unop_d2u:
1975 case ir_unop_u2d:
1976 case ir_unop_d2b:
1977 case ir_unop_pack_double_2x32:
1978 case ir_unop_unpack_double_2x32:
1979 case ir_unop_frexp_sig:
1980 case ir_unop_frexp_exp:
1981 unreachable("fp64 todo");
1982 }
1983 }
1984
1985
1986 void
1987 vec4_visitor::visit(ir_swizzle *ir)
1988 {
1989 /* Note that this is only swizzles in expressions, not those on the left
1990 * hand side of an assignment, which do write masking. See ir_assignment
1991 * for that.
1992 */
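/* Compose the IR's swizzle with the size-based swizzle so that channels
 * beyond the value's width replicate the last valid component, which is
 * the layout the rest of the backend expects for sub-vec4 values.
 */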
1993 const unsigned swz = brw_compose_swizzle(
1994 brw_swizzle_for_size(ir->type->vector_elements),
1995 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1996
1997 ir->val->accept(this);
1998 this->result = swizzle(this->result, swz);
1999 }
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_variable *ir)
2003 {
2004 const struct glsl_type *type = ir->type;
2005 dst_reg *reg = variable_storage(ir->var);
2006
2007 if (!reg) {
2008 fail("Failed to find variable storage for %s\n", ir->var->name);
2009 this->result = src_reg(brw_null_reg());
2010 return;
2011 }
2012
2013 this->result = src_reg(*reg);
2014
2015 /* System values get their swizzle from the dst_reg writemask */
2016 if (ir->var->data.mode == ir_var_system_value)
2017 return;
2018
2019 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2020 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2021 }
2022
2023
2024 int
2025 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2026 {
2027 /* Under normal circumstances array elements are stored consecutively, so
2028 * the stride is equal to the size of the array element.
2029 */
2030 return type_size(ir->type);
2031 }
2032
2033
2034 void
2035 vec4_visitor::visit(ir_dereference_array *ir)
2036 {
2037 ir_constant *constant_index;
2038 src_reg src;
2039 int array_stride = compute_array_stride(ir);
2040
2041 constant_index = ir->array_index->constant_expression_value();
2042
2043 ir->array->accept(this);
2044 src = this->result;
2045
2046 if (constant_index) {
2047 src.reg_offset += constant_index->value.i[0] * array_stride;
2048 } else {
2049 /* Variable index array dereference. The access takes the vec4 at
2050 * the base of the array plus a computed index that offsets the
2051 * register number.
2052 */
2053 ir->array_index->accept(this);
2054
2055 src_reg index_reg;
2056
2057 if (array_stride == 1) {
2058 index_reg = this->result;
2059 } else {
2060 index_reg = src_reg(this, glsl_type::int_type);
2061
2062 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2063 }
2064
2065 if (src.reladdr) {
2066 src_reg temp = src_reg(this, glsl_type::int_type);
2067
2068 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2069
2070 index_reg = temp;
2071 }
2072
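/* Record the computed index as the register's reladdr so later passes
 * know this access uses relative addressing (and will move the array to
 * scratch or a pull buffer as needed).
 */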
2073 src.reladdr = ralloc(mem_ctx, src_reg);
2074 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2075 }
2076
2077 /* If the type is smaller than a vec4, replicate the last channel out. */
2078 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2079 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2080 else
2081 src.swizzle = BRW_SWIZZLE_NOOP;
2082 src.type = brw_type_for_base_type(ir->type);
2083
2084 this->result = src;
2085 }
2086
2087 void
2088 vec4_visitor::visit(ir_dereference_record *ir)
2089 {
2090 unsigned int i;
2091 const glsl_type *struct_type = ir->record->type;
2092 int offset = 0;
2093
2094 ir->record->accept(this);
2095
2096 for (i = 0; i < struct_type->length; i++) {
2097 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2098 break;
2099 offset += type_size(struct_type->fields.structure[i].type);
2100 }
2101
2102 /* If the type is smaller than a vec4, replicate the last channel out. */
2103 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105 else
2106 this->result.swizzle = BRW_SWIZZLE_NOOP;
2107 this->result.type = brw_type_for_base_type(ir->type);
2108
2109 this->result.reg_offset += offset;
2110 }
2111
2112 /**
2113 * We want to be careful in assignment setup to hit the actual storage
2114 * instead of potentially using a temporary like we might with the
2115 * ir_dereference handler.
2116 */
2117 static dst_reg
2118 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2119 {
2120 /* The LHS must be a dereference. If the LHS is a variable indexed array
2121 * access of a vector, it must be separated into a series of conditional moves
2122 * before reaching this point (see ir_vec_index_to_cond_assign).
2123 */
2124 assert(ir->as_dereference());
2125 ir_dereference_array *deref_array = ir->as_dereference_array();
2126 if (deref_array) {
2127 assert(!deref_array->array->type->is_vector());
2128 }
2129
2130 /* Use the rvalue deref handler for the most part. We'll ignore
2131 * swizzles in it and write swizzles using writemask, though.
2132 */
2133 ir->accept(v);
2134 return dst_reg(v->result);
2135 }
2136
2137 void
2138 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2139 const struct glsl_type *type,
2140 enum brw_predicate predicate)
2141 {
2142 if (type->base_type == GLSL_TYPE_STRUCT) {
2143 for (unsigned int i = 0; i < type->length; i++) {
2144 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2145 }
2146 return;
2147 }
2148
2149 if (type->is_array()) {
2150 for (unsigned int i = 0; i < type->length; i++) {
2151 emit_block_move(dst, src, type->fields.array, predicate);
2152 }
2153 return;
2154 }
2155
2156 if (type->is_matrix()) {
2157 const struct glsl_type *vec_type;
2158
2159 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2160 type->vector_elements, 1);
2161
2162 for (int i = 0; i < type->matrix_columns; i++) {
2163 emit_block_move(dst, src, vec_type, predicate);
2164 }
2165 return;
2166 }
2167
2168 assert(type->is_scalar() || type->is_vector());
2169
2170 dst->type = brw_type_for_base_type(type);
2171 src->type = dst->type;
2172
2173 dst->writemask = (1 << type->vector_elements) - 1;
2174
2175 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2176
2177 vec4_instruction *inst = emit(MOV(*dst, *src));
2178 inst->predicate = predicate;
2179
2180 dst->reg_offset++;
2181 src->reg_offset++;
2182 }
2183
2184
2185 /* If the RHS processing resulted in an instruction generating a
2186 * temporary value, and it would be easy to rewrite the instruction to
2187 * generate its result right into the LHS instead, do so. This ends
2188 * up reliably removing instructions where it can be tricky to do so
2189 * later without real UD chain information.
2190 */
2191 bool
2192 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2193 dst_reg dst,
2194 src_reg src,
2195 vec4_instruction *pre_rhs_inst,
2196 vec4_instruction *last_rhs_inst)
2197 {
2198 /* This could be supported, but it would take more smarts. */
2199 if (ir->condition)
2200 return false;
2201
2202 if (pre_rhs_inst == last_rhs_inst)
2203 return false; /* No instructions generated to work with. */
2204
2205 /* Make sure the last instruction generated our source reg. */
2206 if (src.file != GRF ||
2207 src.file != last_rhs_inst->dst.file ||
2208 src.reg != last_rhs_inst->dst.reg ||
2209 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2210 src.reladdr ||
2211 src.abs ||
2212 src.negate ||
2213 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2214 return false;
2215
2216 /* Check that the last instruction fully initialized the channels
2217 * we want to use, in the order we want to use them. We could
2218 * potentially reswizzle the operands of many instructions so that
2219 * we could handle out of order channels, but don't yet.
2220 */
2221
2222 for (unsigned i = 0; i < 4; i++) {
2223 if (dst.writemask & (1 << i)) {
2224 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2225 return false;
2226
2227 if (BRW_GET_SWZ(src.swizzle, i) != i)
2228 return false;
2229 }
2230 }
2231
2232 /* Success! Rewrite the instruction. */
2233 last_rhs_inst->dst.file = dst.file;
2234 last_rhs_inst->dst.reg = dst.reg;
2235 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2236 last_rhs_inst->dst.reladdr = dst.reladdr;
2237 last_rhs_inst->dst.writemask &= dst.writemask;
2238
2239 return true;
2240 }
2241
2242 void
2243 vec4_visitor::visit(ir_assignment *ir)
2244 {
2245 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2246 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2247
2248 if (!ir->lhs->type->is_scalar() &&
2249 !ir->lhs->type->is_vector()) {
2250 ir->rhs->accept(this);
2251 src_reg src = this->result;
2252
2253 if (ir->condition) {
2254 emit_bool_to_cond_code(ir->condition, &predicate);
2255 }
2256
2257 /* emit_block_move doesn't account for swizzles in the source register.
2258 * This should be ok, since the source register is a structure or an
2259 * array, and those can't be swizzled. But double-check to be sure.
2260 */
2261 assert(src.swizzle ==
2262 (ir->rhs->type->is_matrix()
2263 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2264 : BRW_SWIZZLE_NOOP));
2265
2266 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2267 return;
2268 }
2269
2270 /* Now we're down to just a scalar/vector with writemasks. */
2271 int i;
2272
2273 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2274 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2275
2276 ir->rhs->accept(this);
2277
2278 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2279
2280 int swizzles[4];
2281 int src_chan = 0;
2282
2283 assert(ir->lhs->type->is_vector() ||
2284 ir->lhs->type->is_scalar());
2285 dst.writemask = ir->write_mask;
2286
2287 /* Swizzle a small RHS vector into the channels being written.
2288 *
2289 * GLSL IR treats write_mask as dictating how many channels are
2290 * present on the RHS, while in our instructions we need to make
2291 * those channels appear in the slots of the vec4 they're written to.
2292 */
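/* For example, assigning a vec2 RHS to the .yz channels of the LHS gives
 * write_mask = YZ, so swizzles[] becomes {0, 0, 1, 0} and the source is
 * read as .xxyx: RHS.x lands in dst.y and RHS.y lands in dst.z.
 */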
2293 for (int i = 0; i < 4; i++)
2294 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2295
2296 src_reg src = swizzle(this->result,
2297 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2298 swizzles[2], swizzles[3]));
2299
2300 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2301 return;
2302 }
2303
2304 if (ir->condition) {
2305 emit_bool_to_cond_code(ir->condition, &predicate);
2306 }
2307
2308 for (i = 0; i < type_size(ir->lhs->type); i++) {
2309 vec4_instruction *inst = emit(MOV(dst, src));
2310 inst->predicate = predicate;
2311
2312 dst.reg_offset++;
2313 src.reg_offset++;
2314 }
2315 }
2316
2317 void
2318 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2319 {
2320 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2321 foreach_in_list(ir_constant, field_value, &ir->components) {
2322 emit_constant_values(dst, field_value);
2323 }
2324 return;
2325 }
2326
2327 if (ir->type->is_array()) {
2328 for (unsigned int i = 0; i < ir->type->length; i++) {
2329 emit_constant_values(dst, ir->array_elements[i]);
2330 }
2331 return;
2332 }
2333
2334 if (ir->type->is_matrix()) {
2335 for (int i = 0; i < ir->type->matrix_columns; i++) {
2336 float *vec = &ir->value.f[i * ir->type->vector_elements];
2337
2338 for (int j = 0; j < ir->type->vector_elements; j++) {
2339 dst->writemask = 1 << j;
2340 dst->type = BRW_REGISTER_TYPE_F;
2341
2342 emit(MOV(*dst, src_reg(vec[j])));
2343 }
2344 dst->reg_offset++;
2345 }
2346 return;
2347 }
2348
2349 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2350
2351 for (int i = 0; i < ir->type->vector_elements; i++) {
2352 if (!(remaining_writemask & (1 << i)))
2353 continue;
2354
2355 dst->writemask = 1 << i;
2356 dst->type = brw_type_for_base_type(ir->type);
2357
2358 /* Find other components that match the one we're about to
2359 * write. Emits fewer instructions for things like vec4(0.5,
2360 * 1.5, 1.5, 1.5).
2361 */
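/* For that example the first pass emits MOV dst.x, 0.5f and the second
 * pass emits MOV dst.yzw, 1.5f, i.e. two MOVs instead of four.
 */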
2362 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2363 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2364 if (ir->value.b[i] == ir->value.b[j])
2365 dst->writemask |= (1 << j);
2366 } else {
2367 /* u, i, and f storage all line up, so no need for a
2368 * switch case for comparing each type.
2369 */
2370 if (ir->value.u[i] == ir->value.u[j])
2371 dst->writemask |= (1 << j);
2372 }
2373 }
2374
2375 switch (ir->type->base_type) {
2376 case GLSL_TYPE_FLOAT:
2377 emit(MOV(*dst, src_reg(ir->value.f[i])));
2378 break;
2379 case GLSL_TYPE_INT:
2380 emit(MOV(*dst, src_reg(ir->value.i[i])));
2381 break;
2382 case GLSL_TYPE_UINT:
2383 emit(MOV(*dst, src_reg(ir->value.u[i])));
2384 break;
2385 case GLSL_TYPE_BOOL:
2386 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2387 break;
2388 default:
2389 unreachable("Non-float/uint/int/bool constant");
2390 }
2391
2392 remaining_writemask &= ~dst->writemask;
2393 }
2394 dst->reg_offset++;
2395 }
2396
2397 void
2398 vec4_visitor::visit(ir_constant *ir)
2399 {
2400 dst_reg dst = dst_reg(this, ir->type);
2401 this->result = src_reg(dst);
2402
2403 emit_constant_values(&dst, ir);
2404 }
2405
2406 void
2407 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2408 {
2409 ir_dereference *deref = static_cast<ir_dereference *>(
2410 ir->actual_parameters.get_head());
2411 ir_variable *location = deref->variable_referenced();
2412 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2413 location->data.binding);
2414
2415 /* Calculate the surface offset */
2416 src_reg offset(this, glsl_type::uint_type);
2417 ir_dereference_array *deref_array = deref->as_dereference_array();
2418 if (deref_array) {
2419 deref_array->array_index->accept(this);
2420
2421 src_reg tmp(this, glsl_type::uint_type);
2422 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2423 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2424 } else {
2425 offset = location->data.atomic.offset;
2426 }
2427
2428 /* Emit the appropriate machine instruction */
2429 const char *callee = ir->callee->function_name();
2430 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2431
2432 if (!strcmp("__intrinsic_atomic_read", callee)) {
2433 emit_untyped_surface_read(surf_index, dst, offset);
2434
2435 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2436 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2437 src_reg(), src_reg());
2438
2439 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2440 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2441 src_reg(), src_reg());
2442 }
2443 }
2444
2445 void
2446 vec4_visitor::visit(ir_call *ir)
2447 {
2448 const char *callee = ir->callee->function_name();
2449
2450 if (!strcmp("__intrinsic_atomic_read", callee) ||
2451 !strcmp("__intrinsic_atomic_increment", callee) ||
2452 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2453 visit_atomic_counter_intrinsic(ir);
2454 } else {
2455 unreachable("Unsupported intrinsic.");
2456 }
2457 }
2458
2459 src_reg
2460 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2461 {
2462 vec4_instruction *inst =
2463 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2464 dst_reg(this, glsl_type::uvec4_type));
2465 inst->base_mrf = 2;
2466 inst->src[1] = sampler;
2467
2468 int param_base;
2469
2470 if (devinfo->gen >= 9) {
2471 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2472 vec4_instruction *header_inst = new(mem_ctx)
2473 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2474 dst_reg(MRF, inst->base_mrf));
2475
2476 emit(header_inst);
2477
2478 inst->mlen = 2;
2479 inst->header_size = 1;
2480 param_base = inst->base_mrf + 1;
2481 } else {
2482 inst->mlen = 1;
2483 param_base = inst->base_mrf;
2484 }
2485
2486 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2487 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2488 int zero_mask = 0xf & ~coord_mask;
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2491 coordinate));
2492
2493 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2494 src_reg(0)));
2495
2496 emit(inst);
2497 return src_reg(inst->dst);
2498 }
2499
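/* A "high" sampler is one whose index might not fit in the 4-bit sampler
 * field of the message descriptor, either because it is >= 16 or because
 * it is computed at run time. Such samplers need the message header so
 * the sampler state pointer can be offset, and that path only exists on
 * Haswell and later.
 */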
2500 static bool
2501 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2502 {
2503 if (devinfo->gen < 8 && !devinfo->is_haswell)
2504 return false;
2505
2506 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2507 }
2508
2509 void
2510 vec4_visitor::visit(ir_texture *ir)
2511 {
2512 uint32_t sampler =
2513 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2514
2515 ir_rvalue *nonconst_sampler_index =
2516 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2517
2518 /* Handle non-constant sampler array indexing */
2519 src_reg sampler_reg;
2520 if (nonconst_sampler_index) {
2521 /* The highest sampler which may be used by this operation is
2522 * the last element of the array. Mark it here, because the generator
2523 * doesn't have enough information to determine the bound.
2524 */
2525 uint32_t array_size = ir->sampler->as_dereference_array()
2526 ->array->type->array_size();
2527
2528 uint32_t max_used = sampler + array_size - 1;
2529 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2530 max_used += prog_data->base.binding_table.gather_texture_start;
2531 } else {
2532 max_used += prog_data->base.binding_table.texture_start;
2533 }
2534
2535 brw_mark_surface_used(&prog_data->base, max_used);
2536
2537 /* Emit code to evaluate the actual indexing expression */
2538 nonconst_sampler_index->accept(this);
2539 dst_reg temp(this, glsl_type::uint_type);
2540 emit(ADD(temp, this->result, src_reg(sampler)));
2541 emit_uniformize(temp, src_reg(temp));
2542
2543 sampler_reg = src_reg(temp);
2544 } else {
2545 /* Single sampler, or constant array index; the indexing expression
2546 * is just an immediate.
2547 */
2548 sampler_reg = src_reg(sampler);
2549 }
2550
2551 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2552 * emitting anything other than setting up the constant result.
2553 */
2554 if (ir->op == ir_tg4) {
2555 ir_constant *chan = ir->lod_info.component->as_constant();
2556 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2557 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2558 dst_reg result(this, ir->type);
2559 this->result = src_reg(result);
2560 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2561 return;
2562 }
2563 }
2564
2565 /* Should be lowered by do_lower_texture_projection */
2566 assert(!ir->projector);
2567
2568 /* Should be lowered */
2569 assert(!ir->offset || !ir->offset->type->is_array());
2570
2571 /* Generate code to compute all the subexpression trees. This has to be
2572 * done before loading any values into MRFs for the sampler message since
2573 * generating these values may involve SEND messages that need the MRFs.
2574 */
2575 src_reg coordinate;
2576 if (ir->coordinate) {
2577 ir->coordinate->accept(this);
2578 coordinate = this->result;
2579 }
2580
2581 src_reg shadow_comparitor;
2582 if (ir->shadow_comparitor) {
2583 ir->shadow_comparitor->accept(this);
2584 shadow_comparitor = this->result;
2585 }
2586
2587 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2588 src_reg offset_value;
2589 if (has_nonconstant_offset) {
2590 ir->offset->accept(this);
2591 offset_value = src_reg(this->result);
2592 }
2593
2594 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2595 src_reg lod, dPdx, dPdy, sample_index, mcs;
2596 switch (ir->op) {
2597 case ir_tex:
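/* Vertex shaders have no implicit derivatives, so a plain texture()
 * sample is implemented as an explicit-LOD sample (TXL) with lod = 0.
 */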
2598 lod = src_reg(0.0f);
2599 lod_type = glsl_type::float_type;
2600 break;
2601 case ir_txf:
2602 case ir_txl:
2603 case ir_txs:
2604 ir->lod_info.lod->accept(this);
2605 lod = this->result;
2606 lod_type = ir->lod_info.lod->type;
2607 break;
2608 case ir_query_levels:
2609 lod = src_reg(0);
2610 lod_type = glsl_type::int_type;
2611 break;
2612 case ir_txf_ms:
2613 ir->lod_info.sample_index->accept(this);
2614 sample_index = this->result;
2615 sample_index_type = ir->lod_info.sample_index->type;
2616
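/* For compressed multisample surfaces the sampler needs the MCS
 * (multisample control surface) value for the texel, so fetch it first
 * and pass it along with the texel fetch below.
 */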
2617 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2618 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2619 else
2620 mcs = src_reg(0u);
2621 break;
2622 case ir_txd:
2623 ir->lod_info.grad.dPdx->accept(this);
2624 dPdx = this->result;
2625
2626 ir->lod_info.grad.dPdy->accept(this);
2627 dPdy = this->result;
2628
2629 lod_type = ir->lod_info.grad.dPdx->type;
2630 break;
2631 case ir_txb:
2632 case ir_lod:
2633 case ir_tg4:
2634 break;
2635 }
2636
2637 enum opcode opcode;
2638 switch (ir->op) {
2639 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2640 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2641 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2642 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2643 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2644 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2645 case ir_tg4: opcode = has_nonconstant_offset
2646 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2647 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2648 case ir_txb:
2649 unreachable("TXB is not valid for vertex shaders.");
2650 case ir_lod:
2651 unreachable("LOD is not valid for vertex shaders.");
2652 default:
2653 unreachable("Unrecognized tex op");
2654 }
2655
2656 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2657 opcode, dst_reg(this, ir->type));
2658
2659 if (ir->offset != NULL && !has_nonconstant_offset) {
2660 inst->offset =
2661 brw_texture_offset(ir->offset->as_constant()->value.i,
2662 ir->offset->type->vector_elements);
2663 }
2664
2665 /* Stuff the channel select bits in the top of the texture offset */
2666 if (ir->op == ir_tg4)
2667 inst->offset |= gather_channel(ir, sampler) << 16;
2668
2669 /* The message header is necessary for:
2670 * - Gen4 (always)
2671 * - Gen9+ for selecting SIMD4x2
2672 * - Texel offsets
2673 * - Gather channel selection
2674 * - Sampler indices too large to fit in a 4-bit value.
2675 */
2676 inst->header_size =
2677 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2678 inst->offset != 0 || ir->op == ir_tg4 ||
2679 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2680 inst->base_mrf = 2;
2681 inst->mlen = inst->header_size + 1; /* always at least one */
2682 inst->dst.writemask = WRITEMASK_XYZW;
2683 inst->shadow_compare = ir->shadow_comparitor != NULL;
2684
2685 inst->src[1] = sampler_reg;
2686
2687 /* MRF for the first parameter */
2688 int param_base = inst->base_mrf + inst->header_size;
2689
2690 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2691 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2692 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2693 } else {
2694 /* Load the coordinate */
2695 /* FINISHME: gl_clamp_mask and saturate */
2696 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2697 int zero_mask = 0xf & ~coord_mask;
2698
2699 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2700 coordinate));
2701
2702 if (zero_mask != 0) {
2703 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2704 src_reg(0)));
2705 }
2706 /* Load the shadow comparitor */
2707 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2708 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2709 WRITEMASK_X),
2710 shadow_comparitor));
2711 inst->mlen++;
2712 }
2713
2714 /* Load the LOD info */
2715 if (ir->op == ir_tex || ir->op == ir_txl) {
2716 int mrf, writemask;
2717 if (devinfo->gen >= 5) {
2718 mrf = param_base + 1;
2719 if (ir->shadow_comparitor) {
2720 writemask = WRITEMASK_Y;
2721 /* mlen already incremented */
2722 } else {
2723 writemask = WRITEMASK_X;
2724 inst->mlen++;
2725 }
2726 } else /* devinfo->gen == 4 */ {
2727 mrf = param_base;
2728 writemask = WRITEMASK_W;
2729 }
2730 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2731 } else if (ir->op == ir_txf) {
2732 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2733 } else if (ir->op == ir_txf_ms) {
2734 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2735 sample_index));
2736 if (devinfo->gen >= 7) {
2737 /* MCS data is in the first channel of `mcs`, but we need to get it into
2738 * the .y channel of the second vec4 of params, so replicate .x across
2739 * the whole vec4 and then mask off everything except .y
2740 */
2741 mcs.swizzle = BRW_SWIZZLE_XXXX;
2742 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2743 mcs));
2744 }
2745 inst->mlen++;
2746 } else if (ir->op == ir_txd) {
2747 const glsl_type *type = lod_type;
2748
2749 if (devinfo->gen >= 5) {
2750 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2751 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2752 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2753 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2754 inst->mlen++;
2755
2756 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2757 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2758 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2759 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2760 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2761 inst->mlen++;
2762
2763 if (ir->shadow_comparitor) {
2764 emit(MOV(dst_reg(MRF, param_base + 2,
2765 ir->shadow_comparitor->type, WRITEMASK_Z),
2766 shadow_comparitor));
2767 }
2768 }
2769 } else /* devinfo->gen == 4 */ {
2770 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2771 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2772 inst->mlen += 2;
2773 }
2774 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2775 if (ir->shadow_comparitor) {
2776 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2777 shadow_comparitor));
2778 }
2779
2780 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2781 offset_value));
2782 inst->mlen++;
2783 }
2784 }
2785
2786 emit(inst);
2787
2788 /* Fix up the number of layers (.z) for cube map arrays: the hardware
2789 * returns faces * layers, but the spec requires just layers.
2790 */
2791 if (ir->op == ir_txs) {
2792 glsl_type const *type = ir->sampler->type;
2793 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2794 type->sampler_array) {
2795 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2796 writemask(inst->dst, WRITEMASK_Z),
2797 src_reg(inst->dst), src_reg(6));
2798 }
2799 }
2800
2801 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2802 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2803 }
2804
2805 swizzle_result(ir, src_reg(inst->dst), sampler);
2806 }
2807
2808 /**
2809 * Apply workarounds for Gen6 gather with UINT/SINT
2810 */
2811 void
2812 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2813 {
2814 if (!wa)
2815 return;
2816
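/* Gen6 gather4 returns UNORM-encoded data for the affected 8- and 16-bit
 * integer formats, so rescale by (2^width - 1) to recover the integer
 * value and, for signed formats, shift left then arithmetic-shift right
 * to sign-extend it.
 */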
2817 int width = (wa & WA_8BIT) ? 8 : 16;
2818 dst_reg dst_f = dst;
2819 dst_f.type = BRW_REGISTER_TYPE_F;
2820
2821 /* Convert from UNORM to UINT */
2822 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2823 emit(MOV(dst, src_reg(dst_f)));
2824
2825 if (wa & WA_SIGN) {
2826 /* Reinterpret the UINT value as a signed INT value by
2827 * shifting the sign bit into place, then shifting back
2828 * preserving sign.
2829 */
2830 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2831 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2832 }
2833 }
2834
2835 /**
2836 * Set up the gather channel based on the swizzle, for gather4.
2837 */
2838 uint32_t
2839 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2840 {
2841 ir_constant *chan = ir->lod_info.component->as_constant();
2842 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2843 switch (swiz) {
2844 case SWIZZLE_X: return 0;
2845 case SWIZZLE_Y:
2846 /* gather4 sampler is broken for green channel on RG32F --
2847 * we must ask for blue instead.
2848 */
2849 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2850 return 2;
2851 return 1;
2852 case SWIZZLE_Z: return 2;
2853 case SWIZZLE_W: return 3;
2854 default:
2855 unreachable("Not reached"); /* zero, one swizzles handled already */
2856 }
2857 }
2858
2859 void
2860 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2861 {
2862 int s = key->tex.swizzles[sampler];
2863
2864 this->result = src_reg(this, ir->type);
2865 dst_reg swizzled_result(this->result);
2866
2867 if (ir->op == ir_query_levels) {
2868 /* # levels is in .w */
2869 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2870 emit(MOV(swizzled_result, orig_val));
2871 return;
2872 }
2873
2874 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2875 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2876 emit(MOV(swizzled_result, orig_val));
2877 return;
2878 }
2879
2880
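/* Split the application-specified texture swizzle into channels copied
 * from the sampler result, channels forced to zero and channels forced
 * to one, then emit at most one MOV for each group.
 */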
2881 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2882 int swizzle[4] = {0};
2883
2884 for (int i = 0; i < 4; i++) {
2885 switch (GET_SWZ(s, i)) {
2886 case SWIZZLE_ZERO:
2887 zero_mask |= (1 << i);
2888 break;
2889 case SWIZZLE_ONE:
2890 one_mask |= (1 << i);
2891 break;
2892 default:
2893 copy_mask |= (1 << i);
2894 swizzle[i] = GET_SWZ(s, i);
2895 break;
2896 }
2897 }
2898
2899 if (copy_mask) {
2900 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2901 swizzled_result.writemask = copy_mask;
2902 emit(MOV(swizzled_result, orig_val));
2903 }
2904
2905 if (zero_mask) {
2906 swizzled_result.writemask = zero_mask;
2907 emit(MOV(swizzled_result, src_reg(0.0f)));
2908 }
2909
2910 if (one_mask) {
2911 swizzled_result.writemask = one_mask;
2912 emit(MOV(swizzled_result, src_reg(1.0f)));
2913 }
2914 }
2915
2916 void
2917 vec4_visitor::visit(ir_return *)
2918 {
2919 unreachable("not reached");
2920 }
2921
2922 void
2923 vec4_visitor::visit(ir_discard *)
2924 {
2925 unreachable("not reached");
2926 }
2927
2928 void
2929 vec4_visitor::visit(ir_if *ir)
2930 {
2931 /* Don't point the annotation at the if statement; otherwise it would
2932 * cover the if plus the then and else blocks when printed.
2933 */
2934 this->base_ir = ir->condition;
2935
2936 if (devinfo->gen == 6) {
2937 emit_if_gen6(ir);
2938 } else {
2939 enum brw_predicate predicate;
2940 emit_bool_to_cond_code(ir->condition, &predicate);
2941 emit(IF(predicate));
2942 }
2943
2944 visit_instructions(&ir->then_instructions);
2945
2946 if (!ir->else_instructions.is_empty()) {
2947 this->base_ir = ir->condition;
2948 emit(BRW_OPCODE_ELSE);
2949
2950 visit_instructions(&ir->else_instructions);
2951 }
2952
2953 this->base_ir = ir->condition;
2954 emit(BRW_OPCODE_ENDIF);
2955 }
2956
2957 void
2958 vec4_visitor::visit(ir_emit_vertex *)
2959 {
2960 unreachable("not reached");
2961 }
2962
2963 void
2964 vec4_visitor::visit(ir_end_primitive *)
2965 {
2966 unreachable("not reached");
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_barrier *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2977 dst_reg dst, src_reg offset,
2978 src_reg src0, src_reg src1)
2979 {
2980 unsigned mlen = 0;
2981
2982 /* Set the atomic operation offset. */
2983 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2984 mlen++;
2985
2986 /* Set the atomic operation arguments. */
2987 if (src0.file != BAD_FILE) {
2988 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2989 mlen++;
2990 }
2991
2992 if (src1.file != BAD_FILE) {
2993 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2994 mlen++;
2995 }
2996
2997 /* Emit the instruction. Note that this maps to the normal SIMD8
2998 * untyped atomic message on Ivy Bridge, but that's OK because
2999 * unused channels will be masked out.
3000 */
3001 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3002 brw_message_reg(0),
3003 src_reg(surf_index), src_reg(atomic_op));
3004 inst->mlen = mlen;
3005 }
3006
3007 void
3008 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3009 src_reg offset)
3010 {
3011 /* Set the surface read offset. */
3012 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3013
3014 /* Emit the instruction. Note that this maps to the normal SIMD8
3015 * untyped surface read message, but that's OK because unused
3016 * channels will be masked out.
3017 */
3018 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3019 brw_message_reg(0),
3020 src_reg(surf_index), src_reg(1));
3021 inst->mlen = 1;
3022 }
3023
3024 void
3025 vec4_visitor::emit_ndc_computation()
3026 {
3027 /* Get the position */
3028 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3029
3030 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3031 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3032 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3033
3034 current_annotation = "NDC";
3035 dst_reg ndc_w = ndc;
3036 ndc_w.writemask = WRITEMASK_W;
3037 src_reg pos_w = pos;
3038 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3039 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3040
3041 dst_reg ndc_xyz = ndc;
3042 ndc_xyz.writemask = WRITEMASK_XYZ;
3043
3044 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3045 }
3046
3047 void
3048 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3049 {
3050 if (devinfo->gen < 6 &&
3051 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3052 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3053 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3054 dst_reg header1_w = header1;
3055 header1_w.writemask = WRITEMASK_W;
3056
3057 emit(MOV(header1, 0u));
3058
3059 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3060 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3061
3062 current_annotation = "Point size";
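/* Pack the point size into the header's fixed-point Point Width field:
 * multiplying by 2^11 and keeping bits 18:8 leaves size * 8, i.e. a
 * value with 3 fractional bits, which appears to match the U8.3 encoding
 * the fixed-function units expect here.
 */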
3063 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3064 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3065 }
3066
3067 if (key->userclip_active) {
3068 current_annotation = "Clipping flags";
3069 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3070 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3071
3072 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3073 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3074 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3075
3076 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3077 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3078 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3079 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3080 }
3081
3082 /* i965 clipping workaround:
3083 * 1) Test for -ve rhw
3084 * 2) If set,
3085 * set ndc = (0,0,0,0)
3086 * set ucp[6] = 1
3087 *
3088 * Later, clipping will detect ucp[6] and ensure the primitive is
3089 * clipped against all fixed planes.
3090 */
3091 if (devinfo->has_negative_rhw_bug) {
3092 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3093 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3094 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3095 vec4_instruction *inst;
3096 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3097 inst->predicate = BRW_PREDICATE_NORMAL;
3098 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3099 inst->predicate = BRW_PREDICATE_NORMAL;
3100 }
3101
3102 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3103 } else if (devinfo->gen < 6) {
3104 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3105 } else {
3106 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3107 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3108 dst_reg reg_w = reg;
3109 reg_w.writemask = WRITEMASK_W;
3110 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3111 }
3112 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3113 dst_reg reg_y = reg;
3114 reg_y.writemask = WRITEMASK_Y;
3115 reg_y.type = BRW_REGISTER_TYPE_D;
3116 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3117 }
3118 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3119 dst_reg reg_z = reg;
3120 reg_z.writemask = WRITEMASK_Z;
3121 reg_z.type = BRW_REGISTER_TYPE_D;
3122 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3123 }
3124 }
3125 }
3126
3127 void
3128 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3129 {
3130 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3131 *
3132 * "If a linked set of shaders forming the vertex stage contains no
3133 * static write to gl_ClipVertex or gl_ClipDistance, but the
3134 * application has requested clipping against user clip planes through
3135 * the API, then the coordinate written to gl_Position is used for
3136 * comparison against the user clip planes."
3137 *
3138 * This function is only called if the shader didn't write to
3139 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3140 * if the user wrote to it; otherwise we use gl_Position.
3141 */
3142 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3143 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3144 clip_vertex = VARYING_SLOT_POS;
3145 }
3146
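/* Each clip distance is the dot product of the clip vertex (or position)
 * with the corresponding user clip plane; this call fills up to one vec4
 * worth of distances starting at plane 'offset'.
 */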
3147 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3148 ++i) {
3149 reg.writemask = 1 << i;
3150 emit(DP4(reg,
3151 src_reg(output_reg[clip_vertex]),
3152 src_reg(this->userplane[i + offset])));
3153 }
3154 }
3155
3156 vec4_instruction *
3157 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3158 {
3159 assert (varying < VARYING_SLOT_MAX);
3160 reg.type = output_reg[varying].type;
3161 current_annotation = output_reg_annotation[varying];
3162 /* Copy the register, saturating if necessary */
3163 return emit(MOV(reg, src_reg(output_reg[varying])));
3164 }
3165
3166 void
3167 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3168 {
3169 reg.type = BRW_REGISTER_TYPE_F;
3170
3171 switch (varying) {
3172 case VARYING_SLOT_PSIZ:
3173 {
3174 /* PSIZ is always in slot 0, and is coupled with other flags. */
3175 current_annotation = "indices, point width, clip flags";
3176 emit_psiz_and_flags(reg);
3177 break;
3178 }
3179 case BRW_VARYING_SLOT_NDC:
3180 current_annotation = "NDC";
3181 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3182 break;
3183 case VARYING_SLOT_POS:
3184 current_annotation = "gl_Position";
3185 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3186 break;
3187 case VARYING_SLOT_EDGE:
3188 /* This is present when doing unfilled polygons. We're supposed to copy
3189 * the edge flag from the user-provided vertex array
3190 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3191 * of that attribute (starts as 1.0f). This is then used in clipping to
3192 * determine which edges should be drawn as wireframe.
3193 */
3194 current_annotation = "edge flag";
3195 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3196 glsl_type::float_type, WRITEMASK_XYZW))));
3197 break;
3198 case BRW_VARYING_SLOT_PAD:
3199 /* No need to write to this slot */
3200 break;
3201 case VARYING_SLOT_COL0:
3202 case VARYING_SLOT_COL1:
3203 case VARYING_SLOT_BFC0:
3204 case VARYING_SLOT_BFC1: {
3205 /* These built-in varyings are only supported in compatibility mode,
3206 * and we only support GS in core profile. So, this must be a vertex
3207 * shader.
3208 */
3209 assert(stage == MESA_SHADER_VERTEX);
3210 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3211 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3212 inst->saturate = true;
3213 break;
3214 }
3215
3216 default:
3217 emit_generic_urb_slot(reg, varying);
3218 break;
3219 }
3220 }
3221
3222 static int
3223 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3224 {
3225 if (devinfo->gen >= 6) {
3226 /* URB data written (does not include the message header reg) must
3227 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3228 * section 5.4.3.2.2: URB_INTERLEAVED.
3229 *
3230 * URB entries are allocated on a multiple of 1024 bits, so an
3231 * extra 128 bits written here to make the end align to 256 is
3232 * no problem.
3233 */
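/* mlen here counts the message header register as well, so rounding the
 * total up to an odd value keeps the URB data portion an even number of
 * registers.
 */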
3234 if ((mlen % 2) != 1)
3235 mlen++;
3236 }
3237
3238 return mlen;
3239 }
3240
3241
3242 /**
3243 * Generates the VUE payload plus the necessary URB write instructions to
3244 * output it.
3245 *
3246 * The VUE layout is documented in Volume 2a.
3247 */
3248 void
3249 vec4_visitor::emit_vertex()
3250 {
3251 /* MRF 0 is reserved for the debugger, so start with message header
3252 * in MRF 1.
3253 */
3254 int base_mrf = 1;
3255 int mrf = base_mrf;
3256 /* In the process of generating our URB write message contents, we
3257 * may need to unspill a register or load from an array. Those
3258 * reads would use MRFs 14-15.
3259 */
3260 int max_usable_mrf = 13;
3261
3262 /* The following assertion verifies that max_usable_mrf causes an
3263 * even-numbered amount of URB write data, which will meet gen6's
3264 * requirements for length alignment.
3265 */
3266 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3267
3268 /* First mrf is the g0-based message header containing URB handles and
3269 * such.
3270 */
3271 emit_urb_write_header(mrf++);
3272
3273 if (devinfo->gen < 6) {
3274 emit_ndc_computation();
3275 }
3276
3277 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3278 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3279 current_annotation = "user clip distances";
3280
3281 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3282 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3283
3284 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3285 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3286 }
3287
3288 /* We may need to split this up into several URB writes, so do them in a
3289 * loop.
3290 */
3291 int slot = 0;
3292 bool complete = false;
3293 do {
3294 /* URB offset is in URB row increments, and each of our MRFs is half of
3295 * one of those, since we're doing interleaved writes.
3296 */
3297 int offset = slot / 2;
3298
3299 mrf = base_mrf + 1;
3300 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3301 emit_urb_slot(dst_reg(MRF, mrf++),
3302 prog_data->vue_map.slot_to_varying[slot]);
3303
3304 /* If this was max_usable_mrf, we can't fit anything more into this
3305 * URB WRITE.
3306 */
3307 if (mrf > max_usable_mrf) {
3308 slot++;
3309 break;
3310 }
3311 }
3312
3313 complete = slot >= prog_data->vue_map.num_slots;
3314 current_annotation = "URB write";
3315 vec4_instruction *inst = emit_urb_write_opcode(complete);
3316 inst->base_mrf = base_mrf;
3317 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3318 inst->offset += offset;
3319 } while(!complete);
3320 }
3321
3322
3323 src_reg
3324 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3325 src_reg *reladdr, int reg_offset)
3326 {
3327 /* Because we store the values to scratch interleaved like our
3328 * vertex data, we need to scale the vec4 index by 2.
3329 */
3330 int message_header_scale = 2;
3331
3332 /* Pre-gen6, the message header uses byte offsets instead of vec4
3333 * (16-byte) offset units.
3334 */
3335 if (devinfo->gen < 6)
3336 message_header_scale *= 16;
3337
3338 if (reladdr) {
3339 src_reg index = src_reg(this, glsl_type::int_type);
3340
3341 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3342 src_reg(reg_offset)));
3343 emit_before(block, inst, MUL(dst_reg(index), index,
3344 src_reg(message_header_scale)));
3345
3346 return index;
3347 } else {
3348 return src_reg(reg_offset * message_header_scale);
3349 }
3350 }
3351
3352 src_reg
3353 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3354 src_reg *reladdr, int reg_offset)
3355 {
3356 if (reladdr) {
3357 src_reg index = src_reg(this, glsl_type::int_type);
3358
3359 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3360 src_reg(reg_offset)));
3361
3362 /* Pre-gen6, the message header uses byte offsets instead of vec4
3363 * (16-byte) offset units.
3364 */
3365 if (devinfo->gen < 6) {
3366 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3367 }
3368
3369 return index;
3370 } else if (devinfo->gen >= 8) {
3371 /* Store the offset in a GRF so we can send-from-GRF. */
3372 src_reg offset = src_reg(this, glsl_type::int_type);
3373 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3374 return offset;
3375 } else {
3376 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3377 return src_reg(reg_offset * message_header_scale);
3378 }
3379 }
3380
3381 /**
3382 * Emits an instruction before @inst to load the value named by @orig_src
3383 * from scratch space at @base_offset to @temp.
3384 *
3385 * @base_offset is measured in 32-byte units (the size of a register).
3386 */
3387 void
3388 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3389 dst_reg temp, src_reg orig_src,
3390 int base_offset)
3391 {
3392 int reg_offset = base_offset + orig_src.reg_offset;
3393 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3394 reg_offset);
3395
3396 emit_before(block, inst, SCRATCH_READ(temp, index));
3397 }
3398
3399 /**
3400 * Emits an instruction after @inst to store the value to be written
3401 * to @orig_dst to scratch space at @base_offset, from @temp.
3402 *
3403 * @base_offset is measured in 32-byte units (the size of a register).
3404 */
3405 void
3406 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3407 int base_offset)
3408 {
3409 int reg_offset = base_offset + inst->dst.reg_offset;
3410 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3411 reg_offset);
3412
3413 /* Create a temporary register to store *inst's result in.
3414 *
3415 * We have to be careful in MOVing from our temporary result register in
3416 * the scratch write. If we swizzle from channels of the temporary that
3417 * weren't initialized, it will confuse live interval analysis, which will
3418 * make spilling fail to make progress.
3419 */
3420 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3421 inst->dst.type),
3422 brw_swizzle_for_mask(inst->dst.writemask));
3423 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3424 inst->dst.writemask));
3425 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3426 write->predicate = inst->predicate;
3427 write->ir = inst->ir;
3428 write->annotation = inst->annotation;
3429 inst->insert_after(block, write);
3430
3431 inst->dst.file = temp.file;
3432 inst->dst.reg = temp.reg;
3433 inst->dst.reg_offset = temp.reg_offset;
3434 inst->dst.reladdr = NULL;
3435 }
3436
3437 /**
3438 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3439 * adds the scratch read(s) before \p inst. The function also checks for
3440 * recursive reladdr scratch accesses, issuing the corresponding scratch
3441 * loads and rewriting reladdr references accordingly.
3442 *
3443 * \return \p src if it did not require a scratch load, otherwise, the
3444 * register holding the result of the scratch load that the caller should
3445 * use to rewrite src.
3446 */
3447 src_reg
3448 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3449 vec4_instruction *inst, src_reg src)
3450 {
3451 /* Resolve recursive reladdr scratch access by calling ourselves
3452 * with src.reladdr
3453 */
3454 if (src.reladdr)
3455 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3456 *src.reladdr);
3457
3458 /* Now handle scratch access on src */
3459 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3460 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3461 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3462 src.reg = temp.reg;
3463 src.reg_offset = temp.reg_offset;
3464 src.reladdr = NULL;
3465 }
3466
3467 return src;
3468 }
3469
3470 /**
3471 * We can't generally support array access in GRF space, because a
3472 * single instruction's destination can only span 2 contiguous
3473 * registers. So, we send all GRF arrays that get variable index
3474 * access to scratch space.
3475 */
3476 void
3477 vec4_visitor::move_grf_array_access_to_scratch()
3478 {
3479 int scratch_loc[this->alloc.count];
3480 memset(scratch_loc, -1, sizeof(scratch_loc));
3481
3482 /* First, calculate the set of virtual GRFs that need to be punted
3483 * to scratch due to having any array access on them, and where in
3484 * scratch.
3485 */
3486 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3487 if (inst->dst.file == GRF && inst->dst.reladdr) {
3488 if (scratch_loc[inst->dst.reg] == -1) {
3489 scratch_loc[inst->dst.reg] = c->last_scratch;
3490 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3491 }
3492
3493 for (src_reg *iter = inst->dst.reladdr;
3494 iter->reladdr;
3495 iter = iter->reladdr) {
3496 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3497 scratch_loc[iter->reg] = c->last_scratch;
3498 c->last_scratch += this->alloc.sizes[iter->reg];
3499 }
3500 }
3501 }
3502
3503 for (int i = 0 ; i < 3; i++) {
3504 for (src_reg *iter = &inst->src[i];
3505 iter->reladdr;
3506 iter = iter->reladdr) {
3507 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3508 scratch_loc[iter->reg] = c->last_scratch;
3509 c->last_scratch += this->alloc.sizes[iter->reg];
3510 }
3511 }
3512 }
3513 }
3514
3515 /* Now, for anything that will be accessed through scratch, rewrite
3516 * it to load/store. Note that this is a _safe list walk, because
3517 * we may generate a new scratch_write instruction after the one
3518 * we're processing.
3519 */
3520 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3521 /* Set up the annotation tracking for new generated instructions. */
3522 base_ir = inst->ir;
3523 current_annotation = inst->annotation;
3524
3525 /* First handle scratch access on the dst. Notice we have to handle
3526 * the case where the dst's reladdr also points to scratch space.
3527 */
3528 if (inst->dst.reladdr)
3529 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3530 *inst->dst.reladdr);
3531
3532 /* Now that we have handled any (possibly recursive) reladdr scratch
3533 * accesses for dst we can safely do the scratch write for dst itself
3534 */
3535 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3536 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3537
3538 /* Now handle scratch access on any src. In this case, since inst->src[i]
3539 * already is a src_reg, we can just call emit_resolve_reladdr with
3540 * inst->src[i] and it will take care of handling scratch loads for
3541 * both src and src.reladdr (recursively).
3542 */
3543 for (int i = 0 ; i < 3; i++) {
3544 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3545 inst->src[i]);
3546 }
3547 }
3548 }
3549
3550 /**
3551 * Emits an instruction before @inst to load the value named by @orig_src
3552 * from the pull constant buffer (surface) at @base_offset to @temp.
3553 */
3554 void
3555 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3556 dst_reg temp, src_reg orig_src,
3557 int base_offset)
3558 {
3559 int reg_offset = base_offset + orig_src.reg_offset;
3560 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3561 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3562 reg_offset);
3563
3564 emit_pull_constant_load_reg(temp,
3565 index,
3566 offset,
3567 block, inst);
3568 }
3569
3570 /**
3571 * Implements array access of uniforms by inserting a
3572 * PULL_CONSTANT_LOAD instruction.
3573 *
3574 * Unlike temporary GRF array access (where we don't support it due to
3575 * the difficulty of doing relative addressing on instruction
3576 * destinations), we could potentially do array access of uniforms
3577 * that were loaded in GRF space as push constants. In real-world
3578 * usage we've seen, though, the arrays being used are always larger
3579 * than we could load as push constants, so just always move all
3580 * uniform array access out to a pull constant buffer.
3581 */
3582 void
3583 vec4_visitor::move_uniform_array_access_to_pull_constants()
3584 {
3585 int pull_constant_loc[this->uniforms];
3586 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3587 bool nested_reladdr;
3588
3589 /* Walk through and find array access of uniforms. Put a copy of that
3590 * uniform in the pull constant buffer.
3591 *
3592 * Note that we don't move constant-indexed accesses to arrays. No
3593 * testing has been done of the performance impact of this choice.
3594 */
3595 do {
3596 nested_reladdr = false;
3597
3598 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3599 for (int i = 0 ; i < 3; i++) {
3600 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3601 continue;
3602
3603 int uniform = inst->src[i].reg;
3604
3605 if (inst->src[i].reladdr->reladdr)
3606 nested_reladdr = true; /* will need another pass */
3607
3608 /* If this array isn't already present in the pull constant buffer,
3609 * add it.
3610 */
3611 if (pull_constant_loc[uniform] == -1) {
3612 const gl_constant_value **values =
3613 &stage_prog_data->param[uniform * 4];
3614
3615 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3616
3617 assert(uniform < uniform_array_size);
3618 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3619 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3620 = values[j];
3621 }
3622 }
3623
3624 /* Set up the annotation tracking for new generated instructions. */
3625 base_ir = inst->ir;
3626 current_annotation = inst->annotation;
3627
3628 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3629
3630 emit_pull_constant_load(block, inst, temp, inst->src[i],
3631 pull_constant_loc[uniform]);
3632
3633 inst->src[i].file = temp.file;
3634 inst->src[i].reg = temp.reg;
3635 inst->src[i].reg_offset = temp.reg_offset;
3636 inst->src[i].reladdr = NULL;
3637 }
3638 }
3639 } while (nested_reladdr);
3640
3641 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3642 * no need to track them as larger-than-vec4 objects. This will be
3643 * relied on in cutting out unused uniform vectors from push
3644 * constants.
3645 */
3646 split_uniform_registers();
3647 }
3648
3649 void
3650 vec4_visitor::resolve_ud_negate(src_reg *reg)
3651 {
3652 if (reg->type != BRW_REGISTER_TYPE_UD ||
3653 !reg->negate)
3654 return;
3655
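/* Materialize the negated value with an explicit MOV into a fresh
 * register so the reference no longer carries a negate source modifier
 * on a UD operand.
 */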
3656 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3657 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3658 *reg = temp;
3659 }
3660
3661 /**
3662 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3663 *
3664 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3665 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3666 */
3667 void
3668 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3669 {
3670 assert(devinfo->gen <= 5);
3671
3672 if (!rvalue->type->is_boolean())
3673 return;
3674
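/* AND with 1 isolates the defined LSB; negating that integer result then
 * turns 1 into -1 (all bits set), giving the canonical ~0 "true" value.
 */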
3675 src_reg and_result = src_reg(this, rvalue->type);
3676 src_reg neg_result = src_reg(this, rvalue->type);
3677 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3678 emit(MOV(dst_reg(neg_result), negate(and_result)));
3679 *reg = neg_result;
3680 }
3681
3682 vec4_visitor::vec4_visitor(struct brw_context *brw,
3683 struct brw_vec4_compile *c,
3684 struct gl_program *prog,
3685 const struct brw_vue_prog_key *key,
3686 struct brw_vue_prog_data *prog_data,
3687 struct gl_shader_program *shader_prog,
3688 gl_shader_stage stage,
3689 void *mem_ctx,
3690 bool no_spills,
3691 shader_time_shader_type st_type)
3692 : backend_shader(brw, mem_ctx, shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 st_type(st_type)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */