i965/vec4: Plumb log_data through so the backend_shader field gets set.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
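/* Each ALUn(op) invocation below expands to a small vec4_visitor helper,
 * e.g. ALU2(ADD) defines vec4_visitor::ADD(dst, src0, src1), which simply
 * allocates a new vec4_instruction for BRW_OPCODE_ADD out of mem_ctx and
 * returns it without emitting it; callers hand the result to emit().
 * ALU2_ACC additionally marks the instruction as writing the accumulator,
 * and ALU3 asserts gen >= 6 because 3-source instructions are gen6+ only.
 */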
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
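   /* Worked example (illustrative, not from the PRM): packing vec2(1.0, -2.0).
    * The F32TO16 below writes 0x3C00 (1.0h) to tmp.x and 0xC000 (-2.0h) to
    * tmp.y, with the upper words cleared as described above.  Shifting
    * tmp.yyyy left by 16 gives 0xC0000000 in dst, and the final OR with
    * tmp.xxxx yields 0xC0003C00, which is packHalf2x16(vec2(1.0, -2.0)).
    */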
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
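   /* Illustrative example: unpacking 0xC0003C00 (the value packed in the
    * emit_pack_half_2x16() example above).  The AND below leaves 0x3C00 in
    * tmp.x, the SHR leaves 0xC000 in tmp.y, and F16TO32 then produces
    * (1.0, -2.0) in dst.xy.
    */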
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
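   /* (In the VF restricted-float encoding used by the immediate below, the
    * bytes 0x00, 0x60, 0x70 and 0x78 stand for 0.0, 8.0, 16.0 and 24.0, so
    * the type-converting MOV leaves the shift counts <0, 8, 16, 24> in
    * `shift`.)
    */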
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
724 {
725 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
726 assert(this->uniforms < uniform_array_size);
727 this->uniform_vector_size[this->uniforms] = 4;
728 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
729 this->userplane[i].type = BRW_REGISTER_TYPE_F;
730 for (int j = 0; j < 4; ++j) {
731 stage_prog_data->param[this->uniforms * 4 + j] =
732 (gl_constant_value *) &clip_planes[i][j];
733 }
734 ++this->uniforms;
735 }
736 }
737
738 /* Our support for builtin uniforms is even scarier than non-builtin.
739 * It sits on top of the PROG_STATE_VAR parameters that are
740 * automatically updated from GL context state.
741 */
742 void
743 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
744 {
745 const ir_state_slot *const slots = ir->get_state_slots();
746 assert(slots != NULL);
747
748 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
749 /* This state reference has already been setup by ir_to_mesa,
750 * but we'll get the same index back here. We can reference
751 * ParameterValues directly, since unlike brw_fs.cpp, we never
752 * add new state references during compile.
753 */
754 int index = _mesa_add_state_reference(this->prog->Parameters,
755 (gl_state_index *)slots[i].tokens);
756 gl_constant_value *values =
757 &this->prog->Parameters->ParameterValues[index][0];
758
759 assert(this->uniforms < uniform_array_size);
760
761 for (unsigned j = 0; j < 4; j++)
762 stage_prog_data->param[this->uniforms * 4 + j] =
763 &values[GET_SWZ(slots[i].swizzle, j)];
764
765 this->uniform_vector_size[this->uniforms] =
766 (ir->type->is_scalar() || ir->type->is_vector() ||
767 ir->type->is_matrix() ? ir->type->vector_elements : 4);
768
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (devinfo->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (devinfo->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (devinfo->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (devinfo->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (devinfo->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (devinfo->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (devinfo->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (devinfo->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul;
1159 ir_expression *mul;
1160 bool mul_negate, mul_abs;
1161
1162 for (int i = 0; i < 2; i++) {
1163 mul_negate = false;
1164 mul_abs = false;
1165
1166 mul = ir->operands[i]->as_expression();
1167 nonmul = ir->operands[1 - i];
1168
1169 if (mul && mul->operation == ir_unop_abs) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_abs = true;
1172 } else if (mul && mul->operation == ir_unop_neg) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_negate = true;
1175 }
1176
1177 if (mul && mul->operation == ir_binop_mul)
1178 break;
1179 }
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183
1184 nonmul->accept(this);
1185 src_reg src0 = fix_3src_operand(this->result);
1186
1187 mul->operands[0]->accept(this);
1188 src_reg src1 = fix_3src_operand(this->result);
1189 src1.negate ^= mul_negate;
1190 src1.abs = mul_abs;
1191 if (mul_abs)
1192 src1.negate = false;
1193
1194 mul->operands[1]->accept(this);
1195 src_reg src2 = fix_3src_operand(this->result);
1196 src2.abs = mul_abs;
1197 if (mul_abs)
1198 src2.negate = false;
1199
1200 this->result = src_reg(this, ir->type);
1201 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1202
1203 return true;
1204 }
1205
1206 bool
1207 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1208 {
1209 /* This optimization relies on CMP setting the destination to 0 when
1210 * false. Early hardware only sets the least significant bit, and
1211 * leaves the other bits undefined. So we can't use it.
1212 */
1213 if (devinfo->gen < 6)
1214 return false;
1215
1216 ir_expression *const cmp = ir->operands[0]->as_expression();
1217
1218 if (cmp == NULL)
1219 return false;
1220
1221 switch (cmp->operation) {
1222 case ir_binop_less:
1223 case ir_binop_greater:
1224 case ir_binop_lequal:
1225 case ir_binop_gequal:
1226 case ir_binop_equal:
1227 case ir_binop_nequal:
1228 break;
1229
1230 default:
1231 return false;
1232 }
1233
1234 cmp->operands[0]->accept(this);
1235 const src_reg cmp_src0 = this->result;
1236
1237 cmp->operands[1]->accept(this);
1238 const src_reg cmp_src1 = this->result;
1239
1240 this->result = src_reg(this, ir->type);
1241
1242 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1243 brw_conditional_for_comparison(cmp->operation)));
1244
1245 /* If the comparison is false, this->result will just happen to be zero.
1246 */
1247 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1248 this->result, src_reg(1.0f));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 inst->predicate_inverse = true;
1251
1252 return true;
1253 }
1254
1255 void
1256 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1257 src_reg src0, src_reg src1)
1258 {
1259 vec4_instruction *inst;
1260
1261 if (devinfo->gen >= 6) {
1262 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1263 inst->conditional_mod = conditionalmod;
1264 } else {
1265 emit(CMP(dst, src0, src1, conditionalmod));
1266
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269 }
1270 }
1271
1272 void
1273 vec4_visitor::emit_lrp(const dst_reg &dst,
1274 const src_reg &x, const src_reg &y, const src_reg &a)
1275 {
1276 if (devinfo->gen >= 6) {
1277 /* Note that the instruction's argument order is reversed from GLSL
1278 * and the IR.
1279 */
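      /* (LRP dst, a, y, x computes a * y + (1 - a) * x, i.e. mix(x, y, a).) */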
1280 emit(LRP(dst,
1281 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1282 } else {
1283 /* Earlier generations don't support three source operations, so we
1284 * need to emit x*(1-a) + y*a.
1285 */
1286 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 y_times_a.writemask = dst.writemask;
1290 one_minus_a.writemask = dst.writemask;
1291 x_times_one_minus_a.writemask = dst.writemask;
1292
1293 emit(MUL(y_times_a, y, a));
1294 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1295 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1296 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1297 }
1298 }
1299
1300 /**
1301 * Emits the instructions needed to perform a pull constant load. before_block
1302 * and before_inst can be NULL in which case the instruction will be appended
1303 * to the end of the instruction list.
1304 */
1305 void
1306 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1307 src_reg surf_index,
1308 src_reg offset_reg,
1309 bblock_t *before_block,
1310 vec4_instruction *before_inst)
1311 {
1312 assert((before_inst == NULL && before_block == NULL) ||
1313 (before_inst && before_block));
1314
1315 vec4_instruction *pull;
1316
1317 if (devinfo->gen >= 9) {
1318 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1319 src_reg header(this, glsl_type::uvec4_type, 2);
1320
1321 pull = new(mem_ctx)
1322 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1323 dst_reg(header));
1324
1325 if (before_inst)
1326 emit_before(before_block, before_inst, pull);
1327 else
1328 emit(pull);
1329
1330 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1331 offset_reg.type);
1332 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1333
1334 if (before_inst)
1335 emit_before(before_block, before_inst, pull);
1336 else
1337 emit(pull);
1338
1339 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1340 dst,
1341 surf_index,
1342 header);
1343 pull->mlen = 2;
1344 pull->header_size = 1;
1345 } else if (devinfo->gen >= 7) {
1346 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1347
1348 grf_offset.type = offset_reg.type;
1349
1350 pull = MOV(grf_offset, offset_reg);
1351
1352 if (before_inst)
1353 emit_before(before_block, before_inst, pull);
1354 else
1355 emit(pull);
1356
1357 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1358 dst,
1359 surf_index,
1360 src_reg(grf_offset));
1361 pull->mlen = 1;
1362 } else {
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1364 dst,
1365 surf_index,
1366 offset_reg);
1367 pull->base_mrf = 14;
1368 pull->mlen = 1;
1369 }
1370
1371 if (before_inst)
1372 emit_before(before_block, before_inst, pull);
1373 else
1374 emit(pull);
1375 }
1376
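/* Copy the value of src from a single live channel to all channels of dst:
 * FIND_LIVE_CHANNEL picks the index of an enabled channel, and BROADCAST
 * then copies src's value from that channel to every channel of dst.
 */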
1377 void
1378 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1379 {
1380 const src_reg chan_index(this, glsl_type::uint_type);
1381
1382 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1383 ->force_writemask_all = true;
1384 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1385 ->force_writemask_all = true;
1386 }
1387
1388 void
1389 vec4_visitor::visit(ir_expression *ir)
1390 {
1391 unsigned int operand;
1392 src_reg op[ARRAY_SIZE(ir->operands)];
1393 vec4_instruction *inst;
1394
1395 if (ir->operation == ir_binop_add) {
1396 if (try_emit_mad(ir))
1397 return;
1398 }
1399
1400 if (ir->operation == ir_unop_b2f) {
1401 if (try_emit_b2f_of_compare(ir))
1402 return;
1403 }
1404
1405 /* Storage for our result. Ideally for an assignment we'd be using
1406 * the actual storage for the result here, instead.
1407 */
1408 dst_reg result_dst(this, ir->type);
1409 src_reg result_src(result_dst);
1410
1411 if (ir->operation == ir_triop_csel) {
1412 ir->operands[1]->accept(this);
1413 op[1] = this->result;
1414 ir->operands[2]->accept(this);
1415 op[2] = this->result;
1416
1417 enum brw_predicate predicate;
1418 emit_bool_to_cond_code(ir->operands[0], &predicate);
1419 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1420 inst->predicate = predicate;
1421 this->result = result_src;
1422 return;
1423 }
1424
1425 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1426 this->result.file = BAD_FILE;
1427 ir->operands[operand]->accept(this);
1428 if (this->result.file == BAD_FILE) {
1429 fprintf(stderr, "Failed to get tree for expression operand:\n");
1430 ir->operands[operand]->fprint(stderr);
1431 exit(1);
1432 }
1433 op[operand] = this->result;
1434
1435 /* Matrix expression operands should have been broken down to vector
1436 * operations already.
1437 */
1438 assert(!ir->operands[operand]->type->is_matrix());
1439 }
1440
1441 /* If nothing special happens, this is the result. */
1442 this->result = result_src;
1443
1444 switch (ir->operation) {
1445 case ir_unop_logic_not:
1446 emit(NOT(result_dst, op[0]));
1447 break;
1448 case ir_unop_neg:
1449 op[0].negate = !op[0].negate;
1450 emit(MOV(result_dst, op[0]));
1451 break;
1452 case ir_unop_abs:
1453 op[0].abs = true;
1454 op[0].negate = false;
1455 emit(MOV(result_dst, op[0]));
1456 break;
1457
1458 case ir_unop_sign:
1459 if (ir->type->is_float()) {
1460 /* AND(val, 0x80000000) gives the sign bit.
1461 *
1462 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1463 * zero.
1464 */
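         /* Illustrative example: for op[0] = -2.5f (0xC0200000), the AND keeps
          * only the sign bit, 0x80000000.  The CMP below flags the value as
          * nonzero, so the predicated OR produces 0x80000000 | 0x3f800000 =
          * 0xBF800000, i.e. -1.0f, which is sign(-2.5).
          */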
1465 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1466
1467 op[0].type = BRW_REGISTER_TYPE_UD;
1468 result_dst.type = BRW_REGISTER_TYPE_UD;
1469 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1470
1471 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1472 inst->predicate = BRW_PREDICATE_NORMAL;
1473
1474 this->result.type = BRW_REGISTER_TYPE_F;
1475 } else {
1476 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1477 * -> non-negative val generates 0x00000000.
1478 * Predicated OR sets 1 if val is positive.
1479 */
1480 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1481
1482 emit(ASR(result_dst, op[0], src_reg(31)));
1483
1484 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1485 inst->predicate = BRW_PREDICATE_NORMAL;
1486 }
1487 break;
1488
1489 case ir_unop_rcp:
1490 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1491 break;
1492
1493 case ir_unop_exp2:
1494 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1495 break;
1496 case ir_unop_log2:
1497 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1498 break;
1499 case ir_unop_exp:
1500 case ir_unop_log:
1501 unreachable("not reached: should be handled by ir_explog_to_explog2");
1502 case ir_unop_sin:
1503 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1504 break;
1505 case ir_unop_cos:
1506 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_dFdx:
1510 case ir_unop_dFdx_coarse:
1511 case ir_unop_dFdx_fine:
1512 case ir_unop_dFdy:
1513 case ir_unop_dFdy_coarse:
1514 case ir_unop_dFdy_fine:
1515 unreachable("derivatives not valid in vertex shader");
1516
1517 case ir_unop_bitfield_reverse:
1518 emit(BFREV(result_dst, op[0]));
1519 break;
1520 case ir_unop_bit_count:
1521 emit(CBIT(result_dst, op[0]));
1522 break;
1523 case ir_unop_find_msb: {
1524 src_reg temp = src_reg(this, glsl_type::uint_type);
1525
1526 inst = emit(FBH(dst_reg(temp), op[0]));
1527 inst->dst.writemask = WRITEMASK_XYZW;
1528
1529 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1530 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1531 * subtract the result from 31 to convert the MSB count into an LSB count.
1532 */
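      /* For example (assuming FBH reports the bit position counted from the
       * MSB side, i.e. the number of leading zeros): FBH(0x00000004) returns
       * 29, and 31 - 29 = 2, which is findMSB(4) counted from the LSB.
       */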
1533
1534 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1535 temp.swizzle = BRW_SWIZZLE_NOOP;
1536 emit(MOV(result_dst, temp));
1537
1538 src_reg src_tmp = src_reg(result_dst);
1539 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1540
1541 src_tmp.negate = true;
1542 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1543 inst->predicate = BRW_PREDICATE_NORMAL;
1544 break;
1545 }
1546 case ir_unop_find_lsb:
1547 emit(FBL(result_dst, op[0]));
1548 break;
1549 case ir_unop_saturate:
1550 inst = emit(MOV(result_dst, op[0]));
1551 inst->saturate = true;
1552 break;
1553
1554 case ir_unop_noise:
1555 unreachable("not reached: should be handled by lower_noise");
1556
1557 case ir_binop_add:
1558 emit(ADD(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_sub:
1561 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1562
1563 case ir_binop_mul:
1564 if (devinfo->gen < 8 && ir->type->is_integer()) {
1565 /* For integer multiplication, the MUL uses the low 16 bits of one of
1566 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1567 * accumulates in the contribution of the upper 16 bits of that
1568 * operand. If we can determine that one of the args is in the low
1569 * 16 bits, though, we can just emit a single MUL.
1570 */
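         /* (E.g. a constant operand known to fit in 16 bits, such as 0xffff,
          * lets a single MUL produce the full 32-bit product, skipping the
          * MACH/accumulator sequence in the else branch below.)
          */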
1571 if (ir->operands[0]->is_uint16_constant()) {
1572 if (devinfo->gen < 7)
1573 emit(MUL(result_dst, op[0], op[1]));
1574 else
1575 emit(MUL(result_dst, op[1], op[0]));
1576 } else if (ir->operands[1]->is_uint16_constant()) {
1577 if (devinfo->gen < 7)
1578 emit(MUL(result_dst, op[1], op[0]));
1579 else
1580 emit(MUL(result_dst, op[0], op[1]));
1581 } else {
1582 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1583
1584 emit(MUL(acc, op[0], op[1]));
1585 emit(MACH(dst_null_d(), op[0], op[1]));
1586 emit(MOV(result_dst, src_reg(acc)));
1587 }
1588 } else {
1589 emit(MUL(result_dst, op[0], op[1]));
1590 }
1591 break;
1592 case ir_binop_imul_high: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(result_dst, op[0], op[1]));
1597 break;
1598 }
1599 case ir_binop_div:
1600 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1601 assert(ir->type->is_integer());
1602 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1603 break;
1604 case ir_binop_carry: {
1605 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1606
1607 emit(ADDC(dst_null_ud(), op[0], op[1]));
1608 emit(MOV(result_dst, src_reg(acc)));
1609 break;
1610 }
1611 case ir_binop_borrow: {
1612 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1613
1614 emit(SUBB(dst_null_ud(), op[0], op[1]));
1615 emit(MOV(result_dst, src_reg(acc)));
1616 break;
1617 }
1618 case ir_binop_mod:
1619 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1620 assert(ir->type->is_integer());
1621 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1622 break;
1623
1624 case ir_binop_less:
1625 case ir_binop_greater:
1626 case ir_binop_lequal:
1627 case ir_binop_gequal:
1628 case ir_binop_equal:
1629 case ir_binop_nequal: {
1630 if (devinfo->gen <= 5) {
1631 resolve_bool_comparison(ir->operands[0], &op[0]);
1632 resolve_bool_comparison(ir->operands[1], &op[1]);
1633 }
1634 emit(CMP(result_dst, op[0], op[1],
1635 brw_conditional_for_comparison(ir->operation)));
1636 break;
1637 }
1638
1639 case ir_binop_all_equal:
1640 if (devinfo->gen <= 5) {
1641 resolve_bool_comparison(ir->operands[0], &op[0]);
1642 resolve_bool_comparison(ir->operands[1], &op[1]);
1643 }
1644
1645 /* "==" operator producing a scalar boolean. */
1646 if (ir->operands[0]->type->is_vector() ||
1647 ir->operands[1]->type->is_vector()) {
1648 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1649 emit(MOV(result_dst, src_reg(0)));
1650 inst = emit(MOV(result_dst, src_reg(~0)));
1651 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1652 } else {
1653 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1654 }
1655 break;
1656 case ir_binop_any_nequal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "!=" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1666
1667 emit(MOV(result_dst, src_reg(0)));
1668 inst = emit(MOV(result_dst, src_reg(~0)));
1669 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1670 } else {
1671 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1672 }
1673 break;
1674
1675 case ir_unop_any:
1676 if (devinfo->gen <= 5) {
1677 resolve_bool_comparison(ir->operands[0], &op[0]);
1678 }
1679 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1680 emit(MOV(result_dst, src_reg(0)));
1681
1682 inst = emit(MOV(result_dst, src_reg(~0)));
1683 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1684 break;
1685
1686 case ir_binop_logic_xor:
1687 emit(XOR(result_dst, op[0], op[1]));
1688 break;
1689
1690 case ir_binop_logic_or:
1691 emit(OR(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_logic_and:
1695 emit(AND(result_dst, op[0], op[1]));
1696 break;
1697
1698 case ir_binop_dot:
1699 assert(ir->operands[0]->type->is_vector());
1700 assert(ir->operands[0]->type == ir->operands[1]->type);
1701 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1702 break;
1703
1704 case ir_unop_sqrt:
1705 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1706 break;
1707 case ir_unop_rsq:
1708 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1709 break;
1710
1711 case ir_unop_bitcast_i2f:
1712 case ir_unop_bitcast_u2f:
1713 this->result = op[0];
1714 this->result.type = BRW_REGISTER_TYPE_F;
1715 break;
1716
1717 case ir_unop_bitcast_f2i:
1718 this->result = op[0];
1719 this->result.type = BRW_REGISTER_TYPE_D;
1720 break;
1721
1722 case ir_unop_bitcast_f2u:
1723 this->result = op[0];
1724 this->result.type = BRW_REGISTER_TYPE_UD;
1725 break;
1726
1727 case ir_unop_i2f:
1728 case ir_unop_i2u:
1729 case ir_unop_u2i:
1730 case ir_unop_u2f:
1731 case ir_unop_f2i:
1732 case ir_unop_f2u:
1733 emit(MOV(result_dst, op[0]));
1734 break;
1735 case ir_unop_b2i:
1736 emit(AND(result_dst, op[0], src_reg(1)));
1737 break;
1738 case ir_unop_b2f:
1739 if (devinfo->gen <= 5) {
1740 resolve_bool_comparison(ir->operands[0], &op[0]);
1741 }
1742 op[0].type = BRW_REGISTER_TYPE_D;
1743 result_dst.type = BRW_REGISTER_TYPE_D;
1744 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1745 result_dst.type = BRW_REGISTER_TYPE_F;
1746 break;
1747 case ir_unop_f2b:
1748 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1749 break;
1750 case ir_unop_i2b:
1751 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1752 break;
1753
1754 case ir_unop_trunc:
1755 emit(RNDZ(result_dst, op[0]));
1756 break;
1757 case ir_unop_ceil: {
1758 src_reg tmp = src_reg(this, ir->type);
1759 op[0].negate = !op[0].negate;
1760 emit(RNDD(dst_reg(tmp), op[0]));
1761 tmp.negate = true;
1762 emit(MOV(result_dst, tmp));
1763 }
1764 break;
1765 case ir_unop_floor:
1766 inst = emit(RNDD(result_dst, op[0]));
1767 break;
1768 case ir_unop_fract:
1769 inst = emit(FRC(result_dst, op[0]));
1770 break;
1771 case ir_unop_round_even:
1772 emit(RNDE(result_dst, op[0]));
1773 break;
1774
1775 case ir_binop_min:
1776 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1777 break;
1778 case ir_binop_max:
1779 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1780 break;
1781
1782 case ir_binop_pow:
1783 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1784 break;
1785
1786 case ir_unop_bit_not:
1787 inst = emit(NOT(result_dst, op[0]));
1788 break;
1789 case ir_binop_bit_and:
1790 inst = emit(AND(result_dst, op[0], op[1]));
1791 break;
1792 case ir_binop_bit_xor:
1793 inst = emit(XOR(result_dst, op[0], op[1]));
1794 break;
1795 case ir_binop_bit_or:
1796 inst = emit(OR(result_dst, op[0], op[1]));
1797 break;
1798
1799 case ir_binop_lshift:
1800 inst = emit(SHL(result_dst, op[0], op[1]));
1801 break;
1802
1803 case ir_binop_rshift:
1804 if (ir->type->base_type == GLSL_TYPE_INT)
1805 inst = emit(ASR(result_dst, op[0], op[1]));
1806 else
1807 inst = emit(SHR(result_dst, op[0], op[1]));
1808 break;
1809
1810 case ir_binop_bfm:
1811 emit(BFI1(result_dst, op[0], op[1]));
1812 break;
1813
1814 case ir_binop_ubo_load: {
1815 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1816 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1817 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1818 src_reg offset;
1819
1820 /* Now, load the vector from that offset. */
1821 assert(ir->type->is_vector() || ir->type->is_scalar());
1822
1823 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1824 packed_consts.type = result.type;
1825 src_reg surf_index;
1826
1827 if (const_uniform_block) {
1828 /* The block index is a constant, so just emit the binding table entry
1829 * as an immediate.
1830 */
1831 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1832 const_uniform_block->value.u[0]);
1833 } else {
1834 /* The block index is not a constant. Evaluate the index expression
1835 * per-channel and add the base UBO index; we have to select a value
1836 * from any live channel.
1837 */
1838 surf_index = src_reg(this, glsl_type::uint_type);
1839 emit(ADD(dst_reg(surf_index), op[0],
1840 src_reg(prog_data->base.binding_table.ubo_start)));
1841 emit_uniformize(dst_reg(surf_index), surf_index);
1842
1843 /* Assume this may touch any UBO. It would be nice to provide
1844 * a tighter bound, but the array information is already lowered away.
1845 */
1846 brw_mark_surface_used(&prog_data->base,
1847 prog_data->base.binding_table.ubo_start +
1848 shader_prog->NumUniformBlocks - 1);
1849 }
1850
1851 if (const_offset_ir) {
1852 if (devinfo->gen >= 8) {
1853 /* Store the offset in a GRF so we can send-from-GRF. */
1854 offset = src_reg(this, glsl_type::int_type);
1855 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1856 } else {
1857 /* Immediates are fine on older generations since they'll be moved
1858 * to a (potentially fake) MRF at the generator level.
1859 */
1860 offset = src_reg(const_offset / 16);
1861 }
1862 } else {
1863 offset = src_reg(this, glsl_type::uint_type);
1864 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1865 }
1866
1867 emit_pull_constant_load_reg(dst_reg(packed_consts),
1868 surf_index,
1869 offset,
1870 NULL, NULL /* before_block/inst */);
1871
1872 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1873 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4,
1876 const_offset % 16 / 4);
1877
1878 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1879 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1880 emit(CMP(result_dst, packed_consts, src_reg(0u),
1881 BRW_CONDITIONAL_NZ));
1882 } else {
1883 emit(MOV(result_dst, packed_consts));
1884 }
1885 break;
1886 }
1887
1888 case ir_binop_vector_extract:
1889 unreachable("should have been lowered by vec_index_to_cond_assign");
1890
1891 case ir_triop_fma:
1892 op[0] = fix_3src_operand(op[0]);
1893 op[1] = fix_3src_operand(op[1]);
1894 op[2] = fix_3src_operand(op[2]);
1895 /* Note that the instruction's argument order is reversed from GLSL
1896 * and the IR.
1897 */
1898 emit(MAD(result_dst, op[2], op[1], op[0]));
1899 break;
1900
1901 case ir_triop_lrp:
1902 emit_lrp(result_dst, op[0], op[1], op[2]);
1903 break;
1904
1905 case ir_triop_csel:
1906 unreachable("already handled above");
1907 break;
1908
1909 case ir_triop_bfi:
1910 op[0] = fix_3src_operand(op[0]);
1911 op[1] = fix_3src_operand(op[1]);
1912 op[2] = fix_3src_operand(op[2]);
1913 emit(BFI2(result_dst, op[0], op[1], op[2]));
1914 break;
1915
1916 case ir_triop_bitfield_extract:
1917 op[0] = fix_3src_operand(op[0]);
1918 op[1] = fix_3src_operand(op[1]);
1919 op[2] = fix_3src_operand(op[2]);
1920 /* Note that the instruction's argument order is reversed from GLSL
1921 * and the IR.
1922 */
1923 emit(BFE(result_dst, op[2], op[1], op[0]));
1924 break;
1925
1926 case ir_triop_vector_insert:
1927 unreachable("should have been lowered by lower_vector_insert");
1928
1929 case ir_quadop_bitfield_insert:
1930 unreachable("not reached: should be handled by "
1931 "bitfield_insert_to_bfm_bfi\n");
1932
1933 case ir_quadop_vector:
1934 unreachable("not reached: should be handled by lower_quadop_vector");
1935
1936 case ir_unop_pack_half_2x16:
1937 emit_pack_half_2x16(result_dst, op[0]);
1938 break;
1939 case ir_unop_unpack_half_2x16:
1940 emit_unpack_half_2x16(result_dst, op[0]);
1941 break;
1942 case ir_unop_unpack_unorm_4x8:
1943 emit_unpack_unorm_4x8(result_dst, op[0]);
1944 break;
1945 case ir_unop_unpack_snorm_4x8:
1946 emit_unpack_snorm_4x8(result_dst, op[0]);
1947 break;
1948 case ir_unop_pack_unorm_4x8:
1949 emit_pack_unorm_4x8(result_dst, op[0]);
1950 break;
1951 case ir_unop_pack_snorm_4x8:
1952 emit_pack_snorm_4x8(result_dst, op[0]);
1953 break;
1954 case ir_unop_pack_snorm_2x16:
1955 case ir_unop_pack_unorm_2x16:
1956 case ir_unop_unpack_snorm_2x16:
1957 case ir_unop_unpack_unorm_2x16:
1958 unreachable("not reached: should be handled by lower_packing_builtins");
1959 case ir_unop_unpack_half_2x16_split_x:
1960 case ir_unop_unpack_half_2x16_split_y:
1961 case ir_binop_pack_half_2x16_split:
1962 case ir_unop_interpolate_at_centroid:
1963 case ir_binop_interpolate_at_sample:
1964 case ir_binop_interpolate_at_offset:
1965 unreachable("not reached: should not occur in vertex shader");
1966 case ir_binop_ldexp:
1967 unreachable("not reached: should be handled by ldexp_to_arith()");
1968 case ir_unop_d2f:
1969 case ir_unop_f2d:
1970 case ir_unop_d2i:
1971 case ir_unop_i2d:
1972 case ir_unop_d2u:
1973 case ir_unop_u2d:
1974 case ir_unop_d2b:
1975 case ir_unop_pack_double_2x32:
1976 case ir_unop_unpack_double_2x32:
1977 case ir_unop_frexp_sig:
1978 case ir_unop_frexp_exp:
1979 unreachable("fp64 todo");
1980 }
1981 }
1982
1983
1984 void
1985 vec4_visitor::visit(ir_swizzle *ir)
1986 {
1987 /* Note that this is only swizzles in expressions, not those on the left
1988 * hand side of an assignment, which do write masking. See ir_assignment
1989 * for that.
1990 */
1991 const unsigned swz = brw_compose_swizzle(
1992 brw_swizzle_for_size(ir->type->vector_elements),
1993 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1994
1995 ir->val->accept(this);
1996 this->result = swizzle(this->result, swz);
1997 }
1998
1999 void
2000 vec4_visitor::visit(ir_dereference_variable *ir)
2001 {
2002 const struct glsl_type *type = ir->type;
2003 dst_reg *reg = variable_storage(ir->var);
2004
2005 if (!reg) {
2006 fail("Failed to find variable storage for %s\n", ir->var->name);
2007 this->result = src_reg(brw_null_reg());
2008 return;
2009 }
2010
2011 this->result = src_reg(*reg);
2012
2013 /* System values get their swizzle from the dst_reg writemask */
2014 if (ir->var->data.mode == ir_var_system_value)
2015 return;
2016
2017 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2018 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2019 }
2020
2021
2022 int
2023 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2024 {
2025 /* Under normal circumstances array elements are stored consecutively, so
2026 * the stride is equal to the size of the array element.
2027 */
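/* With this backend's vec4 packing, a scalar or vector element occupies one
 * register (stride 1), while a matrix element takes one register per column,
 * e.g. a stride of 4 for mat4.
 */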
2028 return type_size(ir->type);
2029 }
2030
2031
2032 void
2033 vec4_visitor::visit(ir_dereference_array *ir)
2034 {
2035 ir_constant *constant_index;
2036 src_reg src;
2037 int array_stride = compute_array_stride(ir);
2038
2039 constant_index = ir->array_index->constant_expression_value();
2040
2041 ir->array->accept(this);
2042 src = this->result;
2043
2044 if (constant_index) {
2045 src.reg_offset += constant_index->value.i[0] * array_stride;
2046 } else {
2047 /* Variable index array dereference. It eats the "vec4" of the
2048 * base of the array and an index that offsets the Mesa register
2049 * index.
2050 */
2051 ir->array_index->accept(this);
2052
2053 src_reg index_reg;
2054
2055 if (array_stride == 1) {
2056 index_reg = this->result;
2057 } else {
2058 index_reg = src_reg(this, glsl_type::int_type);
2059
2060 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2061 }
2062
2063 if (src.reladdr) {
2064 src_reg temp = src_reg(this, glsl_type::int_type);
2065
2066 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2067
2068 index_reg = temp;
2069 }
2070
2071 src.reladdr = ralloc(mem_ctx, src_reg);
2072 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2073 }
2074
2075 /* If the type is smaller than a vec4, replicate the last channel out. */
2076 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2077 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2078 else
2079 src.swizzle = BRW_SWIZZLE_NOOP;
2080 src.type = brw_type_for_base_type(ir->type);
2081
2082 this->result = src;
2083 }
2084
2085 void
2086 vec4_visitor::visit(ir_dereference_record *ir)
2087 {
2088 unsigned int i;
2089 const glsl_type *struct_type = ir->record->type;
2090 int offset = 0;
2091
2092 ir->record->accept(this);
2093
2094 for (i = 0; i < struct_type->length; i++) {
2095 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2096 break;
2097 offset += type_size(struct_type->fields.structure[i].type);
2098 }
2099
2100 /* If the type is smaller than a vec4, replicate the last channel out. */
2101 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2102 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2103 else
2104 this->result.swizzle = BRW_SWIZZLE_NOOP;
2105 this->result.type = brw_type_for_base_type(ir->type);
2106
2107 this->result.reg_offset += offset;
2108 }
2109
2110 /**
2111 * We want to be careful in assignment setup to hit the actual storage
2112 * instead of potentially using a temporary like we might with the
2113 * ir_dereference handler.
2114 */
2115 static dst_reg
2116 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2117 {
2118 /* The LHS must be a dereference. If the LHS is a variable indexed array
2119 * access of a vector, it must be separated into a series of conditional moves
2120 * before reaching this point (see ir_vec_index_to_cond_assign).
2121 */
2122 assert(ir->as_dereference());
2123 ir_dereference_array *deref_array = ir->as_dereference_array();
2124 if (deref_array) {
2125 assert(!deref_array->array->type->is_vector());
2126 }
2127
2128 /* Use the rvalue deref handler for the most part. We'll ignore
2129 * swizzles in it and write swizzles using writemask, though.
2130 */
2131 ir->accept(v);
2132 return dst_reg(v->result);
2133 }
2134
2135 void
2136 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2137 const struct glsl_type *type,
2138 enum brw_predicate predicate)
2139 {
2140 if (type->base_type == GLSL_TYPE_STRUCT) {
2141 for (unsigned int i = 0; i < type->length; i++) {
2142 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2143 }
2144 return;
2145 }
2146
2147 if (type->is_array()) {
2148 for (unsigned int i = 0; i < type->length; i++) {
2149 emit_block_move(dst, src, type->fields.array, predicate);
2150 }
2151 return;
2152 }
2153
2154 if (type->is_matrix()) {
2155 const struct glsl_type *vec_type;
2156
2157 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2158 type->vector_elements, 1);
2159
2160 for (int i = 0; i < type->matrix_columns; i++) {
2161 emit_block_move(dst, src, vec_type, predicate);
2162 }
2163 return;
2164 }
2165
2166 assert(type->is_scalar() || type->is_vector());
2167
2168 dst->type = brw_type_for_base_type(type);
2169 src->type = dst->type;
2170
2171 dst->writemask = (1 << type->vector_elements) - 1;
2172
2173 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2174
2175 vec4_instruction *inst = emit(MOV(*dst, *src));
2176 inst->predicate = predicate;
2177
2178 dst->reg_offset++;
2179 src->reg_offset++;
2180 }
2181
2182
2183 /* If the RHS processing resulted in an instruction generating a
2184 * temporary value, and it would be easy to rewrite the instruction to
2185 * generate its result right into the LHS instead, do so. This ends
2186 * up reliably removing instructions where it can be tricky to do so
2187 * later without real UD chain information.
2188 */
2189 bool
2190 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2191 dst_reg dst,
2192 src_reg src,
2193 vec4_instruction *pre_rhs_inst,
2194 vec4_instruction *last_rhs_inst)
2195 {
2196 /* This could be supported, but it would take more smarts. */
2197 if (ir->condition)
2198 return false;
2199
2200 if (pre_rhs_inst == last_rhs_inst)
2201 return false; /* No instructions generated to work with. */
2202
2203 /* Make sure the last instruction generated our source reg. */
2204 if (src.file != GRF ||
2205 src.file != last_rhs_inst->dst.file ||
2206 src.reg != last_rhs_inst->dst.reg ||
2207 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2208 src.reladdr ||
2209 src.abs ||
2210 src.negate ||
2211 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2212 return false;
2213
2214 /* Check that the last instruction fully initialized the channels
2215 * we want to use, in the order we want to use them. We could
2216 * potentially reswizzle the operands of many instructions so that
2217 * we could handle out of order channels, but don't yet.
2218 */
2219
2220 for (unsigned i = 0; i < 4; i++) {
2221 if (dst.writemask & (1 << i)) {
2222 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2223 return false;
2224
2225 if (BRW_GET_SWZ(src.swizzle, i) != i)
2226 return false;
2227 }
2228 }
2229
2230 /* Success! Rewrite the instruction. */
2231 last_rhs_inst->dst.file = dst.file;
2232 last_rhs_inst->dst.reg = dst.reg;
2233 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2234 last_rhs_inst->dst.reladdr = dst.reladdr;
2235 last_rhs_inst->dst.writemask &= dst.writemask;
2236
2237 return true;
2238 }
2239
2240 void
2241 vec4_visitor::visit(ir_assignment *ir)
2242 {
2243 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2244 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2245
2246 if (!ir->lhs->type->is_scalar() &&
2247 !ir->lhs->type->is_vector()) {
2248 ir->rhs->accept(this);
2249 src_reg src = this->result;
2250
2251 if (ir->condition) {
2252 emit_bool_to_cond_code(ir->condition, &predicate);
2253 }
2254
2255 /* emit_block_move doesn't account for swizzles in the source register.
2256 * This should be ok, since the source register is a structure or an
2257 * array, and those can't be swizzled. But double-check to be sure.
2258 */
2259 assert(src.swizzle ==
2260 (ir->rhs->type->is_matrix()
2261 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2262 : BRW_SWIZZLE_NOOP));
2263
2264 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2265 return;
2266 }
2267
2268 /* Now we're down to just a scalar/vector with writemasks. */
2269 int i;
2270
2271 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2272 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2273
2274 ir->rhs->accept(this);
2275
2276 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2277
2278 int swizzles[4];
2279 int src_chan = 0;
2280
2281 assert(ir->lhs->type->is_vector() ||
2282 ir->lhs->type->is_scalar());
2283 dst.writemask = ir->write_mask;
2284
2285 /* Swizzle a small RHS vector into the channels being written.
2286 *
2287 * glsl ir treats write_mask as dictating how many channels are
2288 * present on the RHS while in our instructions we need to make
2289 * those channels appear in the slots of the vec4 they're written to.
2290 */
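/* For example, a .yw write mask yields swizzles = { 0, 0, 0, 1 }, so RHS
 * channel 0 lands in .y of the destination and channel 1 lands in .w.
 */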
2291 for (int i = 0; i < 4; i++)
2292 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2293
2294 src_reg src = swizzle(this->result,
2295 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2296 swizzles[2], swizzles[3]));
2297
2298 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2299 return;
2300 }
2301
2302 if (ir->condition) {
2303 emit_bool_to_cond_code(ir->condition, &predicate);
2304 }
2305
2306 for (i = 0; i < type_size(ir->lhs->type); i++) {
2307 vec4_instruction *inst = emit(MOV(dst, src));
2308 inst->predicate = predicate;
2309
2310 dst.reg_offset++;
2311 src.reg_offset++;
2312 }
2313 }
2314
2315 void
2316 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2317 {
2318 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2319 foreach_in_list(ir_constant, field_value, &ir->components) {
2320 emit_constant_values(dst, field_value);
2321 }
2322 return;
2323 }
2324
2325 if (ir->type->is_array()) {
2326 for (unsigned int i = 0; i < ir->type->length; i++) {
2327 emit_constant_values(dst, ir->array_elements[i]);
2328 }
2329 return;
2330 }
2331
2332 if (ir->type->is_matrix()) {
2333 for (int i = 0; i < ir->type->matrix_columns; i++) {
2334 float *vec = &ir->value.f[i * ir->type->vector_elements];
2335
2336 for (int j = 0; j < ir->type->vector_elements; j++) {
2337 dst->writemask = 1 << j;
2338 dst->type = BRW_REGISTER_TYPE_F;
2339
2340 emit(MOV(*dst, src_reg(vec[j])));
2341 }
2342 dst->reg_offset++;
2343 }
2344 return;
2345 }
2346
2347 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2348
2349 for (int i = 0; i < ir->type->vector_elements; i++) {
2350 if (!(remaining_writemask & (1 << i)))
2351 continue;
2352
2353 dst->writemask = 1 << i;
2354 dst->type = brw_type_for_base_type(ir->type);
2355
2356 /* Find other components that match the one we're about to
2357 * write. Emits fewer instructions for things like vec4(0.5,
2358 * 1.5, 1.5, 1.5).
2359 */
2360 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2361 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2362 if (ir->value.b[i] == ir->value.b[j])
2363 dst->writemask |= (1 << j);
2364 } else {
2365 /* u, i, and f storage all line up, so no need for a
2366 * switch case for comparing each type.
2367 */
2368 if (ir->value.u[i] == ir->value.u[j])
2369 dst->writemask |= (1 << j);
2370 }
2371 }
2372
2373 switch (ir->type->base_type) {
2374 case GLSL_TYPE_FLOAT:
2375 emit(MOV(*dst, src_reg(ir->value.f[i])));
2376 break;
2377 case GLSL_TYPE_INT:
2378 emit(MOV(*dst, src_reg(ir->value.i[i])));
2379 break;
2380 case GLSL_TYPE_UINT:
2381 emit(MOV(*dst, src_reg(ir->value.u[i])));
2382 break;
2383 case GLSL_TYPE_BOOL:
2384 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2385 break;
2386 default:
2387 unreachable("Non-float/uint/int/bool constant");
2388 }
2389
2390 remaining_writemask &= ~dst->writemask;
2391 }
2392 dst->reg_offset++;
2393 }
2394
2395 void
2396 vec4_visitor::visit(ir_constant *ir)
2397 {
2398 dst_reg dst = dst_reg(this, ir->type);
2399 this->result = src_reg(dst);
2400
2401 emit_constant_values(&dst, ir);
2402 }
2403
2404 void
2405 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2406 {
2407 ir_dereference *deref = static_cast<ir_dereference *>(
2408 ir->actual_parameters.get_head());
2409 ir_variable *location = deref->variable_referenced();
2410 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2411 location->data.binding);
2412
2413 /* Calculate the surface offset */
2414 src_reg offset(this, glsl_type::uint_type);
2415 ir_dereference_array *deref_array = deref->as_dereference_array();
2416 if (deref_array) {
2417 deref_array->array_index->accept(this);
2418
2419 src_reg tmp(this, glsl_type::uint_type);
2420 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2421 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2422 } else {
2423 offset = location->data.atomic.offset;
2424 }
2425
2426 /* Emit the appropriate machine instruction */
2427 const char *callee = ir->callee->function_name();
2428 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2429
2430 if (!strcmp("__intrinsic_atomic_read", callee)) {
2431 emit_untyped_surface_read(surf_index, dst, offset);
2432
2433 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2434 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2435 src_reg(), src_reg());
2436
2437 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2438 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2439 src_reg(), src_reg());
2440 }
2441 }
2442
2443 void
2444 vec4_visitor::visit(ir_call *ir)
2445 {
2446 const char *callee = ir->callee->function_name();
2447
2448 if (!strcmp("__intrinsic_atomic_read", callee) ||
2449 !strcmp("__intrinsic_atomic_increment", callee) ||
2450 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2451 visit_atomic_counter_intrinsic(ir);
2452 } else {
2453 unreachable("Unsupported intrinsic.");
2454 }
2455 }
2456
2457 src_reg
2458 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2459 {
2460 vec4_instruction *inst =
2461 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2462 dst_reg(this, glsl_type::uvec4_type));
2463 inst->base_mrf = 2;
2464 inst->src[1] = sampler;
2465
2466 int param_base;
2467
2468 if (devinfo->gen >= 9) {
2469 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2470 vec4_instruction *header_inst = new(mem_ctx)
2471 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2472 dst_reg(MRF, inst->base_mrf));
2473
2474 emit(header_inst);
2475
2476 inst->mlen = 2;
2477 inst->header_size = 1;
2478 param_base = inst->base_mrf + 1;
2479 } else {
2480 inst->mlen = 1;
2481 param_base = inst->base_mrf;
2482 }
2483
2484 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2485 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2486 int zero_mask = 0xf & ~coord_mask;
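/* e.g. a vec2 coordinate gives coord_mask == 0x3 and zero_mask == 0xc, so
 * .zw of the payload register are written with zero below.
 */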
2487
2488 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2489 coordinate));
2490
2491 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2492 src_reg(0)));
2493
2494 emit(inst);
2495 return src_reg(inst->dst);
2496 }
2497
2498 static bool
2499 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2500 {
2501 if (devinfo->gen < 8 && !devinfo->is_haswell)
2502 return false;
2503
2504 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2505 }
2506
2507 void
2508 vec4_visitor::visit(ir_texture *ir)
2509 {
2510 uint32_t sampler =
2511 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2512
2513 ir_rvalue *nonconst_sampler_index =
2514 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2515
2516 /* Handle non-constant sampler array indexing */
2517 src_reg sampler_reg;
2518 if (nonconst_sampler_index) {
2519 /* The highest sampler which may be used by this operation is
2520 * the last element of the array. Mark it here, because the generator
2521 * doesn't have enough information to determine the bound.
2522 */
2523 uint32_t array_size = ir->sampler->as_dereference_array()
2524 ->array->type->array_size();
2525
2526 uint32_t max_used = sampler + array_size - 1;
2527 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2528 max_used += prog_data->base.binding_table.gather_texture_start;
2529 } else {
2530 max_used += prog_data->base.binding_table.texture_start;
2531 }
2532
2533 brw_mark_surface_used(&prog_data->base, max_used);
2534
2535 /* Emit code to evaluate the actual indexing expression */
2536 nonconst_sampler_index->accept(this);
2537 dst_reg temp(this, glsl_type::uint_type);
2538 emit(ADD(temp, this->result, src_reg(sampler)));
2539 emit_uniformize(temp, src_reg(temp));
2540
2541 sampler_reg = src_reg(temp);
2542 } else {
2543 /* Single sampler, or constant array index; the indexing expression
2544 * is just an immediate.
2545 */
2546 sampler_reg = src_reg(sampler);
2547 }
2548
2549 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2550 * emitting anything other than setting up the constant result.
2551 */
2552 if (ir->op == ir_tg4) {
2553 ir_constant *chan = ir->lod_info.component->as_constant();
2554 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2555 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2556 dst_reg result(this, ir->type);
2557 this->result = src_reg(result);
2558 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2559 return;
2560 }
2561 }
2562
2563 /* Should be lowered by do_lower_texture_projection */
2564 assert(!ir->projector);
2565
2566 /* Should be lowered */
2567 assert(!ir->offset || !ir->offset->type->is_array());
2568
2569 /* Generate code to compute all the subexpression trees. This has to be
2570 * done before loading any values into MRFs for the sampler message since
2571 * generating these values may involve SEND messages that need the MRFs.
2572 */
2573 src_reg coordinate;
2574 if (ir->coordinate) {
2575 ir->coordinate->accept(this);
2576 coordinate = this->result;
2577 }
2578
2579 src_reg shadow_comparitor;
2580 if (ir->shadow_comparitor) {
2581 ir->shadow_comparitor->accept(this);
2582 shadow_comparitor = this->result;
2583 }
2584
2585 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2586 src_reg offset_value;
2587 if (has_nonconstant_offset) {
2588 ir->offset->accept(this);
2589 offset_value = src_reg(this->result);
2590 }
2591
2592 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2593 src_reg lod, dPdx, dPdy, sample_index, mcs;
2594 switch (ir->op) {
2595 case ir_tex:
2596 lod = src_reg(0.0f);
2597 lod_type = glsl_type::float_type;
2598 break;
2599 case ir_txf:
2600 case ir_txl:
2601 case ir_txs:
2602 ir->lod_info.lod->accept(this);
2603 lod = this->result;
2604 lod_type = ir->lod_info.lod->type;
2605 break;
2606 case ir_query_levels:
2607 lod = src_reg(0);
2608 lod_type = glsl_type::int_type;
2609 break;
2610 case ir_txf_ms:
2611 ir->lod_info.sample_index->accept(this);
2612 sample_index = this->result;
2613 sample_index_type = ir->lod_info.sample_index->type;
2614
2615 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2616 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2617 else
2618 mcs = src_reg(0u);
2619 break;
2620 case ir_txd:
2621 ir->lod_info.grad.dPdx->accept(this);
2622 dPdx = this->result;
2623
2624 ir->lod_info.grad.dPdy->accept(this);
2625 dPdy = this->result;
2626
2627 lod_type = ir->lod_info.grad.dPdx->type;
2628 break;
2629 case ir_txb:
2630 case ir_lod:
2631 case ir_tg4:
2632 break;
2633 }
2634
2635 enum opcode opcode;
2636 switch (ir->op) {
2637 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2638 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2639 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2640 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2641 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2642 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2643 case ir_tg4: opcode = has_nonconstant_offset
2644 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2645 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2646 case ir_txb:
2647 unreachable("TXB is not valid for vertex shaders.");
2648 case ir_lod:
2649 unreachable("LOD is not valid for vertex shaders.");
2650 default:
2651 unreachable("Unrecognized tex op");
2652 }
2653
2654 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2655 opcode, dst_reg(this, ir->type));
2656
2657 if (ir->offset != NULL && !has_nonconstant_offset) {
2658 inst->offset =
2659 brw_texture_offset(ir->offset->as_constant()->value.i,
2660 ir->offset->type->vector_elements);
2661 }
2662
2663 /* Stuff the channel select bits in the top of the texture offset */
2664 if (ir->op == ir_tg4)
2665 inst->offset |= gather_channel(ir, sampler) << 16;
2666
2667 /* The message header is necessary for:
2668 * - Gen4 (always)
2669 * - Gen9+ for selecting SIMD4x2
2670 * - Texel offsets
2671 * - Gather channel selection
2672 * - Sampler indices too large to fit in a 4-bit value.
2673 */
2674 inst->header_size =
2675 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2676 inst->offset != 0 || ir->op == ir_tg4 ||
2677 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2678 inst->base_mrf = 2;
2679 inst->mlen = inst->header_size + 1; /* always at least one */
2680 inst->dst.writemask = WRITEMASK_XYZW;
2681 inst->shadow_compare = ir->shadow_comparitor != NULL;
2682
2683 inst->src[1] = sampler_reg;
2684
2685 /* MRF for the first parameter */
2686 int param_base = inst->base_mrf + inst->header_size;
2687
2688 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2689 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2690 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2691 } else {
2692 /* Load the coordinate */
2693 /* FINISHME: gl_clamp_mask and saturate */
2694 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2695 int zero_mask = 0xf & ~coord_mask;
2696
2697 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2698 coordinate));
2699
2700 if (zero_mask != 0) {
2701 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2702 src_reg(0)));
2703 }
2704 /* Load the shadow comparitor */
2705 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2706 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2707 WRITEMASK_X),
2708 shadow_comparitor));
2709 inst->mlen++;
2710 }
2711
2712 /* Load the LOD info */
2713 if (ir->op == ir_tex || ir->op == ir_txl) {
2714 int mrf, writemask;
2715 if (devinfo->gen >= 5) {
2716 mrf = param_base + 1;
2717 if (ir->shadow_comparitor) {
2718 writemask = WRITEMASK_Y;
2719 /* mlen already incremented */
2720 } else {
2721 writemask = WRITEMASK_X;
2722 inst->mlen++;
2723 }
2724 } else /* devinfo->gen == 4 */ {
2725 mrf = param_base;
2726 writemask = WRITEMASK_W;
2727 }
2728 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2729 } else if (ir->op == ir_txf) {
2730 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2731 } else if (ir->op == ir_txf_ms) {
2732 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2733 sample_index));
2734 if (devinfo->gen >= 7) {
2735 /* MCS data is in the first channel of `mcs`, but we need to get it into
2736 * the .y channel of the second vec4 of params, so replicate .x across
2737 * the whole vec4 and then mask off everything except .y
2738 */
2739 mcs.swizzle = BRW_SWIZZLE_XXXX;
2740 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2741 mcs));
2742 }
2743 inst->mlen++;
2744 } else if (ir->op == ir_txd) {
2745 const glsl_type *type = lod_type;
2746
2747 if (devinfo->gen >= 5) {
2748 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2749 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2750 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2751 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
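/* The XXYY swizzles interleave the gradients so that param_base + 1 holds
 * (dPdx.x, dPdy.x, dPdx.y, dPdy.y), which appears to be the layout the
 * Gen5+ sample_d message expects in SIMD4x2 mode.
 */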
2752 inst->mlen++;
2753
2754 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2755 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2756 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2758 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2759 inst->mlen++;
2760
2761 if (ir->shadow_comparitor) {
2762 emit(MOV(dst_reg(MRF, param_base + 2,
2763 ir->shadow_comparitor->type, WRITEMASK_Z),
2764 shadow_comparitor));
2765 }
2766 }
2767 } else /* devinfo->gen == 4 */ {
2768 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2769 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2770 inst->mlen += 2;
2771 }
2772 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2773 if (ir->shadow_comparitor) {
2774 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2775 shadow_comparitor));
2776 }
2777
2778 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2779 offset_value));
2780 inst->mlen++;
2781 }
2782 }
2783
2784 emit(inst);
2785
2786 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2787 * spec requires layers.
2788 */
2789 if (ir->op == ir_txs) {
2790 glsl_type const *type = ir->sampler->type;
2791 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2792 type->sampler_array) {
2793 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2794 writemask(inst->dst, WRITEMASK_Z),
2795 src_reg(inst->dst), src_reg(6));
2796 }
2797 }
2798
2799 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2800 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2801 }
2802
2803 swizzle_result(ir, src_reg(inst->dst), sampler);
2804 }
2805
2806 /**
2807 * Apply workarounds for Gen6 gather with UINT/SINT
2808 */
2809 void
2810 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2811 {
2812 if (!wa)
2813 return;
2814
2815 int width = (wa & WA_8BIT) ? 8 : 16;
2816 dst_reg dst_f = dst;
2817 dst_f.type = BRW_REGISTER_TYPE_F;
2818
2819 /* Convert from UNORM to UINT */
2820 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2821 emit(MOV(dst, src_reg(dst_f)));
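/* e.g. for an 8-bit surface this scales the UNORM result by 255 (2^8 - 1)
 * to recover the original integer value before moving it back to an
 * integer type.
 */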
2822
2823 if (wa & WA_SIGN) {
2824 /* Reinterpret the UINT value as a signed INT value by
2825 * shifting the sign bit into place, then shifting back
2826 * preserving sign.
2827 */
2828 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2829 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2830 }
2831 }
2832
2833 /**
2834 * Set up the gather channel based on the swizzle, for gather4.
2835 */
2836 uint32_t
2837 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2838 {
2839 ir_constant *chan = ir->lod_info.component->as_constant();
2840 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2841 switch (swiz) {
2842 case SWIZZLE_X: return 0;
2843 case SWIZZLE_Y:
2844 /* gather4 sampler is broken for green channel on RG32F --
2845 * we must ask for blue instead.
2846 */
2847 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2848 return 2;
2849 return 1;
2850 case SWIZZLE_Z: return 2;
2851 case SWIZZLE_W: return 3;
2852 default:
2853 unreachable("Not reached"); /* zero, one swizzles handled already */
2854 }
2855 }
2856
2857 void
2858 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2859 {
2860 int s = key->tex.swizzles[sampler];
2861
2862 this->result = src_reg(this, ir->type);
2863 dst_reg swizzled_result(this->result);
2864
2865 if (ir->op == ir_query_levels) {
2866 /* # levels is in .w */
2867 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2868 emit(MOV(swizzled_result, orig_val));
2869 return;
2870 }
2871
2872 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2873 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2874 emit(MOV(swizzled_result, orig_val));
2875 return;
2876 }
2877
2878
2879 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2880 int swizzle[4] = {0};
2881
2882 for (int i = 0; i < 4; i++) {
2883 switch (GET_SWZ(s, i)) {
2884 case SWIZZLE_ZERO:
2885 zero_mask |= (1 << i);
2886 break;
2887 case SWIZZLE_ONE:
2888 one_mask |= (1 << i);
2889 break;
2890 default:
2891 copy_mask |= (1 << i);
2892 swizzle[i] = GET_SWZ(s, i);
2893 break;
2894 }
2895 }
2896
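/* For example, a GL swizzle of (R, G, ZERO, ONE) produces copy_mask == 0x3,
 * zero_mask == 0x4 and one_mask == 0x8, so only .xy are sourced from the
 * actual texture result.
 */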
2897 if (copy_mask) {
2898 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2899 swizzled_result.writemask = copy_mask;
2900 emit(MOV(swizzled_result, orig_val));
2901 }
2902
2903 if (zero_mask) {
2904 swizzled_result.writemask = zero_mask;
2905 emit(MOV(swizzled_result, src_reg(0.0f)));
2906 }
2907
2908 if (one_mask) {
2909 swizzled_result.writemask = one_mask;
2910 emit(MOV(swizzled_result, src_reg(1.0f)));
2911 }
2912 }
2913
2914 void
2915 vec4_visitor::visit(ir_return *)
2916 {
2917 unreachable("not reached");
2918 }
2919
2920 void
2921 vec4_visitor::visit(ir_discard *)
2922 {
2923 unreachable("not reached");
2924 }
2925
2926 void
2927 vec4_visitor::visit(ir_if *ir)
2928 {
2929 /* Don't point the annotation at the if statement, because then it plus
2930 * the then and else blocks get printed.
2931 */
2932 this->base_ir = ir->condition;
2933
2934 if (devinfo->gen == 6) {
2935 emit_if_gen6(ir);
2936 } else {
2937 enum brw_predicate predicate;
2938 emit_bool_to_cond_code(ir->condition, &predicate);
2939 emit(IF(predicate));
2940 }
2941
2942 visit_instructions(&ir->then_instructions);
2943
2944 if (!ir->else_instructions.is_empty()) {
2945 this->base_ir = ir->condition;
2946 emit(BRW_OPCODE_ELSE);
2947
2948 visit_instructions(&ir->else_instructions);
2949 }
2950
2951 this->base_ir = ir->condition;
2952 emit(BRW_OPCODE_ENDIF);
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_emit_vertex *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::visit(ir_end_primitive *)
2963 {
2964 unreachable("not reached");
2965 }
2966
2967 void
2968 vec4_visitor::visit(ir_barrier *)
2969 {
2970 unreachable("not reached");
2971 }
2972
2973 void
2974 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2975 dst_reg dst, src_reg offset,
2976 src_reg src0, src_reg src1)
2977 {
2978 unsigned mlen = 0;
2979
2980 /* Set the atomic operation offset. */
2981 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2982 mlen++;
2983
2984 /* Set the atomic operation arguments. */
2985 if (src0.file != BAD_FILE) {
2986 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2987 mlen++;
2988 }
2989
2990 if (src1.file != BAD_FILE) {
2991 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2992 mlen++;
2993 }
2994
2995 /* Emit the instruction. Note that this maps to the normal SIMD8
2996 * untyped atomic message on Ivy Bridge, but that's OK because
2997 * unused channels will be masked out.
2998 */
2999 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3000 brw_message_reg(0),
3001 src_reg(surf_index), src_reg(atomic_op));
3002 inst->mlen = mlen;
3003 }
3004
3005 void
3006 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3007 src_reg offset)
3008 {
3009 /* Set the surface read offset. */
3010 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3011
3012 /* Emit the instruction. Note that this maps to the normal SIMD8
3013 * untyped surface read message, but that's OK because unused
3014 * channels will be masked out.
3015 */
3016 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3017 brw_message_reg(0),
3018 src_reg(surf_index), src_reg(1));
3019 inst->mlen = 1;
3020 }
3021
3022 void
3023 vec4_visitor::emit_ndc_computation()
3024 {
3025 /* Get the position */
3026 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3027
3028 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3029 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3030 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3031
3032 current_annotation = "NDC";
3033 dst_reg ndc_w = ndc;
3034 ndc_w.writemask = WRITEMASK_W;
3035 src_reg pos_w = pos;
3036 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3037 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3038
3039 dst_reg ndc_xyz = ndc;
3040 ndc_xyz.writemask = WRITEMASK_XYZ;
3041
3042 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3043 }
3044
3045 void
3046 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3047 {
3048 if (devinfo->gen < 6 &&
3049 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3050 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3051 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3052 dst_reg header1_w = header1;
3053 header1_w.writemask = WRITEMASK_W;
3054
3055 emit(MOV(header1, 0u));
3056
3057 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3058 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3059
3060 current_annotation = "Point size";
3061 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3062 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
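/* The scale by 1 << 11 (8 * 256) and the 0x7ff << 8 mask place the point
 * width at bits 8..18 of the header, in what appears to be a fixed-point
 * format with three fractional bits.
 */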
3063 }
3064
3065 if (key->userclip_active) {
3066 current_annotation = "Clipping flags";
3067 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3068 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3069
3070 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3071 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3072 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3073
3074 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3075 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3076 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3077 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3078 }
3079
3080 /* i965 clipping workaround:
3081 * 1) Test for -ve rhw
3082 * 2) If set,
3083 * set ndc = (0,0,0,0)
3084 * set ucp[6] = 1
3085 *
3086 * Later, clipping will detect ucp[6] and ensure the primitive is
3087 * clipped against all fixed planes.
3088 */
3089 if (devinfo->has_negative_rhw_bug) {
3090 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3091 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3092 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3093 vec4_instruction *inst;
3094 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3095 inst->predicate = BRW_PREDICATE_NORMAL;
3096 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3097 inst->predicate = BRW_PREDICATE_NORMAL;
3098 }
3099
3100 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3101 } else if (devinfo->gen < 6) {
3102 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3103 } else {
3104 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3105 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3106 dst_reg reg_w = reg;
3107 reg_w.writemask = WRITEMASK_W;
3108 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3109 }
3110 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3111 dst_reg reg_y = reg;
3112 reg_y.writemask = WRITEMASK_Y;
3113 reg_y.type = BRW_REGISTER_TYPE_D;
3114 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3115 }
3116 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3117 dst_reg reg_z = reg;
3118 reg_z.writemask = WRITEMASK_Z;
3119 reg_z.type = BRW_REGISTER_TYPE_D;
3120 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3121 }
3122 }
3123 }
3124
3125 void
3126 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3127 {
3128 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3129 *
3130 * "If a linked set of shaders forming the vertex stage contains no
3131 * static write to gl_ClipVertex or gl_ClipDistance, but the
3132 * application has requested clipping against user clip planes through
3133 * the API, then the coordinate written to gl_Position is used for
3134 * comparison against the user clip planes."
3135 *
3136 * This function is only called if the shader didn't write to
3137 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3138 * if the user wrote to it; otherwise we use gl_Position.
3139 */
3140 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3141 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3142 clip_vertex = VARYING_SLOT_POS;
3143 }
3144
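/* Each call covers at most four planes: the caller passes offset 0 to fill
 * CLIP_DIST0 (planes 0-3) and offset 4 to fill CLIP_DIST1 (planes 4-7).
 */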
3145 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3146 ++i) {
3147 reg.writemask = 1 << i;
3148 emit(DP4(reg,
3149 src_reg(output_reg[clip_vertex]),
3150 src_reg(this->userplane[i + offset])));
3151 }
3152 }
3153
3154 vec4_instruction *
3155 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3156 {
3157 assert(varying < VARYING_SLOT_MAX);
3158 reg.type = output_reg[varying].type;
3159 current_annotation = output_reg_annotation[varying];
3160 /* Copy the register, saturating if necessary */
3161 return emit(MOV(reg, src_reg(output_reg[varying])));
3162 }
3163
3164 void
3165 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3166 {
3167 reg.type = BRW_REGISTER_TYPE_F;
3168
3169 switch (varying) {
3170 case VARYING_SLOT_PSIZ:
3171 {
3172 /* PSIZ is always in slot 0, and is coupled with other flags. */
3173 current_annotation = "indices, point width, clip flags";
3174 emit_psiz_and_flags(reg);
3175 break;
3176 }
3177 case BRW_VARYING_SLOT_NDC:
3178 current_annotation = "NDC";
3179 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3180 break;
3181 case VARYING_SLOT_POS:
3182 current_annotation = "gl_Position";
3183 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3184 break;
3185 case VARYING_SLOT_EDGE:
3186 /* This is present when doing unfilled polygons. We're supposed to copy
3187 * the edge flag from the user-provided vertex array
3188 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3189 * of that attribute (starts as 1.0f). This is then used in clipping to
3190 * determine which edges should be drawn as wireframe.
3191 */
3192 current_annotation = "edge flag";
3193 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3194 glsl_type::float_type, WRITEMASK_XYZW))));
3195 break;
3196 case BRW_VARYING_SLOT_PAD:
3197 /* No need to write to this slot */
3198 break;
3199 case VARYING_SLOT_COL0:
3200 case VARYING_SLOT_COL1:
3201 case VARYING_SLOT_BFC0:
3202 case VARYING_SLOT_BFC1: {
3203 /* These built-in varyings are only supported in compatibility mode,
3204 * and we only support GS in core profile. So, this must be a vertex
3205 * shader.
3206 */
3207 assert(stage == MESA_SHADER_VERTEX);
3208 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3209 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3210 inst->saturate = true;
3211 break;
3212 }
3213
3214 default:
3215 emit_generic_urb_slot(reg, varying);
3216 break;
3217 }
3218 }
3219
3220 static int
3221 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3222 {
3223 if (devinfo->gen >= 6) {
3224 /* URB data written (does not include the message header reg) must
3225 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3226 * section 5.4.3.2.2: URB_INTERLEAVED.
3227 *
3228 * URB entries are allocated on a multiple of 1024 bits, so an
3229 * extra 128 bits written here to make the end align to 256 is
3230 * no problem.
3231 */
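/* mlen includes the message header, so an even mlen means an odd amount of
 * URB data; e.g. mlen == 4 (header plus three data registers) is bumped to
 * 5 so that the data length is a multiple of two registers.
 */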
3232 if ((mlen % 2) != 1)
3233 mlen++;
3234 }
3235
3236 return mlen;
3237 }
3238
3239
3240 /**
3241 * Generates the VUE payload plus the necessary URB write instructions to
3242 * output it.
3243 *
3244 * The VUE layout is documented in Volume 2a.
3245 */
3246 void
3247 vec4_visitor::emit_vertex()
3248 {
3249 /* MRF 0 is reserved for the debugger, so start with message header
3250 * in MRF 1.
3251 */
3252 int base_mrf = 1;
3253 int mrf = base_mrf;
3254 /* In the process of generating our URB write message contents, we
3255 * may need to unspill a register or load from an array. Those
3256 * reads would use MRFs 14-15.
3257 */
3258 int max_usable_mrf = 13;
3259
3260 /* The following assertion verifies that max_usable_mrf causes an
3261 * even-numbered amount of URB write data, which will meet gen6's
3262 * requirements for length alignment.
3263 */
3264 assert((max_usable_mrf - base_mrf) % 2 == 0);
3265
3266 /* First mrf is the g0-based message header containing URB handles and
3267 * such.
3268 */
3269 emit_urb_write_header(mrf++);
3270
3271 if (devinfo->gen < 6) {
3272 emit_ndc_computation();
3273 }
3274
3275 /* Lower legacy ff and ClipVertex clipping to clip distances */
3276 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3277 current_annotation = "user clip distances";
3278
3279 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3280 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3281
3282 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3283 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3284 }
3285
3286 /* We may need to split this up into several URB writes, so do them in a
3287 * loop.
3288 */
3289 int slot = 0;
3290 bool complete = false;
3291 do {
3292 /* URB offset is in URB row increments, and each of our MRFs is half of
3293 * one of those, since we're doing interleaved writes.
3294 */
3295 int offset = slot / 2;
3296
3297 mrf = base_mrf + 1;
3298 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3299 emit_urb_slot(dst_reg(MRF, mrf++),
3300 prog_data->vue_map.slot_to_varying[slot]);
3301
3302 /* If this was max_usable_mrf, we can't fit anything more into this
3303 * URB WRITE.
3304 */
3305 if (mrf > max_usable_mrf) {
3306 slot++;
3307 break;
3308 }
3309 }
3310
3311 complete = slot >= prog_data->vue_map.num_slots;
3312 current_annotation = "URB write";
3313 vec4_instruction *inst = emit_urb_write_opcode(complete);
3314 inst->base_mrf = base_mrf;
3315 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3316 inst->offset += offset;
3317 } while (!complete);
3318 }
3319
3320
3321 src_reg
3322 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3323 src_reg *reladdr, int reg_offset)
3324 {
3325 /* Because we store the values to scratch interleaved like our
3326 * vertex data, we need to scale the vec4 index by 2.
3327 */
3328 int message_header_scale = 2;
3329
3330 /* Pre-gen6, the message header uses byte offsets instead of vec4
3331 * (16-byte) offset units.
3332 */
3333 if (devinfo->gen < 6)
3334 message_header_scale *= 16;
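/* e.g. a reg_offset of 3 yields 3 * 2 == 6 on Gen6+, or 3 * 2 * 16 == 96
 * bytes on Gen4-5, where the header uses byte units.
 */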
3335
3336 if (reladdr) {
3337 src_reg index = src_reg(this, glsl_type::int_type);
3338
3339 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3340 src_reg(reg_offset)));
3341 emit_before(block, inst, MUL(dst_reg(index), index,
3342 src_reg(message_header_scale)));
3343
3344 return index;
3345 } else {
3346 return src_reg(reg_offset * message_header_scale);
3347 }
3348 }
3349
3350 src_reg
3351 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3352 src_reg *reladdr, int reg_offset)
3353 {
3354 if (reladdr) {
3355 src_reg index = src_reg(this, glsl_type::int_type);
3356
3357 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3358 src_reg(reg_offset)));
3359
3360 /* Pre-gen6, the message header uses byte offsets instead of vec4
3361 * (16-byte) offset units.
3362 */
3363 if (devinfo->gen < 6) {
3364 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3365 }
3366
3367 return index;
3368 } else if (devinfo->gen >= 8) {
3369 /* Store the offset in a GRF so we can send-from-GRF. */
3370 src_reg offset = src_reg(this, glsl_type::int_type);
3371 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3372 return offset;
3373 } else {
3374 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3375 return src_reg(reg_offset * message_header_scale);
3376 }
3377 }
3378
3379 /**
3380 * Emits an instruction before @inst to load the value named by @orig_src
3381 * from scratch space at @base_offset to @temp.
3382 *
3383 * @base_offset is measured in 32-byte units (the size of a register).
3384 */
3385 void
3386 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3387 dst_reg temp, src_reg orig_src,
3388 int base_offset)
3389 {
3390 int reg_offset = base_offset + orig_src.reg_offset;
3391 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3392 reg_offset);
3393
3394 emit_before(block, inst, SCRATCH_READ(temp, index));
3395 }
3396
3397 /**
3398 * Emits an instruction after @inst to store the value to be written
3399 * to @orig_dst to scratch space at @base_offset, from @temp.
3400 *
3401 * @base_offset is measured in 32-byte units (the size of a register).
3402 */
3403 void
3404 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3405 int base_offset)
3406 {
3407 int reg_offset = base_offset + inst->dst.reg_offset;
3408 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3409 reg_offset);
3410
3411 /* Create a temporary register to store *inst's result in.
3412 *
3413 * We have to be careful in MOVing from our temporary result register in
3414 * the scratch write. If we swizzle from channels of the temporary that
3415 * weren't initialized, it will confuse live interval analysis, which will
3416 * make spilling fail to make progress.
3417 */
3418 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3419 inst->dst.type),
3420 brw_swizzle_for_mask(inst->dst.writemask));
3421 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3422 inst->dst.writemask));
3423 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3424 write->predicate = inst->predicate;
3425 write->ir = inst->ir;
3426 write->annotation = inst->annotation;
3427 inst->insert_after(block, write);
3428
3429 inst->dst.file = temp.file;
3430 inst->dst.reg = temp.reg;
3431 inst->dst.reg_offset = temp.reg_offset;
3432 inst->dst.reladdr = NULL;
3433 }
3434
3435 /**
3436 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3437 * adds the scratch read(s) before \p inst. The function also checks for
3438 * recursive reladdr scratch accesses, issuing the corresponding scratch
3439 * loads and rewriting reladdr references accordingly.
3440 *
3441 * \return \p src if it did not require a scratch load, otherwise, the
3442 * register holding the result of the scratch load that the caller should
3443 * use to rewrite src.
3444 */
3445 src_reg
3446 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3447 vec4_instruction *inst, src_reg src)
3448 {
3449 /* Resolve recursive reladdr scratch access by calling ourselves
3450 * with src.reladdr
3451 */
3452 if (src.reladdr)
3453 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3454 *src.reladdr);
3455
3456 /* Now handle scratch access on src */
3457 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3458 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3459 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3460 src.reg = temp.reg;
3461 src.reg_offset = temp.reg_offset;
3462 src.reladdr = NULL;
3463 }
3464
3465 return src;
3466 }
3467
3468 /**
3469 * We can't generally support array access in GRF space, because a
3470 * single instruction's destination can only span 2 contiguous
3471 * registers. So, we send all GRF arrays that get variable index
3472 * access to scratch space.
3473 */
3474 void
3475 vec4_visitor::move_grf_array_access_to_scratch()
3476 {
3477 int scratch_loc[this->alloc.count];
3478 memset(scratch_loc, -1, sizeof(scratch_loc));
3479
3480 /* First, calculate the set of virtual GRFs that need to be punted
3481 * to scratch due to having any array access on them, and where in
3482 * scratch.
3483 */
3484 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3485 if (inst->dst.file == GRF && inst->dst.reladdr) {
3486 if (scratch_loc[inst->dst.reg] == -1) {
3487 scratch_loc[inst->dst.reg] = c->last_scratch;
3488 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3489 }
3490
3491 for (src_reg *iter = inst->dst.reladdr;
3492 iter->reladdr;
3493 iter = iter->reladdr) {
3494 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3495 scratch_loc[iter->reg] = c->last_scratch;
3496 c->last_scratch += this->alloc.sizes[iter->reg];
3497 }
3498 }
3499 }
3500
3501 for (int i = 0; i < 3; i++) {
3502 for (src_reg *iter = &inst->src[i];
3503 iter->reladdr;
3504 iter = iter->reladdr) {
3505 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3506 scratch_loc[iter->reg] = c->last_scratch;
3507 c->last_scratch += this->alloc.sizes[iter->reg];
3508 }
3509 }
3510 }
3511 }
3512
3513 /* Now, for anything that will be accessed through scratch, rewrite
3514 * it to load/store. Note that this is a _safe list walk, because
3515 * we may generate a new scratch_write instruction after the one
3516 * we're processing.
3517 */
3518 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3519 /* Set up the annotation tracking for new generated instructions. */
3520 base_ir = inst->ir;
3521 current_annotation = inst->annotation;
3522
3523 /* First handle scratch access on the dst. Notice we have to handle
3524 * the case where the dst's reladdr also points to scratch space.
3525 */
3526 if (inst->dst.reladdr)
3527 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3528 *inst->dst.reladdr);
3529
3530 /* Now that we have handled any (possibly recursive) reladdr scratch
3531 * accesses for dst we can safely do the scratch write for dst itself
3532 */
3533 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3534 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3535
3536 /* Now handle scratch access on any src. In this case, since inst->src[i]
3537 * already is a src_reg, we can just call emit_resolve_reladdr with
3538 * inst->src[i] and it will take care of handling scratch loads for
3539 * both src and src.reladdr (recursively).
3540 */
3541 for (int i = 0; i < 3; i++) {
3542 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3543 inst->src[i]);
3544 }
3545 }
3546 }
3547
3548 /**
3549 * Emits an instruction before @inst to load the value named by @orig_src
3550 * from the pull constant buffer (surface) at @base_offset to @temp.
3551 */
3552 void
3553 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3554 dst_reg temp, src_reg orig_src,
3555 int base_offset)
3556 {
3557 int reg_offset = base_offset + orig_src.reg_offset;
3558 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3559 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3560 reg_offset);
3561
3562 emit_pull_constant_load_reg(temp,
3563 index,
3564 offset,
3565 block, inst);
3566 }
3567
3568 /**
3569 * Implements array access of uniforms by inserting a
3570 * PULL_CONSTANT_LOAD instruction.
3571 *
3572 * Unlike temporary GRF array access (where we don't support it due to
3573 * the difficulty of doing relative addressing on instruction
3574 * destinations), we could potentially do array access of uniforms
3575 * that were loaded in GRF space as push constants. In real-world
3576 * usage we've seen, though, the arrays being used are always larger
3577 * than we could load as push constants, so just always move all
3578 * uniform array access out to a pull constant buffer.
3579 */
3580 void
3581 vec4_visitor::move_uniform_array_access_to_pull_constants()
3582 {
3583 int pull_constant_loc[this->uniforms];
3584 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3585 bool nested_reladdr;
3586
3587 /* Walk through and find array access of uniforms. Put a copy of that
3588 * uniform in the pull constant buffer.
3589 *
3590 * Note that we don't move constant-indexed accesses to arrays. No
3591 * testing has been done of the performance impact of this choice.
3592 */
3593 do {
3594 nested_reladdr = false;
3595
3596 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3597 for (int i = 0; i < 3; i++) {
3598 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3599 continue;
3600
3601 int uniform = inst->src[i].reg;
3602
3603 if (inst->src[i].reladdr->reladdr)
3604 nested_reladdr = true; /* will need another pass */
3605
3606 /* If this array isn't already present in the pull constant buffer,
3607 * add it.
3608 */
3609 if (pull_constant_loc[uniform] == -1) {
3610 const gl_constant_value **values =
3611 &stage_prog_data->param[uniform * 4];
3612
3613 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3614
3615 assert(uniform < uniform_array_size);
3616 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3617 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3618 = values[j];
3619 }
3620 }
3621
3622 /* Set up the annotation tracking for new generated instructions. */
3623 base_ir = inst->ir;
3624 current_annotation = inst->annotation;
3625
3626 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3627
3628 emit_pull_constant_load(block, inst, temp, inst->src[i],
3629 pull_constant_loc[uniform]);
3630
3631 inst->src[i].file = temp.file;
3632 inst->src[i].reg = temp.reg;
3633 inst->src[i].reg_offset = temp.reg_offset;
3634 inst->src[i].reladdr = NULL;
3635 }
3636 }
3637 } while (nested_reladdr);
3638
3639 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3640 * no need to track them as larger-than-vec4 objects. This will be
3641 * relied on in cutting out unused uniform vectors from push
3642 * constants.
3643 */
3644 split_uniform_registers();
3645 }
3646
3647 void
3648 vec4_visitor::resolve_ud_negate(src_reg *reg)
3649 {
3650 if (reg->type != BRW_REGISTER_TYPE_UD ||
3651 !reg->negate)
3652 return;
3653
3654 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3655 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3656 *reg = temp;
3657 }
3658
3659 /**
3660 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3661 *
3662 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3663 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3664 */
3665 void
3666 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3667 {
3668 assert(devinfo->gen <= 5);
3669
3670 if (!rvalue->type->is_boolean())
3671 return;
3672
3673 src_reg and_result = src_reg(this, rvalue->type);
3674 src_reg neg_result = src_reg(this, rvalue->type);
3675 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3676 emit(MOV(dst_reg(neg_result), negate(and_result)));
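/* ANDing with 1 isolates the defined low bit; the negated MOV then turns a
 * 1 into -1 (all ones), producing the 0 / ~0 boolean the rest of the
 * backend expects.
 */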
3677 *reg = neg_result;
3678 }
3679
3680 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3681 void *log_data,
3682 struct brw_vec4_compile *c,
3683 struct gl_program *prog,
3684 const struct brw_vue_prog_key *key,
3685 struct brw_vue_prog_data *prog_data,
3686 struct gl_shader_program *shader_prog,
3687 gl_shader_stage stage,
3688 void *mem_ctx,
3689 bool no_spills,
3690 int shader_time_index)
3691 : backend_shader(compiler, log_data, mem_ctx,
3692 shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 shader_time_index(shader_time_index)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */