i965: add support for ARB_shader_subroutine
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
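   /* Editor's sketch (pseudo-assembly, not from the upstream file): a MAD
    * that wants to read a vec4 uniform, e.g.
    *
    *    mad dst, u2, g5, g6
    *
    * is therefore emitted as a replicating MOV (VEC4_OPCODE_UNPACK_UNIFORM)
    * into a temporary GRF, followed by the three-source instruction reading
    * that temporary instead of the uniform.
    */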
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
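   /* Editor's illustration: on gen6 a MATH whose operand is, say, the
    * swizzled uniform u1.x is therefore emitted as
    *
    *    mov tmp, u1.x
    *    math dst, tmp
    *
    * whereas on gen7 only immediate operands need the extra MOV.
    */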
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
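   /* For reference, the GLSL packHalf2x16 semantics implemented below put
    * src0.x in the low word and src0.y in the high word of each channel:
    *
    *    dst = (f32to16(src0.y) << 16) | f32to16(src0.x)
    *
    * e.g. packing (1.0, -2.0) yields 0xc0003c00, since 1.0 is 0x3c00 and
    * -2.0 is 0xc000 in binary16. (Editor's worked example.)
    */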
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
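   /* Editor's worked example of the GLSL unpackHalf2x16 semantics
    * implemented below: x comes from the low word and y from the high word,
    *
    *    dst.x = f16to32(src0 & 0xffff)
    *    dst.y = f16to32(src0 >> 16)
    *
    * so unpacking 0xc0003c00 yields (1.0, -2.0).
    */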
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
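   /* Editor's note: the four bytes of the vector-float (VF) immediate below
    * decode, in the 8-bit restricted-float format, to (0.0, 8.0, 16.0, 24.0):
    *
    *    0x00 -> 0.0,  0x60 -> 8.0,  0x70 -> 16.0,  0x78 -> 24.0
    *
    * and the type-converting MOV into a uvec4 turns them into the integer
    * shift counts <0, 8, 16, 24>.
    */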
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
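   /* Editor's note: the rest of this function matches the GLSL definition
    * of unpackSnorm4x8: each byte is sign-extended (type B), scaled by
    * 1/127, and clamped to [-1, 1] by the min/max pair at the end, i.e.
    * clamp(b / 127.0, -1.0, +1.0).
    */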
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
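         /* Editor's illustration of the resulting slot counts: a float, a
          * vec2 and a vec4 each take one vec4 slot; a mat3 takes three
          * (one per column); a float[4] array takes four.
          */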
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SUBROUTINE:
607 return 1;
608
609 case GLSL_TYPE_SAMPLER:
610 /* Samplers take up no register space, since they're baked in at
611 * link time.
612 */
613 return 0;
614 case GLSL_TYPE_ATOMIC_UINT:
615 return 0;
616 case GLSL_TYPE_IMAGE:
617 case GLSL_TYPE_VOID:
618 case GLSL_TYPE_DOUBLE:
619 case GLSL_TYPE_ERROR:
620 case GLSL_TYPE_INTERFACE:
621 unreachable("not reached");
622 }
623
624 return 0;
625 }
626
627 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
628 {
629 init();
630
631 this->file = GRF;
632 this->reg = v->alloc.allocate(type_size(type));
633
634 if (type->is_array() || type->is_record()) {
635 this->swizzle = BRW_SWIZZLE_NOOP;
636 } else {
637 this->swizzle = brw_swizzle_for_size(type->vector_elements);
638 }
639
640 this->type = brw_type_for_base_type(type);
641 }
642
643 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
644 {
645 assert(size > 0);
646
647 init();
648
649 this->file = GRF;
650 this->reg = v->alloc.allocate(type_size(type) * size);
651
652 this->swizzle = BRW_SWIZZLE_NOOP;
653
654 this->type = brw_type_for_base_type(type);
655 }
656
657 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
658 {
659 init();
660
661 this->file = GRF;
662 this->reg = v->alloc.allocate(type_size(type));
663
664 if (type->is_array() || type->is_record()) {
665 this->writemask = WRITEMASK_XYZW;
666 } else {
667 this->writemask = (1 << type->vector_elements) - 1;
668 }
669
670 this->type = brw_type_for_base_type(type);
671 }
672
673 /* Our support for uniforms is piggy-backed on the struct
674 * gl_program, because that's where the values actually
675 * get stored, rather than in some global gl_shader_program uniform
676 * store.
677 */
678 void
679 vec4_visitor::setup_uniform_values(ir_variable *ir)
680 {
681 int namelen = strlen(ir->name);
682
683 /* The data for our (non-builtin) uniforms is stored in a series of
684 * gl_uniform_driver_storage structs for each subcomponent that
685 * glGetUniformLocation() could name. We know it's been set up in the same
686 * order we'd walk the type, so walk the list of storage and find anything
687 * with our name, or the prefix of a component that starts with our name.
688 */
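   /* Editor's note: an array of matrices shows up as a single storage entry
    * whose array_elements and matrix_columns multiply to give the number of
    * vectors uploaded below, e.g. "uniform mat2 m[3]" yields 3 * 2 = 6 vec2s,
    * each padded out to a full vec4 slot with zeros.
    */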
689 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
690 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
691
692 if (storage->builtin)
693 continue;
694
695 if (strncmp(ir->name, storage->name, namelen) != 0 ||
696 (storage->name[namelen] != 0 &&
697 storage->name[namelen] != '.' &&
698 storage->name[namelen] != '[')) {
699 continue;
700 }
701
702 gl_constant_value *components = storage->storage;
703 unsigned vector_count = (MAX2(storage->array_elements, 1) *
704 storage->type->matrix_columns);
705
706 for (unsigned s = 0; s < vector_count; s++) {
707 assert(uniforms < uniform_array_size);
708 uniform_vector_size[uniforms] = storage->type->vector_elements;
709
710 int i;
711 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
712 stage_prog_data->param[uniforms * 4 + i] = components;
713 components++;
714 }
715 for (; i < 4; i++) {
716 static gl_constant_value zero = { 0.0 };
717 stage_prog_data->param[uniforms * 4 + i] = &zero;
718 }
719
720 uniforms++;
721 }
722 }
723 }
724
725 void
726 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
727 {
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been set up by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763
764 for (unsigned j = 0; j < 4; j++)
765 stage_prog_data->param[this->uniforms * 4 + j] =
766 &values[GET_SWZ(slots[i].swizzle, j)];
767
768 this->uniform_vector_size[this->uniforms] =
769 (ir->type->is_scalar() || ir->type->is_vector() ||
770 ir->type->is_matrix() ? ir->type->vector_elements : 4);
771
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (devinfo->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (devinfo->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (devinfo->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (devinfo->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (devinfo->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (devinfo->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (devinfo->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul;
1162 ir_expression *mul;
1163 bool mul_negate, mul_abs;
1164
1165 for (int i = 0; i < 2; i++) {
1166 mul_negate = false;
1167 mul_abs = false;
1168
1169 mul = ir->operands[i]->as_expression();
1170 nonmul = ir->operands[1 - i];
1171
1172 if (mul && mul->operation == ir_unop_abs) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_abs = true;
1175 } else if (mul && mul->operation == ir_unop_neg) {
1176 mul = mul->operands[0]->as_expression();
1177 mul_negate = true;
1178 }
1179
1180 if (mul && mul->operation == ir_binop_mul)
1181 break;
1182 }
1183
1184 if (!mul || mul->operation != ir_binop_mul)
1185 return false;
1186
1187 nonmul->accept(this);
1188 src_reg src0 = fix_3src_operand(this->result);
1189
1190 mul->operands[0]->accept(this);
1191 src_reg src1 = fix_3src_operand(this->result);
1192 src1.negate ^= mul_negate;
1193 src1.abs = mul_abs;
1194 if (mul_abs)
1195 src1.negate = false;
1196
1197 mul->operands[1]->accept(this);
1198 src_reg src2 = fix_3src_operand(this->result);
1199 src2.abs = mul_abs;
1200 if (mul_abs)
1201 src2.negate = false;
1202
1203 this->result = src_reg(this, ir->type);
1204 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1205
1206 return true;
1207 }
1208
1209 bool
1210 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1211 {
1212 /* This optimization relies on CMP setting the destination to 0 when
1213 * false. Early hardware only sets the least significant bit, and
1214 * leaves the other bits undefined. So we can't use it.
1215 */
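   /* Editor's sketch of what this emits for b2f(a < b):
    *
    *    cmp.l.f0  tmp, a, b          (tmp := ~0u where true, 0 elsewhere)
    *    (-f0.0)   sel tmp, tmp, 1.0F
    *
    * Channels where the comparison held take 1.0f from the SEL's second
    * source; the remaining channels keep the 0 the CMP already wrote.
    */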
1216 if (devinfo->gen < 6)
1217 return false;
1218
1219 ir_expression *const cmp = ir->operands[0]->as_expression();
1220
1221 if (cmp == NULL)
1222 return false;
1223
1224 switch (cmp->operation) {
1225 case ir_binop_less:
1226 case ir_binop_greater:
1227 case ir_binop_lequal:
1228 case ir_binop_gequal:
1229 case ir_binop_equal:
1230 case ir_binop_nequal:
1231 break;
1232
1233 default:
1234 return false;
1235 }
1236
1237 cmp->operands[0]->accept(this);
1238 const src_reg cmp_src0 = this->result;
1239
1240 cmp->operands[1]->accept(this);
1241 const src_reg cmp_src1 = this->result;
1242
1243 this->result = src_reg(this, ir->type);
1244
1245 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1246 brw_conditional_for_comparison(cmp->operation)));
1247
1248 /* If the comparison is false, this->result will just happen to be zero.
1249 */
1250 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1251 this->result, src_reg(1.0f));
1252 inst->predicate = BRW_PREDICATE_NORMAL;
1253 inst->predicate_inverse = true;
1254
1255 return true;
1256 }
1257
1258 void
1259 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1260 src_reg src0, src_reg src1)
1261 {
1262 vec4_instruction *inst;
1263
1264 if (devinfo->gen >= 6) {
1265 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1266 inst->conditional_mod = conditionalmod;
1267 } else {
1268 emit(CMP(dst, src0, src1, conditionalmod));
1269
1270 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1271 inst->predicate = BRW_PREDICATE_NORMAL;
1272 }
1273 }
1274
1275 void
1276 vec4_visitor::emit_lrp(const dst_reg &dst,
1277 const src_reg &x, const src_reg &y, const src_reg &a)
1278 {
1279 if (devinfo->gen >= 6) {
1280 /* Note that the instruction's argument order is reversed from GLSL
1281 * and the IR.
1282 */
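      /* i.e. LRP dst, a, y, x computes a * y + (1 - a) * x, which is exactly
       * GLSL's mix(x, y, a). (Editor's note.)
       */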
1283 emit(LRP(dst,
1284 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1285 } else {
1286 /* Earlier generations don't support three source operations, so we
1287 * need to emit x*(1-a) + y*a.
1288 */
1289 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1290 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1291 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1292 y_times_a.writemask = dst.writemask;
1293 one_minus_a.writemask = dst.writemask;
1294 x_times_one_minus_a.writemask = dst.writemask;
1295
1296 emit(MUL(y_times_a, y, a));
1297 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1298 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1299 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1300 }
1301 }
1302
1303 /**
1304 * Emits the instructions needed to perform a pull constant load. before_block
1305 * and before_inst can be NULL, in which case the instructions will be appended
1306 * to the end of the instruction list.
1307 */
1308 void
1309 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1310 src_reg surf_index,
1311 src_reg offset_reg,
1312 bblock_t *before_block,
1313 vec4_instruction *before_inst)
1314 {
1315 assert((before_inst == NULL && before_block == NULL) ||
1316 (before_inst && before_block));
1317
1318 vec4_instruction *pull;
1319
1320 if (devinfo->gen >= 9) {
1321 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1322 src_reg header(this, glsl_type::uvec4_type, 2);
1323
1324 pull = new(mem_ctx)
1325 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1326 dst_reg(header));
1327
1328 if (before_inst)
1329 emit_before(before_block, before_inst, pull);
1330 else
1331 emit(pull);
1332
1333 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1334 offset_reg.type);
1335 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1336
1337 if (before_inst)
1338 emit_before(before_block, before_inst, pull);
1339 else
1340 emit(pull);
1341
1342 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1343 dst,
1344 surf_index,
1345 header);
1346 pull->mlen = 2;
1347 pull->header_size = 1;
1348 } else if (devinfo->gen >= 7) {
1349 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1350
1351 grf_offset.type = offset_reg.type;
1352
1353 pull = MOV(grf_offset, offset_reg);
1354
1355 if (before_inst)
1356 emit_before(before_block, before_inst, pull);
1357 else
1358 emit(pull);
1359
1360 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1361 dst,
1362 surf_index,
1363 src_reg(grf_offset));
1364 pull->mlen = 1;
1365 } else {
1366 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1367 dst,
1368 surf_index,
1369 offset_reg);
1370 pull->base_mrf = 14;
1371 pull->mlen = 1;
1372 }
1373
1374 if (before_inst)
1375 emit_before(before_block, before_inst, pull);
1376 else
1377 emit(pull);
1378 }
1379
1380 src_reg
1381 vec4_visitor::emit_uniformize(const src_reg &src)
1382 {
1383 const src_reg chan_index(this, glsl_type::uint_type);
1384 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1385 src.type);
1386
1387 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1388 ->force_writemask_all = true;
1389 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1390 ->force_writemask_all = true;
1391
1392 return src_reg(dst);
1393 }
1394
1395 void
1396 vec4_visitor::visit(ir_expression *ir)
1397 {
1398 unsigned int operand;
1399 src_reg op[ARRAY_SIZE(ir->operands)];
1400 vec4_instruction *inst;
1401
1402 if (ir->operation == ir_binop_add) {
1403 if (try_emit_mad(ir))
1404 return;
1405 }
1406
1407 if (ir->operation == ir_unop_b2f) {
1408 if (try_emit_b2f_of_compare(ir))
1409 return;
1410 }
1411
1412 /* Storage for our result. Ideally for an assignment we'd be using
1413 * the actual storage for the result here, instead.
1414 */
1415 dst_reg result_dst(this, ir->type);
1416 src_reg result_src(result_dst);
1417
1418 if (ir->operation == ir_triop_csel) {
1419 ir->operands[1]->accept(this);
1420 op[1] = this->result;
1421 ir->operands[2]->accept(this);
1422 op[2] = this->result;
1423
1424 enum brw_predicate predicate;
1425 emit_bool_to_cond_code(ir->operands[0], &predicate);
1426 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1427 inst->predicate = predicate;
1428 this->result = result_src;
1429 return;
1430 }
1431
1432 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1433 this->result.file = BAD_FILE;
1434 ir->operands[operand]->accept(this);
1435 if (this->result.file == BAD_FILE) {
1436 fprintf(stderr, "Failed to get tree for expression operand:\n");
1437 ir->operands[operand]->fprint(stderr);
1438 exit(1);
1439 }
1440 op[operand] = this->result;
1441
1442 /* Matrix expression operands should have been broken down to vector
1443 * operations already.
1444 */
1445 assert(!ir->operands[operand]->type->is_matrix());
1446 }
1447
1448 /* If nothing special happens, this is the result. */
1449 this->result = result_src;
1450
1451 switch (ir->operation) {
1452 case ir_unop_logic_not:
1453 emit(NOT(result_dst, op[0]));
1454 break;
1455 case ir_unop_neg:
1456 op[0].negate = !op[0].negate;
1457 emit(MOV(result_dst, op[0]));
1458 break;
1459 case ir_unop_abs:
1460 op[0].abs = true;
1461 op[0].negate = false;
1462 emit(MOV(result_dst, op[0]));
1463 break;
1464
1465 case ir_unop_sign:
1466 if (ir->type->is_float()) {
1467 /* AND(val, 0x80000000) gives the sign bit.
1468 *
1469 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1470 * zero.
1471 */
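         /* Editor's worked example: sign(-3.5f). -3.5f is 0xc0600000; the
          * AND with 0x80000000 leaves 0x80000000, and the predicated OR with
          * 0x3f800000 (1.0f) produces 0xbf800000 == -1.0f. For an input of
          * 0.0 the predicate is false, so the result stays 0.0.
          */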
1472 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1473
1474 op[0].type = BRW_REGISTER_TYPE_UD;
1475 result_dst.type = BRW_REGISTER_TYPE_UD;
1476 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1477
1478 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1479 inst->predicate = BRW_PREDICATE_NORMAL;
1480
1481 this->result.type = BRW_REGISTER_TYPE_F;
1482 } else {
1483 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1484 * -> non-negative val generates 0x00000000.
1485 * Predicated OR sets 1 if val is positive.
1486 */
1487 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1488
1489 emit(ASR(result_dst, op[0], src_reg(31)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493 }
1494 break;
1495
1496 case ir_unop_rcp:
1497 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1498 break;
1499
1500 case ir_unop_exp2:
1501 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1502 break;
1503 case ir_unop_log2:
1504 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1505 break;
1506 case ir_unop_exp:
1507 case ir_unop_log:
1508 unreachable("not reached: should be handled by ir_explog_to_explog2");
1509 case ir_unop_sin:
1510 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1511 break;
1512 case ir_unop_cos:
1513 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1514 break;
1515
1516 case ir_unop_dFdx:
1517 case ir_unop_dFdx_coarse:
1518 case ir_unop_dFdx_fine:
1519 case ir_unop_dFdy:
1520 case ir_unop_dFdy_coarse:
1521 case ir_unop_dFdy_fine:
1522 unreachable("derivatives not valid in vertex shader");
1523
1524 case ir_unop_bitfield_reverse:
1525 emit(BFREV(result_dst, op[0]));
1526 break;
1527 case ir_unop_bit_count:
1528 emit(CBIT(result_dst, op[0]));
1529 break;
1530 case ir_unop_find_msb: {
1531 src_reg temp = src_reg(this, glsl_type::uint_type);
1532
1533 inst = emit(FBH(dst_reg(temp), op[0]));
1534 inst->dst.writemask = WRITEMASK_XYZW;
1535
1536 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1537 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1538 * subtract the result from 31 to convert the MSB count into an LSB count.
1539 */
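      /* Editor's worked example: findMSB(0x00000100u). FBH returns 23 (the
       * set bit is 23 positions below bit 31), and 31 - 23 = 8 is the
       * LSB-relative answer GLSL expects.
       */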
1540
1541 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1542 temp.swizzle = BRW_SWIZZLE_NOOP;
1543 emit(MOV(result_dst, temp));
1544
1545 src_reg src_tmp = src_reg(result_dst);
1546 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1547
1548 src_tmp.negate = true;
1549 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1550 inst->predicate = BRW_PREDICATE_NORMAL;
1551 break;
1552 }
1553 case ir_unop_find_lsb:
1554 emit(FBL(result_dst, op[0]));
1555 break;
1556 case ir_unop_saturate:
1557 inst = emit(MOV(result_dst, op[0]));
1558 inst->saturate = true;
1559 break;
1560
1561 case ir_unop_noise:
1562 unreachable("not reached: should be handled by lower_noise");
1563
1564 case ir_unop_subroutine_to_int:
1565 emit(MOV(result_dst, op[0]));
1566 break;
1567
1568 case ir_binop_add:
1569 emit(ADD(result_dst, op[0], op[1]));
1570 break;
1571 case ir_binop_sub:
1572 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1573
1574 case ir_binop_mul:
1575 if (devinfo->gen < 8 && ir->type->is_integer()) {
1576 /* For integer multiplication, the MUL uses the low 16 bits of one of
1577 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1578 * adds in the contribution of the upper 16 bits of that
1579 * operand. If we can determine that one of the args is in the low
1580 * 16 bits, though, we can just emit a single MUL.
1581 */
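         /* Editor's sketch: the general 32x32 path below is roughly
          *
          *    mul  acc0, a, b     (low partial product into the accumulator)
          *    mach null, a, b     (folds in the upper 16-bit contribution)
          *    mov  dst,  acc0     (read back the low 32 bits)
          *
          * while a single MUL suffices when one operand is known to fit in
          * 16 bits, with the operand order chosen per generation.
          */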
1582 if (ir->operands[0]->is_uint16_constant()) {
1583 if (devinfo->gen < 7)
1584 emit(MUL(result_dst, op[0], op[1]));
1585 else
1586 emit(MUL(result_dst, op[1], op[0]));
1587 } else if (ir->operands[1]->is_uint16_constant()) {
1588 if (devinfo->gen < 7)
1589 emit(MUL(result_dst, op[1], op[0]));
1590 else
1591 emit(MUL(result_dst, op[0], op[1]));
1592 } else {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(dst_null_d(), op[0], op[1]));
1597 emit(MOV(result_dst, src_reg(acc)));
1598 }
1599 } else {
1600 emit(MUL(result_dst, op[0], op[1]));
1601 }
1602 break;
1603 case ir_binop_imul_high: {
1604 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1605
1606 emit(MUL(acc, op[0], op[1]));
1607 emit(MACH(result_dst, op[0], op[1]));
1608 break;
1609 }
1610 case ir_binop_div:
1611 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1612 assert(ir->type->is_integer());
1613 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1614 break;
1615
1616 case ir_binop_carry:
1617 unreachable("Should have been lowered by carry_to_arith().");
1618
1619 case ir_binop_borrow:
1620 unreachable("Should have been lowered by borrow_to_arith().");
1621
1622 case ir_binop_mod:
1623 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1624 assert(ir->type->is_integer());
1625 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1626 break;
1627
1628 case ir_binop_less:
1629 case ir_binop_greater:
1630 case ir_binop_lequal:
1631 case ir_binop_gequal:
1632 case ir_binop_equal:
1633 case ir_binop_nequal: {
1634 if (devinfo->gen <= 5) {
1635 resolve_bool_comparison(ir->operands[0], &op[0]);
1636 resolve_bool_comparison(ir->operands[1], &op[1]);
1637 }
1638 emit(CMP(result_dst, op[0], op[1],
1639 brw_conditional_for_comparison(ir->operation)));
1640 break;
1641 }
1642
1643 case ir_binop_all_equal:
1644 if (devinfo->gen <= 5) {
1645 resolve_bool_comparison(ir->operands[0], &op[0]);
1646 resolve_bool_comparison(ir->operands[1], &op[1]);
1647 }
1648
1649 /* "==" operator producing a scalar boolean. */
1650 if (ir->operands[0]->type->is_vector() ||
1651 ir->operands[1]->type->is_vector()) {
1652 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1653 emit(MOV(result_dst, src_reg(0)));
1654 inst = emit(MOV(result_dst, src_reg(~0)));
1655 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1656 } else {
1657 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1658 }
1659 break;
1660 case ir_binop_any_nequal:
1661 if (devinfo->gen <= 5) {
1662 resolve_bool_comparison(ir->operands[0], &op[0]);
1663 resolve_bool_comparison(ir->operands[1], &op[1]);
1664 }
1665
1666 /* "!=" operator producing a scalar boolean. */
1667 if (ir->operands[0]->type->is_vector() ||
1668 ir->operands[1]->type->is_vector()) {
1669 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1670
1671 emit(MOV(result_dst, src_reg(0)));
1672 inst = emit(MOV(result_dst, src_reg(~0)));
1673 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1674 } else {
1675 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1676 }
1677 break;
1678
1679 case ir_unop_any:
1680 if (devinfo->gen <= 5) {
1681 resolve_bool_comparison(ir->operands[0], &op[0]);
1682 }
1683 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1684 emit(MOV(result_dst, src_reg(0)));
1685
1686 inst = emit(MOV(result_dst, src_reg(~0)));
1687 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1688 break;
1689
1690 case ir_binop_logic_xor:
1691 emit(XOR(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_logic_or:
1695 emit(OR(result_dst, op[0], op[1]));
1696 break;
1697
1698 case ir_binop_logic_and:
1699 emit(AND(result_dst, op[0], op[1]));
1700 break;
1701
1702 case ir_binop_dot:
1703 assert(ir->operands[0]->type->is_vector());
1704 assert(ir->operands[0]->type == ir->operands[1]->type);
1705 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1706 break;
1707
1708 case ir_unop_sqrt:
1709 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1710 break;
1711 case ir_unop_rsq:
1712 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1713 break;
1714
1715 case ir_unop_bitcast_i2f:
1716 case ir_unop_bitcast_u2f:
1717 this->result = op[0];
1718 this->result.type = BRW_REGISTER_TYPE_F;
1719 break;
1720
1721 case ir_unop_bitcast_f2i:
1722 this->result = op[0];
1723 this->result.type = BRW_REGISTER_TYPE_D;
1724 break;
1725
1726 case ir_unop_bitcast_f2u:
1727 this->result = op[0];
1728 this->result.type = BRW_REGISTER_TYPE_UD;
1729 break;
1730
1731 case ir_unop_i2f:
1732 case ir_unop_i2u:
1733 case ir_unop_u2i:
1734 case ir_unop_u2f:
1735 case ir_unop_f2i:
1736 case ir_unop_f2u:
1737 emit(MOV(result_dst, op[0]));
1738 break;
1739 case ir_unop_b2i:
1740 case ir_unop_b2f:
1741 if (devinfo->gen <= 5) {
1742 resolve_bool_comparison(ir->operands[0], &op[0]);
1743 }
1744 emit(MOV(result_dst, negate(op[0])));
1745 break;
1746 case ir_unop_f2b:
1747 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1748 break;
1749 case ir_unop_i2b:
1750 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1751 break;
1752
1753 case ir_unop_trunc:
1754 emit(RNDZ(result_dst, op[0]));
1755 break;
1756 case ir_unop_ceil: {
1757 src_reg tmp = src_reg(this, ir->type);
1758 op[0].negate = !op[0].negate;
1759 emit(RNDD(dst_reg(tmp), op[0]));
1760 tmp.negate = true;
1761 emit(MOV(result_dst, tmp));
1762 }
1763 break;
1764 case ir_unop_floor:
1765 inst = emit(RNDD(result_dst, op[0]));
1766 break;
1767 case ir_unop_fract:
1768 inst = emit(FRC(result_dst, op[0]));
1769 break;
1770 case ir_unop_round_even:
1771 emit(RNDE(result_dst, op[0]));
1772 break;
1773
1774 case ir_binop_min:
1775 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1776 break;
1777 case ir_binop_max:
1778 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1779 break;
1780
1781 case ir_binop_pow:
1782 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1783 break;
1784
1785 case ir_unop_bit_not:
1786 inst = emit(NOT(result_dst, op[0]));
1787 break;
1788 case ir_binop_bit_and:
1789 inst = emit(AND(result_dst, op[0], op[1]));
1790 break;
1791 case ir_binop_bit_xor:
1792 inst = emit(XOR(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_or:
1795 inst = emit(OR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_lshift:
1799 inst = emit(SHL(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_rshift:
1803 if (ir->type->base_type == GLSL_TYPE_INT)
1804 inst = emit(ASR(result_dst, op[0], op[1]));
1805 else
1806 inst = emit(SHR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_bfm:
1810 emit(BFI1(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_ubo_load: {
1814 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1815 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1816 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1817 src_reg offset;
1818
1819 /* Now, load the vector from that offset. */
1820 assert(ir->type->is_vector() || ir->type->is_scalar());
1821
1822 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1823 packed_consts.type = result.type;
1824 src_reg surf_index;
1825
1826 if (const_uniform_block) {
1827 /* The block index is a constant, so just emit the binding table entry
1828 * as an immediate.
1829 */
1830 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1831 const_uniform_block->value.u[0]);
1832 } else {
1833 /* The block index is not a constant. Evaluate the index expression
1834 * per-channel and add the base UBO index; we have to select a value
1835 * from any live channel.
1836 */
1837 surf_index = src_reg(this, glsl_type::uint_type);
1838 emit(ADD(dst_reg(surf_index), op[0],
1839 src_reg(prog_data->base.binding_table.ubo_start)));
1840 surf_index = emit_uniformize(surf_index);
1841
1842 /* Assume this may touch any UBO. It would be nice to provide
1843 * a tighter bound, but the array information is already lowered away.
1844 */
1845 brw_mark_surface_used(&prog_data->base,
1846 prog_data->base.binding_table.ubo_start +
1847 shader_prog->NumUniformBlocks - 1);
1848 }
1849
1850 if (const_offset_ir) {
1851 if (devinfo->gen >= 8) {
1852 /* Store the offset in a GRF so we can send-from-GRF. */
1853 offset = src_reg(this, glsl_type::int_type);
1854 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1855 } else {
1856 /* Immediates are fine on older generations since they'll be moved
1857 * to a (potentially fake) MRF at the generator level.
1858 */
1859 offset = src_reg(const_offset / 16);
1860 }
1861 } else {
1862 offset = src_reg(this, glsl_type::uint_type);
1863 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1864 }
1865
1866 emit_pull_constant_load_reg(dst_reg(packed_consts),
1867 surf_index,
1868 offset,
1869 NULL, NULL /* before_block/inst */);
1870
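      /* Editor's note on the swizzle math below: the pull load returns a
       * whole 16-byte block, so the byte offset within that block selects
       * the starting component; e.g. const_offset == 20 reads the second
       * vec4 of the UBO and 20 % 16 / 4 == 1 starts the swizzle at y.
       */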
1871 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1872 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1873 const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4);
1876
1877 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1878 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1879 emit(CMP(result_dst, packed_consts, src_reg(0u),
1880 BRW_CONDITIONAL_NZ));
1881 } else {
1882 emit(MOV(result_dst, packed_consts));
1883 }
1884 break;
1885 }
1886
1887 case ir_binop_vector_extract:
1888 unreachable("should have been lowered by vec_index_to_cond_assign");
1889
1890 case ir_triop_fma:
1891 op[0] = fix_3src_operand(op[0]);
1892 op[1] = fix_3src_operand(op[1]);
1893 op[2] = fix_3src_operand(op[2]);
1894 /* Note that the instruction's argument order is reversed from GLSL
1895 * and the IR.
1896 */
1897 emit(MAD(result_dst, op[2], op[1], op[0]));
1898 break;
1899
1900 case ir_triop_lrp:
1901 emit_lrp(result_dst, op[0], op[1], op[2]);
1902 break;
1903
1904 case ir_triop_csel:
1905 unreachable("already handled above");
1906 break;
1907
1908 case ir_triop_bfi:
1909 op[0] = fix_3src_operand(op[0]);
1910 op[1] = fix_3src_operand(op[1]);
1911 op[2] = fix_3src_operand(op[2]);
1912 emit(BFI2(result_dst, op[0], op[1], op[2]));
1913 break;
1914
1915 case ir_triop_bitfield_extract:
1916 op[0] = fix_3src_operand(op[0]);
1917 op[1] = fix_3src_operand(op[1]);
1918 op[2] = fix_3src_operand(op[2]);
1919 /* Note that the instruction's argument order is reversed from GLSL
1920 * and the IR.
1921 */
1922 emit(BFE(result_dst, op[2], op[1], op[0]));
1923 break;
1924
1925 case ir_triop_vector_insert:
1926 unreachable("should have been lowered by lower_vector_insert");
1927
1928 case ir_quadop_bitfield_insert:
1929 unreachable("not reached: should be handled by "
1930 "bitfield_insert_to_bfm_bfi\n");
1931
1932 case ir_quadop_vector:
1933 unreachable("not reached: should be handled by lower_quadop_vector");
1934
1935 case ir_unop_pack_half_2x16:
1936 emit_pack_half_2x16(result_dst, op[0]);
1937 break;
1938 case ir_unop_unpack_half_2x16:
1939 emit_unpack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_unorm_4x8:
1942 emit_unpack_unorm_4x8(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_snorm_4x8:
1945 emit_unpack_snorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_pack_unorm_4x8:
1948 emit_pack_unorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_snorm_4x8:
1951 emit_pack_snorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_2x16:
1954 case ir_unop_pack_unorm_2x16:
1955 case ir_unop_unpack_snorm_2x16:
1956 case ir_unop_unpack_unorm_2x16:
1957 unreachable("not reached: should be handled by lower_packing_builtins");
1958 case ir_unop_unpack_half_2x16_split_x:
1959 case ir_unop_unpack_half_2x16_split_y:
1960 case ir_binop_pack_half_2x16_split:
1961 case ir_unop_interpolate_at_centroid:
1962 case ir_binop_interpolate_at_sample:
1963 case ir_binop_interpolate_at_offset:
1964 unreachable("not reached: should not occur in vertex shader");
1965 case ir_binop_ldexp:
1966 unreachable("not reached: should be handled by ldexp_to_arith()");
1967 case ir_unop_d2f:
1968 case ir_unop_f2d:
1969 case ir_unop_d2i:
1970 case ir_unop_i2d:
1971 case ir_unop_d2u:
1972 case ir_unop_u2d:
1973 case ir_unop_d2b:
1974 case ir_unop_pack_double_2x32:
1975 case ir_unop_unpack_double_2x32:
1976 case ir_unop_frexp_sig:
1977 case ir_unop_frexp_exp:
1978 unreachable("fp64 todo");
1979 }
1980 }
1981
1982
1983 void
1984 vec4_visitor::visit(ir_swizzle *ir)
1985 {
1986 /* Note that this is only swizzles in expressions, not those on the left
1987 * hand side of an assignment, which do write masking. See ir_assignment
1988 * for that.
1989 */
1990 const unsigned swz = brw_compose_swizzle(
1991 brw_swizzle_for_size(ir->type->vector_elements),
1992 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1993
1994 ir->val->accept(this);
1995 this->result = swizzle(this->result, swz);
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_variable *ir)
2000 {
2001 const struct glsl_type *type = ir->type;
2002 dst_reg *reg = variable_storage(ir->var);
2003
2004 if (!reg) {
2005 fail("Failed to find variable storage for %s\n", ir->var->name);
2006 this->result = src_reg(brw_null_reg());
2007 return;
2008 }
2009
2010 this->result = src_reg(*reg);
2011
2012 /* System values get their swizzle from the dst_reg writemask */
2013 if (ir->var->data.mode == ir_var_system_value)
2014 return;
2015
2016 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2017 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2018 }
2019
2020
2021 int
2022 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2023 {
2024 /* Under normal circumstances array elements are stored consecutively, so
2025 * the stride is equal to the size of the array element.
2026 */
2027 return type_size(ir->type);
2028 }
2029
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_array *ir)
2033 {
2034 ir_constant *constant_index;
2035 src_reg src;
2036 int array_stride = compute_array_stride(ir);
2037
2038 constant_index = ir->array_index->constant_expression_value();
2039
2040 ir->array->accept(this);
2041 src = this->result;
2042
2043 if (constant_index) {
2044 src.reg_offset += constant_index->value.i[0] * array_stride;
2045 } else {
2046 /* Variable index array dereference: take the register for the base
2047 * of the array and attach an index that offsets the Mesa register
2048 * index at run time.
2049 */
2050 ir->array_index->accept(this);
2051
2052 src_reg index_reg;
2053
2054 if (array_stride == 1) {
2055 index_reg = this->result;
2056 } else {
2057 index_reg = src_reg(this, glsl_type::int_type);
2058
2059 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2060 }
2061
2062 if (src.reladdr) {
2063 src_reg temp = src_reg(this, glsl_type::int_type);
2064
2065 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2066
2067 index_reg = temp;
2068 }
2069
2070 src.reladdr = ralloc(mem_ctx, src_reg);
2071 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2072 }
2073
2074 /* If the type is smaller than a vec4, replicate the last channel out. */
2075 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2076 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2077 else
2078 src.swizzle = BRW_SWIZZLE_NOOP;
2079 src.type = brw_type_for_base_type(ir->type);
2080
2081 this->result = src;
2082 }
2083
2084 void
2085 vec4_visitor::visit(ir_dereference_record *ir)
2086 {
2087 unsigned int i;
2088 const glsl_type *struct_type = ir->record->type;
2089 int offset = 0;
2090
2091 ir->record->accept(this);
2092
2093 for (i = 0; i < struct_type->length; i++) {
2094 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2095 break;
2096 offset += type_size(struct_type->fields.structure[i].type);
2097 }
2098
2099 /* If the type is smaller than a vec4, replicate the last channel out. */
2100 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2101 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2102 else
2103 this->result.swizzle = BRW_SWIZZLE_NOOP;
2104 this->result.type = brw_type_for_base_type(ir->type);
2105
2106 this->result.reg_offset += offset;
2107 }
2108
2109 /**
2110 * We want to be careful in assignment setup to hit the actual storage
2111 * instead of potentially using a temporary like we might with the
2112 * ir_dereference handler.
2113 */
2114 static dst_reg
2115 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2116 {
2117 /* The LHS must be a dereference. If the LHS is a variable indexed array
2118 * access of a vector, it must be separated into a series of conditional moves
2119 * before reaching this point (see ir_vec_index_to_cond_assign).
2120 */
2121 assert(ir->as_dereference());
2122 ir_dereference_array *deref_array = ir->as_dereference_array();
2123 if (deref_array) {
2124 assert(!deref_array->array->type->is_vector());
2125 }
2126
2127 /* Use the rvalue deref handler for the most part. We'll ignore
2128 * swizzles in it and write swizzles using writemask, though.
2129 */
2130 ir->accept(v);
2131 return dst_reg(v->result);
2132 }
2133
2134 void
2135 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2136 const struct glsl_type *type,
2137 enum brw_predicate predicate)
2138 {
2139 if (type->base_type == GLSL_TYPE_STRUCT) {
2140 for (unsigned int i = 0; i < type->length; i++) {
2141 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2142 }
2143 return;
2144 }
2145
2146 if (type->is_array()) {
2147 for (unsigned int i = 0; i < type->length; i++) {
2148 emit_block_move(dst, src, type->fields.array, predicate);
2149 }
2150 return;
2151 }
2152
2153 if (type->is_matrix()) {
2154 const struct glsl_type *vec_type;
2155
2156 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2157 type->vector_elements, 1);
2158
2159 for (int i = 0; i < type->matrix_columns; i++) {
2160 emit_block_move(dst, src, vec_type, predicate);
2161 }
2162 return;
2163 }
2164
2165 assert(type->is_scalar() || type->is_vector());
2166
2167 dst->type = brw_type_for_base_type(type);
2168 src->type = dst->type;
2169
2170 dst->writemask = (1 << type->vector_elements) - 1;
2171
2172 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2173
2174 vec4_instruction *inst = emit(MOV(*dst, *src));
2175 inst->predicate = predicate;
2176
2177 dst->reg_offset++;
2178 src->reg_offset++;
2179 }
2180
2181
2182 /* If the RHS processing resulted in an instruction generating a
2183 * temporary value, and it would be easy to rewrite the instruction to
2184 * generate its result right into the LHS instead, do so. This ends
2185 * up reliably removing instructions where it can be tricky to do so
2186 * later without real UD chain information.
2187 */
2188 bool
2189 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2190 dst_reg dst,
2191 src_reg src,
2192 vec4_instruction *pre_rhs_inst,
2193 vec4_instruction *last_rhs_inst)
2194 {
2195 /* This could be supported, but it would take more smarts. */
2196 if (ir->condition)
2197 return false;
2198
2199 if (pre_rhs_inst == last_rhs_inst)
2200 return false; /* No instructions generated to work with. */
2201
2202 /* Make sure the last instruction generated our source reg. */
2203 if (src.file != GRF ||
2204 src.file != last_rhs_inst->dst.file ||
2205 src.reg != last_rhs_inst->dst.reg ||
2206 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2207 src.reladdr ||
2208 src.abs ||
2209 src.negate ||
2210 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2211 return false;
2212
2213 /* Check that the last instruction fully initialized the channels
2214 * we want to use, in the order we want to use them. We could
2215 * potentially reswizzle the operands of many instructions so that
2216 * we could handle out of order channels, but don't yet.
2217 */
2218
2219 for (unsigned i = 0; i < 4; i++) {
2220 if (dst.writemask & (1 << i)) {
2221 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2222 return false;
2223
2224 if (BRW_GET_SWZ(src.swizzle, i) != i)
2225 return false;
2226 }
2227 }
2228
2229 /* Success! Rewrite the instruction. */
2230 last_rhs_inst->dst.file = dst.file;
2231 last_rhs_inst->dst.reg = dst.reg;
2232 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2233 last_rhs_inst->dst.reladdr = dst.reladdr;
2234 last_rhs_inst->dst.writemask &= dst.writemask;
2235
2236 return true;
2237 }
2238
2239 void
2240 vec4_visitor::visit(ir_assignment *ir)
2241 {
2242 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2243 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2244
2245 if (!ir->lhs->type->is_scalar() &&
2246 !ir->lhs->type->is_vector()) {
2247 ir->rhs->accept(this);
2248 src_reg src = this->result;
2249
2250 if (ir->condition) {
2251 emit_bool_to_cond_code(ir->condition, &predicate);
2252 }
2253
2254 /* emit_block_move doesn't account for swizzles in the source register.
2255 * This should be ok, since the source register is a structure or an
2256 * array, and those can't be swizzled. But double-check to be sure.
2257 */
2258 assert(src.swizzle ==
2259 (ir->rhs->type->is_matrix()
2260 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2261 : BRW_SWIZZLE_NOOP));
2262
2263 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2264 return;
2265 }
2266
2267 /* Now we're down to just a scalar/vector with writemasks. */
2268 int i;
2269
2270 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2271 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2272
2273 ir->rhs->accept(this);
2274
2275 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2276
2277 int swizzles[4];
2278 int src_chan = 0;
2279
2280 assert(ir->lhs->type->is_vector() ||
2281 ir->lhs->type->is_scalar());
2282 dst.writemask = ir->write_mask;
2283
2284 /* Swizzle a small RHS vector into the channels being written.
2285 *
2286 * glsl ir treats write_mask as dictating how many channels are
2287 * present on the RHS while in our instructions we need to make
2288 * those channels appear in the slots of the vec4 they're written to.
2289 */
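/* For example, a .yz write mask means the RHS supplies two channels in
 * .xy; the swizzle built below becomes XXYX, so src.x lands in dst.y and
 * src.y in dst.z.
 */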
2290 for (int i = 0; i < 4; i++)
2291 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2292
2293 src_reg src = swizzle(this->result,
2294 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2295 swizzles[2], swizzles[3]));
2296
2297 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2298 return;
2299 }
2300
2301 if (ir->condition) {
2302 emit_bool_to_cond_code(ir->condition, &predicate);
2303 }
2304
2305 for (i = 0; i < type_size(ir->lhs->type); i++) {
2306 vec4_instruction *inst = emit(MOV(dst, src));
2307 inst->predicate = predicate;
2308
2309 dst.reg_offset++;
2310 src.reg_offset++;
2311 }
2312 }
2313
2314 void
2315 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2316 {
2317 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2318 foreach_in_list(ir_constant, field_value, &ir->components) {
2319 emit_constant_values(dst, field_value);
2320 }
2321 return;
2322 }
2323
2324 if (ir->type->is_array()) {
2325 for (unsigned int i = 0; i < ir->type->length; i++) {
2326 emit_constant_values(dst, ir->array_elements[i]);
2327 }
2328 return;
2329 }
2330
2331 if (ir->type->is_matrix()) {
2332 for (int i = 0; i < ir->type->matrix_columns; i++) {
2333 float *vec = &ir->value.f[i * ir->type->vector_elements];
2334
2335 for (int j = 0; j < ir->type->vector_elements; j++) {
2336 dst->writemask = 1 << j;
2337 dst->type = BRW_REGISTER_TYPE_F;
2338
2339 emit(MOV(*dst, src_reg(vec[j])));
2340 }
2341 dst->reg_offset++;
2342 }
2343 return;
2344 }
2345
2346 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2347
2348 for (int i = 0; i < ir->type->vector_elements; i++) {
2349 if (!(remaining_writemask & (1 << i)))
2350 continue;
2351
2352 dst->writemask = 1 << i;
2353 dst->type = brw_type_for_base_type(ir->type);
2354
2355 /* Find other components that match the one we're about to
2356 * write. Emits fewer instructions for things like vec4(0.5,
2357 * 1.5, 1.5, 1.5).
2358 */
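/* For that example the first pass writes 0.5 to .x, and the second pass
 * writes 1.5 to .yzw with a single MOV.
 */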
2359 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2360 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2361 if (ir->value.b[i] == ir->value.b[j])
2362 dst->writemask |= (1 << j);
2363 } else {
2364 /* u, i, and f storage all line up, so no need for a
2365 * switch case for comparing each type.
2366 */
2367 if (ir->value.u[i] == ir->value.u[j])
2368 dst->writemask |= (1 << j);
2369 }
2370 }
2371
2372 switch (ir->type->base_type) {
2373 case GLSL_TYPE_FLOAT:
2374 emit(MOV(*dst, src_reg(ir->value.f[i])));
2375 break;
2376 case GLSL_TYPE_INT:
2377 emit(MOV(*dst, src_reg(ir->value.i[i])));
2378 break;
2379 case GLSL_TYPE_UINT:
2380 emit(MOV(*dst, src_reg(ir->value.u[i])));
2381 break;
2382 case GLSL_TYPE_BOOL:
2383 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2384 break;
2385 default:
2386 unreachable("Non-float/uint/int/bool constant");
2387 }
2388
2389 remaining_writemask &= ~dst->writemask;
2390 }
2391 dst->reg_offset++;
2392 }
2393
2394 void
2395 vec4_visitor::visit(ir_constant *ir)
2396 {
2397 dst_reg dst = dst_reg(this, ir->type);
2398 this->result = src_reg(dst);
2399
2400 emit_constant_values(&dst, ir);
2401 }
2402
2403 void
2404 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2405 {
2406 ir_dereference *deref = static_cast<ir_dereference *>(
2407 ir->actual_parameters.get_head());
2408 ir_variable *location = deref->variable_referenced();
2409 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2410 location->data.binding);
2411
2412 /* Calculate the surface offset */
2413 src_reg offset(this, glsl_type::uint_type);
2414 ir_dereference_array *deref_array = deref->as_dereference_array();
2415 if (deref_array) {
2416 deref_array->array_index->accept(this);
2417
2418 src_reg tmp(this, glsl_type::uint_type);
2419 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2420 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2421 } else {
2422 offset = location->data.atomic.offset;
2423 }
2424
2425 /* Emit the appropriate machine instruction */
2426 const char *callee = ir->callee->function_name();
2427 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2428
2429 if (!strcmp("__intrinsic_atomic_read", callee)) {
2430 emit_untyped_surface_read(surf_index, dst, offset);
2431
2432 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2433 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2434 src_reg(), src_reg());
2435
2436 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2438 src_reg(), src_reg());
2439 }
2440 }
2441
2442 void
2443 vec4_visitor::visit(ir_call *ir)
2444 {
2445 const char *callee = ir->callee->function_name();
2446
2447 if (!strcmp("__intrinsic_atomic_read", callee) ||
2448 !strcmp("__intrinsic_atomic_increment", callee) ||
2449 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 visit_atomic_counter_intrinsic(ir);
2451 } else {
2452 unreachable("Unsupported intrinsic.");
2453 }
2454 }
2455
2456 src_reg
2457 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2458 {
2459 vec4_instruction *inst =
2460 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2461 dst_reg(this, glsl_type::uvec4_type));
2462 inst->base_mrf = 2;
2463 inst->src[1] = sampler;
2464
2465 int param_base;
2466
2467 if (devinfo->gen >= 9) {
2468 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2469 vec4_instruction *header_inst = new(mem_ctx)
2470 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2471 dst_reg(MRF, inst->base_mrf));
2472
2473 emit(header_inst);
2474
2475 inst->mlen = 2;
2476 inst->header_size = 1;
2477 param_base = inst->base_mrf + 1;
2478 } else {
2479 inst->mlen = 1;
2480 param_base = inst->base_mrf;
2481 }
2482
2483 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2484 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2485 int zero_mask = 0xf & ~coord_mask;
2486
2487 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2488 coordinate));
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2491 src_reg(0)));
2492
2493 emit(inst);
2494 return src_reg(inst->dst);
2495 }
2496
2497 static bool
2498 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2499 {
2500 if (devinfo->gen < 8 && !devinfo->is_haswell)
2501 return false;
2502
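/* A sampler index above 15 doesn't fit in the 4-bit sampler field of the
 * message descriptor, and a non-immediate index might end up that large,
 * so both presumably need to be supplied through a message header.
 */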
2503 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_texture *ir)
2508 {
2509 uint32_t sampler =
2510 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2511
2512 ir_rvalue *nonconst_sampler_index =
2513 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2514
2515 /* Handle non-constant sampler array indexing */
2516 src_reg sampler_reg;
2517 if (nonconst_sampler_index) {
2518 /* The highest sampler which may be used by this operation is
2519 * the last element of the array. Mark it here, because the generator
2520 * doesn't have enough information to determine the bound.
2521 */
2522 uint32_t array_size = ir->sampler->as_dereference_array()
2523 ->array->type->array_size();
2524
2525 uint32_t max_used = sampler + array_size - 1;
2526 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2527 max_used += prog_data->base.binding_table.gather_texture_start;
2528 } else {
2529 max_used += prog_data->base.binding_table.texture_start;
2530 }
2531
2532 brw_mark_surface_used(&prog_data->base, max_used);
2533
2534 /* Emit code to evaluate the actual indexing expression */
2535 nonconst_sampler_index->accept(this);
2536 src_reg temp(this, glsl_type::uint_type);
2537 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2538 sampler_reg = emit_uniformize(temp);
2539 } else {
2540 /* Single sampler, or constant array index; the indexing expression
2541 * is just an immediate.
2542 */
2543 sampler_reg = src_reg(sampler);
2544 }
2545
2546 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2547 * emitting anything other than setting up the constant result.
2548 */
2549 if (ir->op == ir_tg4) {
2550 ir_constant *chan = ir->lod_info.component->as_constant();
2551 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2552 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2553 dst_reg result(this, ir->type);
2554 this->result = src_reg(result);
2555 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2556 return;
2557 }
2558 }
2559
2560 /* Should be lowered by do_lower_texture_projection */
2561 assert(!ir->projector);
2562
2563 /* Should be lowered */
2564 assert(!ir->offset || !ir->offset->type->is_array());
2565
2566 /* Generate code to compute all the subexpression trees. This has to be
2567 * done before loading any values into MRFs for the sampler message since
2568 * generating these values may involve SEND messages that need the MRFs.
2569 */
2570 src_reg coordinate;
2571 if (ir->coordinate) {
2572 ir->coordinate->accept(this);
2573 coordinate = this->result;
2574 }
2575
2576 src_reg shadow_comparitor;
2577 if (ir->shadow_comparitor) {
2578 ir->shadow_comparitor->accept(this);
2579 shadow_comparitor = this->result;
2580 }
2581
2582 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2583 src_reg offset_value;
2584 if (has_nonconstant_offset) {
2585 ir->offset->accept(this);
2586 offset_value = src_reg(this->result);
2587 }
2588
2589 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2590 src_reg lod, dPdx, dPdy, sample_index, mcs;
2591 switch (ir->op) {
2592 case ir_tex:
2593 lod = src_reg(0.0f);
2594 lod_type = glsl_type::float_type;
2595 break;
2596 case ir_txf:
2597 case ir_txl:
2598 case ir_txs:
2599 ir->lod_info.lod->accept(this);
2600 lod = this->result;
2601 lod_type = ir->lod_info.lod->type;
2602 break;
2603 case ir_query_levels:
2604 lod = src_reg(0);
2605 lod_type = glsl_type::int_type;
2606 break;
2607 case ir_txf_ms:
2608 ir->lod_info.sample_index->accept(this);
2609 sample_index = this->result;
2610 sample_index_type = ir->lod_info.sample_index->type;
2611
2612 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2613 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2614 else
2615 mcs = src_reg(0u);
2616 break;
2617 case ir_txd:
2618 ir->lod_info.grad.dPdx->accept(this);
2619 dPdx = this->result;
2620
2621 ir->lod_info.grad.dPdy->accept(this);
2622 dPdy = this->result;
2623
2624 lod_type = ir->lod_info.grad.dPdx->type;
2625 break;
2626 case ir_txb:
2627 case ir_lod:
2628 case ir_tg4:
2629 break;
2630 }
2631
2632 enum opcode opcode;
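/* Implicit-derivative sampling isn't available outside the fragment
 * shader, so plain texture() (ir_tex) is sent as TXL using the zero LOD
 * set up above.
 */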
2633 switch (ir->op) {
2634 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2635 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2636 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2637 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2638 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2639 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2640 case ir_tg4: opcode = has_nonconstant_offset
2641 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2642 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2643 case ir_txb:
2644 unreachable("TXB is not valid for vertex shaders.");
2645 case ir_lod:
2646 unreachable("LOD is not valid for vertex shaders.");
2647 default:
2648 unreachable("Unrecognized tex op");
2649 }
2650
2651 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2652 opcode, dst_reg(this, ir->type));
2653
2654 if (ir->offset != NULL && !has_nonconstant_offset) {
2655 inst->offset =
2656 brw_texture_offset(ir->offset->as_constant()->value.i,
2657 ir->offset->type->vector_elements);
2658 }
2659
2660 /* Stuff the channel select bits in the top of the texture offset */
2661 if (ir->op == ir_tg4)
2662 inst->offset |= gather_channel(ir, sampler) << 16;
2663
2664 /* The message header is necessary for:
2665 * - Gen4 (always)
2666 * - Gen9+ for selecting SIMD4x2
2667 * - Texel offsets
2668 * - Gather channel selection
2669 * - Sampler indices too large to fit in a 4-bit value.
2670 */
2671 inst->header_size =
2672 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2673 inst->offset != 0 || ir->op == ir_tg4 ||
2674 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2675 inst->base_mrf = 2;
2676 inst->mlen = inst->header_size + 1; /* always at least one */
2677 inst->dst.writemask = WRITEMASK_XYZW;
2678 inst->shadow_compare = ir->shadow_comparitor != NULL;
2679
2680 inst->src[1] = sampler_reg;
2681
2682 /* MRF for the first parameter */
2683 int param_base = inst->base_mrf + inst->header_size;
2684
2685 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2686 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2687 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2688 } else {
2689 /* Load the coordinate */
2690 /* FINISHME: gl_clamp_mask and saturate */
2691 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2692 int zero_mask = 0xf & ~coord_mask;
2693
2694 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2695 coordinate));
2696
2697 if (zero_mask != 0) {
2698 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2699 src_reg(0)));
2700 }
2701 /* Load the shadow comparitor */
2702 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2703 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2704 WRITEMASK_X),
2705 shadow_comparitor));
2706 inst->mlen++;
2707 }
2708
2709 /* Load the LOD info */
2710 if (ir->op == ir_tex || ir->op == ir_txl) {
2711 int mrf, writemask;
2712 if (devinfo->gen >= 5) {
2713 mrf = param_base + 1;
2714 if (ir->shadow_comparitor) {
2715 writemask = WRITEMASK_Y;
2716 /* mlen already incremented */
2717 } else {
2718 writemask = WRITEMASK_X;
2719 inst->mlen++;
2720 }
2721 } else /* devinfo->gen == 4 */ {
2722 mrf = param_base;
2723 writemask = WRITEMASK_W;
2724 }
2725 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2726 } else if (ir->op == ir_txf) {
2727 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2728 } else if (ir->op == ir_txf_ms) {
2729 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2730 sample_index));
2731 if (devinfo->gen >= 7) {
2732 /* MCS data is in the first channel of `mcs`, but we need to get it into
2733 * the .y channel of the second vec4 of params, so replicate .x across
2734 * the whole vec4 and then mask off everything except .y
2735 */
2736 mcs.swizzle = BRW_SWIZZLE_XXXX;
2737 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2738 mcs));
2739 }
2740 inst->mlen++;
2741 } else if (ir->op == ir_txd) {
2742 const glsl_type *type = lod_type;
2743
2744 if (devinfo->gen >= 5) {
2745 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2746 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2747 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2748 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2749 inst->mlen++;
2750
2751 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2752 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2753 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2754 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2755 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2756 inst->mlen++;
2757
2758 if (ir->shadow_comparitor) {
2759 emit(MOV(dst_reg(MRF, param_base + 2,
2760 ir->shadow_comparitor->type, WRITEMASK_Z),
2761 shadow_comparitor));
2762 }
2763 }
2764 } else /* devinfo->gen == 4 */ {
2765 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2766 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2767 inst->mlen += 2;
2768 }
2769 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2770 if (ir->shadow_comparitor) {
2771 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2772 shadow_comparitor));
2773 }
2774
2775 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2776 offset_value));
2777 inst->mlen++;
2778 }
2779 }
2780
2781 emit(inst);
2782
2783 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2784 * spec requires layers.
2785 */
2786 if (ir->op == ir_txs) {
2787 glsl_type const *type = ir->sampler->type;
2788 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2789 type->sampler_array) {
2790 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2791 writemask(inst->dst, WRITEMASK_Z),
2792 src_reg(inst->dst), src_reg(6));
2793 }
2794 }
2795
2796 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2797 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2798 }
2799
2800 swizzle_result(ir, src_reg(inst->dst), sampler);
2801 }
2802
2803 /**
2804 * Apply workarounds for Gen6 gather with UINT/SINT
2805 */
2806 void
2807 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2808 {
2809 if (!wa)
2810 return;
2811
2812 int width = (wa & WA_8BIT) ? 8 : 16;
2813 dst_reg dst_f = dst;
2814 dst_f.type = BRW_REGISTER_TYPE_F;
2815
2816 /* Convert from UNORM to UINT */
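/* The gathered UINT/SINT texel comes back as a UNORM float, so
 * multiplying by 2^width - 1 and converting back to an integer should
 * recover the original bits.
 */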
2817 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2818 emit(MOV(dst, src_reg(dst_f)));
2819
2820 if (wa & WA_SIGN) {
2821 /* Reinterpret the UINT value as a signed INT value by
2822 * shifting the sign bit into place, then shifting back
2823 * preserving sign.
2824 */
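/* e.g. for an 8-bit format: shift left by 24, then arithmetic shift
 * right by 24, sign-extending bit 7 into the upper bits.
 */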
2825 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2826 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2827 }
2828 }
2829
2830 /**
2831 * Set up the gather channel based on the swizzle, for gather4.
2832 */
2833 uint32_t
2834 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2835 {
2836 ir_constant *chan = ir->lod_info.component->as_constant();
2837 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2838 switch (swiz) {
2839 case SWIZZLE_X: return 0;
2840 case SWIZZLE_Y:
2841 /* gather4 sampler is broken for green channel on RG32F --
2842 * we must ask for blue instead.
2843 */
2844 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2845 return 2;
2846 return 1;
2847 case SWIZZLE_Z: return 2;
2848 case SWIZZLE_W: return 3;
2849 default:
2850 unreachable("Not reached"); /* zero, one swizzles handled already */
2851 }
2852 }
2853
2854 void
2855 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2856 {
2857 int s = key->tex.swizzles[sampler];
2858
2859 this->result = src_reg(this, ir->type);
2860 dst_reg swizzled_result(this->result);
2861
2862 if (ir->op == ir_query_levels) {
2863 /* # levels is in .w */
2864 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2865 emit(MOV(swizzled_result, orig_val));
2866 return;
2867 }
2868
2869 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2870 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2871 emit(MOV(swizzled_result, orig_val));
2872 return;
2873 }
2874
2875
2876 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2877 int swizzle[4] = {0};
2878
2879 for (int i = 0; i < 4; i++) {
2880 switch (GET_SWZ(s, i)) {
2881 case SWIZZLE_ZERO:
2882 zero_mask |= (1 << i);
2883 break;
2884 case SWIZZLE_ONE:
2885 one_mask |= (1 << i);
2886 break;
2887 default:
2888 copy_mask |= (1 << i);
2889 swizzle[i] = GET_SWZ(s, i);
2890 break;
2891 }
2892 }
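/* e.g. a swizzle of (R, G, B, ONE) gives copy_mask XYZ with an identity
 * swizzle and one_mask W, so one MOV copies .xyz and a second writes 1.0f
 * to .w.
 */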
2893
2894 if (copy_mask) {
2895 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2896 swizzled_result.writemask = copy_mask;
2897 emit(MOV(swizzled_result, orig_val));
2898 }
2899
2900 if (zero_mask) {
2901 swizzled_result.writemask = zero_mask;
2902 emit(MOV(swizzled_result, src_reg(0.0f)));
2903 }
2904
2905 if (one_mask) {
2906 swizzled_result.writemask = one_mask;
2907 emit(MOV(swizzled_result, src_reg(1.0f)));
2908 }
2909 }
2910
2911 void
2912 vec4_visitor::visit(ir_return *)
2913 {
2914 unreachable("not reached");
2915 }
2916
2917 void
2918 vec4_visitor::visit(ir_discard *)
2919 {
2920 unreachable("not reached");
2921 }
2922
2923 void
2924 vec4_visitor::visit(ir_if *ir)
2925 {
2926 /* Don't point the annotation at the if statement, because then it plus
2927 * the then and else blocks get printed.
2928 */
2929 this->base_ir = ir->condition;
2930
2931 if (devinfo->gen == 6) {
2932 emit_if_gen6(ir);
2933 } else {
2934 enum brw_predicate predicate;
2935 emit_bool_to_cond_code(ir->condition, &predicate);
2936 emit(IF(predicate));
2937 }
2938
2939 visit_instructions(&ir->then_instructions);
2940
2941 if (!ir->else_instructions.is_empty()) {
2942 this->base_ir = ir->condition;
2943 emit(BRW_OPCODE_ELSE);
2944
2945 visit_instructions(&ir->else_instructions);
2946 }
2947
2948 this->base_ir = ir->condition;
2949 emit(BRW_OPCODE_ENDIF);
2950 }
2951
2952 void
2953 vec4_visitor::visit(ir_emit_vertex *)
2954 {
2955 unreachable("not reached");
2956 }
2957
2958 void
2959 vec4_visitor::visit(ir_end_primitive *)
2960 {
2961 unreachable("not reached");
2962 }
2963
2964 void
2965 vec4_visitor::visit(ir_barrier *)
2966 {
2967 unreachable("not reached");
2968 }
2969
2970 void
2971 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2972 dst_reg dst, src_reg offset,
2973 src_reg src0, src_reg src1)
2974 {
2975 unsigned mlen = 0;
2976
2977 /* Set the atomic operation offset. */
2978 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2979 mlen++;
2980
2981 /* Set the atomic operation arguments. */
2982 if (src0.file != BAD_FILE) {
2983 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2984 mlen++;
2985 }
2986
2987 if (src1.file != BAD_FILE) {
2988 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2989 mlen++;
2990 }
2991
2992 /* Emit the instruction. Note that this maps to the normal SIMD8
2993 * untyped atomic message on Ivy Bridge, but that's OK because
2994 * unused channels will be masked out.
2995 */
2996 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2997 brw_message_reg(0),
2998 src_reg(surf_index), src_reg(atomic_op));
2999 inst->mlen = mlen;
3000 }
3001
3002 void
3003 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3004 src_reg offset)
3005 {
3006 /* Set the surface read offset. */
3007 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3008
3009 /* Emit the instruction. Note that this maps to the normal SIMD8
3010 * untyped surface read message, but that's OK because unused
3011 * channels will be masked out.
3012 */
3013 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3014 brw_message_reg(0),
3015 src_reg(surf_index), src_reg(1));
3016 inst->mlen = 1;
3017 }
3018
3019 void
3020 vec4_visitor::emit_ndc_computation()
3021 {
3022 /* Get the position */
3023 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3024
3025 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3026 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3027 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3028
3029 current_annotation = "NDC";
3030 dst_reg ndc_w = ndc;
3031 ndc_w.writemask = WRITEMASK_W;
3032 src_reg pos_w = pos;
3033 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3034 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3035
3036 dst_reg ndc_xyz = ndc;
3037 ndc_xyz.writemask = WRITEMASK_XYZ;
3038
3039 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3040 }
3041
3042 void
3043 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3044 {
3045 if (devinfo->gen < 6 &&
3046 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3047 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3048 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3049 dst_reg header1_w = header1;
3050 header1_w.writemask = WRITEMASK_W;
3051
3052 emit(MOV(header1, 0u));
3053
3054 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3055 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3056
3057 current_annotation = "Point size";
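/* Scaling by 2^11 and keeping bits 18:8 appears to pack the point width
 * as an 11-bit U8.3 fixed-point value in the header dword.
 */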
3058 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3059 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3060 }
3061
3062 if (key->userclip_active) {
3063 current_annotation = "Clipping flags";
3064 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3065 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3066
3067 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3068 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3069 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3070
3071 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3072 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3073 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3074 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3075 }
3076
3077 /* i965 clipping workaround:
3078 * 1) Test for negative rhw
3079 * 2) If set,
3080 * set ndc = (0,0,0,0)
3081 * set ucp[6] = 1
3082 *
3083 * Later, clipping will detect ucp[6] and ensure the primitive is
3084 * clipped against all fixed planes.
3085 */
3086 if (devinfo->has_negative_rhw_bug) {
3087 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3088 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3089 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3090 vec4_instruction *inst;
3091 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3092 inst->predicate = BRW_PREDICATE_NORMAL;
3093 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3094 inst->predicate = BRW_PREDICATE_NORMAL;
3095 }
3096
3097 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3098 } else if (devinfo->gen < 6) {
3099 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3100 } else {
3101 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3102 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3103 dst_reg reg_w = reg;
3104 reg_w.writemask = WRITEMASK_W;
3105 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3106 }
3107 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3108 dst_reg reg_y = reg;
3109 reg_y.writemask = WRITEMASK_Y;
3110 reg_y.type = BRW_REGISTER_TYPE_D;
3111 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3112 }
3113 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3114 dst_reg reg_z = reg;
3115 reg_z.writemask = WRITEMASK_Z;
3116 reg_z.type = BRW_REGISTER_TYPE_D;
3117 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3118 }
3119 }
3120 }
3121
3122 void
3123 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3124 {
3125 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3126 *
3127 * "If a linked set of shaders forming the vertex stage contains no
3128 * static write to gl_ClipVertex or gl_ClipDistance, but the
3129 * application has requested clipping against user clip planes through
3130 * the API, then the coordinate written to gl_Position is used for
3131 * comparison against the user clip planes."
3132 *
3133 * This function is only called if the shader didn't write to
3134 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3135 * if the user wrote to it; otherwise we use gl_Position.
3136 */
3137 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3138 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3139 clip_vertex = VARYING_SLOT_POS;
3140 }
3141
3142 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3143 ++i) {
3144 reg.writemask = 1 << i;
3145 emit(DP4(reg,
3146 src_reg(output_reg[clip_vertex]),
3147 src_reg(this->userplane[i + offset])));
3148 }
3149 }
3150
3151 vec4_instruction *
3152 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3153 {
3154 assert (varying < VARYING_SLOT_MAX);
3155 reg.type = output_reg[varying].type;
3156 current_annotation = output_reg_annotation[varying];
3157 /* Copy the register, saturating if necessary */
3158 return emit(MOV(reg, src_reg(output_reg[varying])));
3159 }
3160
3161 void
3162 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3163 {
3164 reg.type = BRW_REGISTER_TYPE_F;
3165
3166 switch (varying) {
3167 case VARYING_SLOT_PSIZ:
3168 {
3169 /* PSIZ is always in slot 0, and is coupled with other flags. */
3170 current_annotation = "indices, point width, clip flags";
3171 emit_psiz_and_flags(reg);
3172 break;
3173 }
3174 case BRW_VARYING_SLOT_NDC:
3175 current_annotation = "NDC";
3176 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3177 break;
3178 case VARYING_SLOT_POS:
3179 current_annotation = "gl_Position";
3180 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3181 break;
3182 case VARYING_SLOT_EDGE:
3183 /* This is present when doing unfilled polygons. We're supposed to copy
3184 * the edge flag from the user-provided vertex array
3185 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3186 * of that attribute (starts as 1.0f). This is then used in clipping to
3187 * determine which edges should be drawn as wireframe.
3188 */
3189 current_annotation = "edge flag";
3190 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3191 glsl_type::float_type, WRITEMASK_XYZW))));
3192 break;
3193 case BRW_VARYING_SLOT_PAD:
3194 /* No need to write to this slot */
3195 break;
3196 case VARYING_SLOT_COL0:
3197 case VARYING_SLOT_COL1:
3198 case VARYING_SLOT_BFC0:
3199 case VARYING_SLOT_BFC1: {
3200 /* These built-in varyings are only supported in compatibility mode,
3201 * and we only support GS in core profile. So, this must be a vertex
3202 * shader.
3203 */
3204 assert(stage == MESA_SHADER_VERTEX);
3205 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3206 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3207 inst->saturate = true;
3208 break;
3209 }
3210
3211 default:
3212 emit_generic_urb_slot(reg, varying);
3213 break;
3214 }
3215 }
3216
3217 static int
3218 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3219 {
3220 if (devinfo->gen >= 6) {
3221 /* URB data written (does not include the message header reg) must
3222 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3223 * section 5.4.3.2.2: URB_INTERLEAVED.
3224 *
3225 * URB entries are allocated on a multiple of 1024 bits, so an
3226 * extra 128 bits written here to make the end align to 256 is
3227 * no problem.
3228 */
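/* mlen includes the single header register, so the data length is even
 * exactly when mlen is odd; bump even values up by one.
 */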
3229 if ((mlen % 2) != 1)
3230 mlen++;
3231 }
3232
3233 return mlen;
3234 }
3235
3236
3237 /**
3238 * Generates the VUE payload plus the necessary URB write instructions to
3239 * output it.
3240 *
3241 * The VUE layout is documented in Volume 2a.
3242 */
3243 void
3244 vec4_visitor::emit_vertex()
3245 {
3246 /* MRF 0 is reserved for the debugger, so start with message header
3247 * in MRF 1.
3248 */
3249 int base_mrf = 1;
3250 int mrf = base_mrf;
3251 /* In the process of generating our URB write message contents, we
3252 * may need to unspill a register or load from an array. Those
3253 * reads would use MRFs 14-15.
3254 */
3255 int max_usable_mrf = 13;
3256
3257 /* The following assertion verifies that max_usable_mrf causes an
3258 * even-numbered amount of URB write data, which will meet gen6's
3259 * requirements for length alignment.
3260 */
3261 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3262
3263 /* First mrf is the g0-based message header containing URB handles and
3264 * such.
3265 */
3266 emit_urb_write_header(mrf++);
3267
3268 if (devinfo->gen < 6) {
3269 emit_ndc_computation();
3270 }
3271
3272 /* Lower legacy ff and ClipVertex clipping to clip distances */
3273 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3274 current_annotation = "user clip distances";
3275
3276 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3277 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3278
3279 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3280 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3281 }
3282
3283 /* We may need to split this up into several URB writes, so do them in a
3284 * loop.
3285 */
3286 int slot = 0;
3287 bool complete = false;
3288 do {
3289 /* URB offset is in URB row increments, and each of our MRFs is half of
3290 * one of those, since we're doing interleaved writes.
3291 */
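/* With base_mrf 1 and max_usable_mrf 13, each write carries at most 12
 * slot MRFs (6 URB rows), so a VUE with more slots continues in a second
 * write starting at this offset.
 */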
3292 int offset = slot / 2;
3293
3294 mrf = base_mrf + 1;
3295 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3296 emit_urb_slot(dst_reg(MRF, mrf++),
3297 prog_data->vue_map.slot_to_varying[slot]);
3298
3299 /* If this was max_usable_mrf, we can't fit anything more into this
3300 * URB WRITE.
3301 */
3302 if (mrf > max_usable_mrf) {
3303 slot++;
3304 break;
3305 }
3306 }
3307
3308 complete = slot >= prog_data->vue_map.num_slots;
3309 current_annotation = "URB write";
3310 vec4_instruction *inst = emit_urb_write_opcode(complete);
3311 inst->base_mrf = base_mrf;
3312 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3313 inst->offset += offset;
3314 } while(!complete);
3315 }
3316
3317
3318 src_reg
3319 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3320 src_reg *reladdr, int reg_offset)
3321 {
3322 /* Because we store the values to scratch interleaved like our
3323 * vertex data, we need to scale the vec4 index by 2.
3324 */
3325 int message_header_scale = 2;
3326
3327 /* Pre-gen6, the message header uses byte offsets instead of vec4
3328 * (16-byte) offset units.
3329 */
3330 if (devinfo->gen < 6)
3331 message_header_scale *= 16;
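/* e.g. reg_offset 3 becomes scratch row 6 on gen6+, or byte offset 96 on
 * older generations.
 */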
3332
3333 if (reladdr) {
3334 src_reg index = src_reg(this, glsl_type::int_type);
3335
3336 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3337 src_reg(reg_offset)));
3338 emit_before(block, inst, MUL(dst_reg(index), index,
3339 src_reg(message_header_scale)));
3340
3341 return index;
3342 } else {
3343 return src_reg(reg_offset * message_header_scale);
3344 }
3345 }
3346
3347 src_reg
3348 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3349 src_reg *reladdr, int reg_offset)
3350 {
3351 if (reladdr) {
3352 src_reg index = src_reg(this, glsl_type::int_type);
3353
3354 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3355 src_reg(reg_offset)));
3356
3357 /* Pre-gen6, the message header uses byte offsets instead of vec4
3358 * (16-byte) offset units.
3359 */
3360 if (devinfo->gen < 6) {
3361 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3362 }
3363
3364 return index;
3365 } else if (devinfo->gen >= 8) {
3366 /* Store the offset in a GRF so we can send-from-GRF. */
3367 src_reg offset = src_reg(this, glsl_type::int_type);
3368 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3369 return offset;
3370 } else {
3371 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3372 return src_reg(reg_offset * message_header_scale);
3373 }
3374 }
3375
3376 /**
3377 * Emits an instruction before @inst to load the value named by @orig_src
3378 * from scratch space at @base_offset to @temp.
3379 *
3380 * @base_offset is measured in 32-byte units (the size of a register).
3381 */
3382 void
3383 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3384 dst_reg temp, src_reg orig_src,
3385 int base_offset)
3386 {
3387 int reg_offset = base_offset + orig_src.reg_offset;
3388 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3389 reg_offset);
3390
3391 emit_before(block, inst, SCRATCH_READ(temp, index));
3392 }
3393
3394 /**
3395 * Emits an instruction after @inst to store the value to be written
3396 * to @orig_dst to scratch space at @base_offset, from @temp.
3397 *
3398 * @base_offset is measured in 32-byte units (the size of a register).
3399 */
3400 void
3401 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3402 int base_offset)
3403 {
3404 int reg_offset = base_offset + inst->dst.reg_offset;
3405 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3406 reg_offset);
3407
3408 /* Create a temporary register to store *inst's result in.
3409 *
3410 * We have to be careful in MOVing from our temporary result register in
3411 * the scratch write. If we swizzle from channels of the temporary that
3412 * weren't initialized, it will confuse live interval analysis, which will
3413 * make spilling fail to make progress.
3414 */
3415 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3416 inst->dst.type),
3417 brw_swizzle_for_mask(inst->dst.writemask));
3418 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3419 inst->dst.writemask));
3420 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3421 write->predicate = inst->predicate;
3422 write->ir = inst->ir;
3423 write->annotation = inst->annotation;
3424 inst->insert_after(block, write);
3425
3426 inst->dst.file = temp.file;
3427 inst->dst.reg = temp.reg;
3428 inst->dst.reg_offset = temp.reg_offset;
3429 inst->dst.reladdr = NULL;
3430 }
3431
3432 /**
3433 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3434 * adds the scratch read(s) before \p inst. The function also checks for
3435 * recursive reladdr scratch accesses, issuing the corresponding scratch
3436 * loads and rewriting reladdr references accordingly.
3437 *
3438 * \return \p src if it did not require a scratch load, otherwise, the
3439 * register holding the result of the scratch load that the caller should
3440 * use to rewrite src.
3441 */
3442 src_reg
3443 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3444 vec4_instruction *inst, src_reg src)
3445 {
3446 /* Resolve recursive reladdr scratch access by calling ourselves
3447 * with src.reladdr
3448 */
3449 if (src.reladdr)
3450 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3451 *src.reladdr);
3452
3453 /* Now handle scratch access on src */
3454 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3455 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3456 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3457 src.reg = temp.reg;
3458 src.reg_offset = temp.reg_offset;
3459 src.reladdr = NULL;
3460 }
3461
3462 return src;
3463 }
3464
3465 /**
3466 * We can't generally support array access in GRF space, because a
3467 * single instruction's destination can only span 2 contiguous
3468 * registers. So, we send all GRF arrays that get variable index
3469 * access to scratch space.
3470 */
3471 void
3472 vec4_visitor::move_grf_array_access_to_scratch()
3473 {
3474 int scratch_loc[this->alloc.count];
3475 memset(scratch_loc, -1, sizeof(scratch_loc));
3476
3477 /* First, calculate the set of virtual GRFs that need to be punted
3478 * to scratch due to having any array access on them, and where in
3479 * scratch.
3480 */
3481 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3482 if (inst->dst.file == GRF && inst->dst.reladdr) {
3483 if (scratch_loc[inst->dst.reg] == -1) {
3484 scratch_loc[inst->dst.reg] = last_scratch;
3485 last_scratch += this->alloc.sizes[inst->dst.reg];
3486 }
3487
3488 for (src_reg *iter = inst->dst.reladdr;
3489 iter->reladdr;
3490 iter = iter->reladdr) {
3491 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3492 scratch_loc[iter->reg] = last_scratch;
3493 last_scratch += this->alloc.sizes[iter->reg];
3494 }
3495 }
3496 }
3497
3498 for (int i = 0 ; i < 3; i++) {
3499 for (src_reg *iter = &inst->src[i];
3500 iter->reladdr;
3501 iter = iter->reladdr) {
3502 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3503 scratch_loc[iter->reg] = last_scratch;
3504 last_scratch += this->alloc.sizes[iter->reg];
3505 }
3506 }
3507 }
3508 }
3509
3510 /* Now, for anything that will be accessed through scratch, rewrite
3511 * it to load/store. Note that this is a _safe list walk, because
3512 * we may generate a new scratch_write instruction after the one
3513 * we're processing.
3514 */
3515 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3516 /* Set up the annotation tracking for new generated instructions. */
3517 base_ir = inst->ir;
3518 current_annotation = inst->annotation;
3519
3520 /* First handle scratch access on the dst. Notice we have to handle
3521 * the case where the dst's reladdr also points to scratch space.
3522 */
3523 if (inst->dst.reladdr)
3524 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3525 *inst->dst.reladdr);
3526
3527 /* Now that we have handled any (possibly recursive) reladdr scratch
3528 * accesses for dst we can safely do the scratch write for dst itself
3529 */
3530 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3531 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3532
3533 /* Now handle scratch access on any src. In this case, since inst->src[i]
3534 * already is a src_reg, we can just call emit_resolve_reladdr with
3535 * inst->src[i] and it will take care of handling scratch loads for
3536 * both src and src.reladdr (recursively).
3537 */
3538 for (int i = 0 ; i < 3; i++) {
3539 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3540 inst->src[i]);
3541 }
3542 }
3543 }
3544
3545 /**
3546 * Emits an instruction before @inst to load the value named by @orig_src
3547 * from the pull constant buffer (surface) at @base_offset to @temp.
3548 */
3549 void
3550 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3551 dst_reg temp, src_reg orig_src,
3552 int base_offset)
3553 {
3554 int reg_offset = base_offset + orig_src.reg_offset;
3555 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3556 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3557 reg_offset);
3558
3559 emit_pull_constant_load_reg(temp,
3560 index,
3561 offset,
3562 block, inst);
3563 }
3564
3565 /**
3566 * Implements array access of uniforms by inserting a
3567 * PULL_CONSTANT_LOAD instruction.
3568 *
3569 * Unlike temporary GRF array access (where we don't support it due to
3570 * the difficulty of doing relative addressing on instruction
3571 * destinations), we could potentially do array access of uniforms
3572 * that were loaded in GRF space as push constants. In real-world
3573 * usage we've seen, though, the arrays being used are always larger
3574 * than we could load as push constants, so just always move all
3575 * uniform array access out to a pull constant buffer.
3576 */
3577 void
3578 vec4_visitor::move_uniform_array_access_to_pull_constants()
3579 {
3580 int pull_constant_loc[this->uniforms];
3581 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3582 bool nested_reladdr;
3583
3584 /* Walk through and find array access of uniforms. Put a copy of that
3585 * uniform in the pull constant buffer.
3586 *
3587 * Note that we don't move constant-indexed accesses to arrays. No
3588 * testing has been done of the performance impact of this choice.
3589 */
3590 do {
3591 nested_reladdr = false;
3592
3593 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3594 for (int i = 0 ; i < 3; i++) {
3595 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3596 continue;
3597
3598 int uniform = inst->src[i].reg;
3599
3600 if (inst->src[i].reladdr->reladdr)
3601 nested_reladdr = true; /* will need another pass */
3602
3603 /* If this array isn't already present in the pull constant buffer,
3604 * add it.
3605 */
3606 if (pull_constant_loc[uniform] == -1) {
3607 const gl_constant_value **values =
3608 &stage_prog_data->param[uniform * 4];
3609
3610 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3611
3612 assert(uniform < uniform_array_size);
3613 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3614 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3615 = values[j];
3616 }
3617 }
3618
3619 /* Set up the annotation tracking for new generated instructions. */
3620 base_ir = inst->ir;
3621 current_annotation = inst->annotation;
3622
3623 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3624
3625 emit_pull_constant_load(block, inst, temp, inst->src[i],
3626 pull_constant_loc[uniform]);
3627
3628 inst->src[i].file = temp.file;
3629 inst->src[i].reg = temp.reg;
3630 inst->src[i].reg_offset = temp.reg_offset;
3631 inst->src[i].reladdr = NULL;
3632 }
3633 }
3634 } while (nested_reladdr);
3635
3636 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3637 * no need to track them as larger-than-vec4 objects. This will be
3638 * relied on in cutting out unused uniform vectors from push
3639 * constants.
3640 */
3641 split_uniform_registers();
3642 }
3643
3644 void
3645 vec4_visitor::resolve_ud_negate(src_reg *reg)
3646 {
3647 if (reg->type != BRW_REGISTER_TYPE_UD ||
3648 !reg->negate)
3649 return;
3650
3651 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3652 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3653 *reg = temp;
3654 }
3655
3656 /**
3657 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3658 *
3659 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3660 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3661 */
3662 void
3663 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3664 {
3665 assert(devinfo->gen <= 5);
3666
3667 if (!rvalue->type->is_boolean())
3668 return;
3669
3670 src_reg and_result = src_reg(this, rvalue->type);
3671 src_reg neg_result = src_reg(this, rvalue->type);
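/* ANDing with 1 keeps only the defined LSB; negating the resulting 0/1
 * gives 0 or -1 (~0), the canonical boolean encoding used downstream.
 */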
3672 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3673 emit(MOV(dst_reg(neg_result), negate(and_result)));
3674 *reg = neg_result;
3675 }
3676
3677 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3678 void *log_data,
3679 struct gl_program *prog,
3680 const struct brw_vue_prog_key *key,
3681 struct brw_vue_prog_data *prog_data,
3682 struct gl_shader_program *shader_prog,
3683 gl_shader_stage stage,
3684 void *mem_ctx,
3685 bool no_spills,
3686 int shader_time_index)
3687 : backend_shader(compiler, log_data, mem_ctx,
3688 shader_prog, prog, &prog_data->base, stage),
3689 key(key),
3690 prog_data(prog_data),
3691 sanity_param_count(0),
3692 fail_msg(NULL),
3693 first_non_payload_grf(0),
3694 need_all_constants_in_pull_buffer(false),
3695 no_spills(no_spills),
3696 shader_time_index(shader_time_index),
3697 last_scratch(0)
3698 {
3699 this->failed = false;
3700
3701 this->base_ir = NULL;
3702 this->current_annotation = NULL;
3703 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3704
3705 this->variable_ht = hash_table_ctor(0,
3706 hash_table_pointer_hash,
3707 hash_table_pointer_compare);
3708
3709 this->virtual_grf_start = NULL;
3710 this->virtual_grf_end = NULL;
3711 this->live_intervals = NULL;
3712
3713 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3714
3715 this->uniforms = 0;
3716
3717 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3718 * at least one. See setup_uniforms() in brw_vec4.cpp.
3719 */
3720 this->uniform_array_size = 1;
3721 if (prog_data) {
3722 this->uniform_array_size =
3723 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3724 }
3725
3726 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3727 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3728 }
3729
3730 vec4_visitor::~vec4_visitor()
3731 {
3732 hash_table_dtor(this->variable_ht);
3733 }
3734
3735
3736 void
3737 vec4_visitor::fail(const char *format, ...)
3738 {
3739 va_list va;
3740 char *msg;
3741
3742 if (failed)
3743 return;
3744
3745 failed = true;
3746
3747 va_start(va, format);
3748 msg = ralloc_vasprintf(mem_ctx, format, va);
3749 va_end(va);
3750 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3751
3752 this->fail_msg = msg;
3753
3754 if (debug_enabled) {
3755 fprintf(stderr, "%s", msg);
3756 }
3757 }
3758
3759 } /* namespace brw */