i965: Implement nir_op_uadd_carry and _usub_borrow without accumulator.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
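
/* A minimal usage sketch: CMP is frequently issued with a null destination
 * purely for its flag-register side effect, and the packed flag bits then
 * predicate a following SEL.  This mirrors the pre-gen6 path of emit_minmax()
 * further down; the helper name is hypothetical and the block is never
 * compiled.
 */
#if 0
vec4_instruction *
vec4_visitor::example_select_smaller(const dst_reg &dst,
                                     const src_reg &a, const src_reg &b)
{
   /* Only the flag register matters here; the GRF result is discarded. */
   emit(CMP(dst_null_d(), a, b, BRW_CONDITIONAL_L));

   /* dst = (a < b) ? a : b, channel by channel. */
   vec4_instruction *sel = emit(BRW_OPCODE_SEL, dst, a, b);
   sel->predicate = BRW_PREDICATE_NORMAL;
   return sel;
}
#endif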
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
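
/* For reference, a scalar sketch of the bit layout the SHL/OR sequence above
 * produces per channel, assuming F32TO16 has already left the two half-float
 * results in the low words of the X and Y components.  Hypothetical helper,
 * never compiled.
 */
#if 0
static uint32_t
example_pack_half_2x16_bits(uint16_t x_half, uint16_t y_half)
{
   uint32_t lo = x_half;                  /* tmp.x = 0x0000llll */
   uint32_t hi = (uint32_t)y_half << 16;  /* SHL(dst, tmp.yyyy, 16u) */
   return hi | lo;                        /* OR(dst, dst, tmp.xxxx) */
}
#endif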
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
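
/* The reverse of the packing above, as a scalar sketch: AND/SHR split the
 * dword into two 16-bit words and F16TO32 then widens each one.  Only the
 * word extraction is modeled here; the half-to-float conversion itself is
 * done by the hardware.  Hypothetical helper, never compiled.
 */
#if 0
static void
example_unpack_half_2x16_words(uint32_t packed,
                               uint16_t *x_half, uint16_t *y_half)
{
   *x_half = packed & 0xffff;  /* AND(tmp.x, src0, 0xffffu) */
   *y_half = packed >> 16;     /* SHR(tmp.y, src0, 16u)     */
}
#endif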
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
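
/* A scalar sketch of what the shift/byte-MOV/MUL sequence above computes for
 * a single component: byte i of the dword, rescaled to [0, 1].  The vector
 * float immediate <0, 8, 16, 24> supplies the per-channel shift counts.
 * Hypothetical helper, never compiled.
 */
#if 0
static float
example_unpack_unorm_4x8_component(uint32_t packed, unsigned i)
{
   uint32_t byte = (packed >> (8 * i)) & 0xff;  /* SHR, then UB-typed MOV */
   return byte * (1.0f / 255.0f);               /* MUL by 1/255 */
}
#endif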
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
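
/* A scalar sketch of the per-channel math above: saturate to [0, 1], scale by
 * 255 and round to nearest even; VEC4_OPCODE_PACK_BYTES then gathers the low
 * byte of each channel into one dword.  Hypothetical helper (assumes
 * <math.h> and the default rounding mode), never compiled.
 */
#if 0
static uint8_t
example_pack_unorm_byte(float x)
{
   float saturated = x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);  /* MOV.sat */
   float scaled = saturated * 255.0f;                          /* MUL     */
   return (uint8_t)rintf(scaled);                              /* RNDE    */
}
#endif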
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
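
/* A few worked examples of the vec4-slot counting above, written as a
 * hypothetical self-check (assumes the usual glsl_type convenience pointers;
 * never compiled).
 */
#if 0
static void
example_type_size_expectations()
{
   assert(type_size(glsl_type::float_type) == 1); /* a scalar still occupies
                                                   * a whole vec4 slot */
   assert(type_size(glsl_type::vec4_type) == 1);
   assert(type_size(glsl_type::mat4_type) == 4);  /* one slot per column */
   /* An array takes length * element slots, and a struct is the sum of its
    * members, so float[10] would be 10 slots.
    */
}
#endif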
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
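
/* The name test in the loop above, pulled out as a standalone sketch: a
 * storage entry matches when its name either equals the uniform's name or
 * continues it with '.' or '[', so "mat" matches "mat", "mat[2]" and
 * "mat.col" but not "material".  Hypothetical helper, never compiled.
 */
#if 0
static bool
example_storage_name_matches(const char *uniform_name, const char *storage_name)
{
   size_t namelen = strlen(uniform_name);

   if (strncmp(uniform_name, storage_name, namelen) != 0)
      return false;

   char next = storage_name[namelen];
   return next == '\0' || next == '.' || next == '[';
}
#endif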
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
724 {
725 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
726 assert(this->uniforms < uniform_array_size);
727 this->uniform_vector_size[this->uniforms] = 4;
728 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
729 this->userplane[i].type = BRW_REGISTER_TYPE_F;
730 for (int j = 0; j < 4; ++j) {
731 stage_prog_data->param[this->uniforms * 4 + j] =
732 (gl_constant_value *) &clip_planes[i][j];
733 }
734 ++this->uniforms;
735 }
736 }
737
738 /* Our support for builtin uniforms is even scarier than non-builtin.
739 * It sits on top of the PROG_STATE_VAR parameters that are
740 * automatically updated from GL context state.
741 */
742 void
743 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
744 {
745 const ir_state_slot *const slots = ir->get_state_slots();
746 assert(slots != NULL);
747
748 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
749 /* This state reference has already been setup by ir_to_mesa,
750 * but we'll get the same index back here. We can reference
751 * ParameterValues directly, since unlike brw_fs.cpp, we never
752 * add new state references during compile.
753 */
754 int index = _mesa_add_state_reference(this->prog->Parameters,
755 (gl_state_index *)slots[i].tokens);
756 gl_constant_value *values =
757 &this->prog->Parameters->ParameterValues[index][0];
758
759 assert(this->uniforms < uniform_array_size);
760
761 for (unsigned j = 0; j < 4; j++)
762 stage_prog_data->param[this->uniforms * 4 + j] =
763 &values[GET_SWZ(slots[i].swizzle, j)];
764
765 this->uniform_vector_size[this->uniforms] =
766 (ir->type->is_scalar() || ir->type->is_vector() ||
767 ir->type->is_matrix() ? ir->type->vector_elements : 4);
768
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (devinfo->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (devinfo->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (devinfo->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (devinfo->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (devinfo->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (devinfo->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (devinfo->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (devinfo->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul;
1159 ir_expression *mul;
1160 bool mul_negate, mul_abs;
1161
1162 for (int i = 0; i < 2; i++) {
1163 mul_negate = false;
1164 mul_abs = false;
1165
1166 mul = ir->operands[i]->as_expression();
1167 nonmul = ir->operands[1 - i];
1168
1169 if (mul && mul->operation == ir_unop_abs) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_abs = true;
1172 } else if (mul && mul->operation == ir_unop_neg) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_negate = true;
1175 }
1176
1177 if (mul && mul->operation == ir_binop_mul)
1178 break;
1179 }
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183
1184 nonmul->accept(this);
1185 src_reg src0 = fix_3src_operand(this->result);
1186
1187 mul->operands[0]->accept(this);
1188 src_reg src1 = fix_3src_operand(this->result);
1189 src1.negate ^= mul_negate;
1190 src1.abs = mul_abs;
1191 if (mul_abs)
1192 src1.negate = false;
1193
1194 mul->operands[1]->accept(this);
1195 src_reg src2 = fix_3src_operand(this->result);
1196 src2.abs = mul_abs;
1197 if (mul_abs)
1198 src2.negate = false;
1199
1200 this->result = src_reg(this, ir->type);
1201 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1202
1203 return true;
1204 }
1205
1206 bool
1207 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1208 {
1209 /* This optimization relies on CMP setting the destination to 0 when
1210 * false. Early hardware only sets the least significant bit, and
1211 * leaves the other bits undefined. So we can't use it.
1212 */
1213 if (devinfo->gen < 6)
1214 return false;
1215
1216 ir_expression *const cmp = ir->operands[0]->as_expression();
1217
1218 if (cmp == NULL)
1219 return false;
1220
1221 switch (cmp->operation) {
1222 case ir_binop_less:
1223 case ir_binop_greater:
1224 case ir_binop_lequal:
1225 case ir_binop_gequal:
1226 case ir_binop_equal:
1227 case ir_binop_nequal:
1228 break;
1229
1230 default:
1231 return false;
1232 }
1233
1234 cmp->operands[0]->accept(this);
1235 const src_reg cmp_src0 = this->result;
1236
1237 cmp->operands[1]->accept(this);
1238 const src_reg cmp_src1 = this->result;
1239
1240 this->result = src_reg(this, ir->type);
1241
1242 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1243 brw_conditional_for_comparison(cmp->operation)));
1244
1245 /* If the comparison is false, this->result will just happen to be zero.
1246 */
1247 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1248 this->result, src_reg(1.0f));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 inst->predicate_inverse = true;
1251
1252 return true;
1253 }
1254
1255 void
1256 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1257 src_reg src0, src_reg src1)
1258 {
1259 vec4_instruction *inst;
1260
1261 if (devinfo->gen >= 6) {
1262 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1263 inst->conditional_mod = conditionalmod;
1264 } else {
1265 emit(CMP(dst, src0, src1, conditionalmod));
1266
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269 }
1270 }
1271
1272 void
1273 vec4_visitor::emit_lrp(const dst_reg &dst,
1274 const src_reg &x, const src_reg &y, const src_reg &a)
1275 {
1276 if (devinfo->gen >= 6) {
1277 /* Note that the instruction's argument order is reversed from GLSL
1278 * and the IR.
1279 */
1280 emit(LRP(dst,
1281 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1282 } else {
1283 /* Earlier generations don't support three source operations, so we
1284 * need to emit x*(1-a) + y*a.
1285 */
1286 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 y_times_a.writemask = dst.writemask;
1290 one_minus_a.writemask = dst.writemask;
1291 x_times_one_minus_a.writemask = dst.writemask;
1292
1293 emit(MUL(y_times_a, y, a));
1294 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1295 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1296 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1297 }
1298 }
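
/* The arithmetic of the pre-gen6 fallback above, as a one-line scalar
 * reference (this is GLSL's mix()).  On gen6+ a single LRP computes the same
 * thing, with the argument order reversed as noted above.  Hypothetical
 * helper, never compiled.
 */
#if 0
static float
example_lrp(float x, float y, float a)
{
   return x * (1.0f - a) + y * a;
}
#endif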
1299
1300 /**
1301 * Emits the instructions needed to perform a pull constant load. before_block
1302 * and before_inst can be NULL, in which case the instructions will be appended
1303 * to the end of the instruction list.
1304 */
1305 void
1306 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1307 src_reg surf_index,
1308 src_reg offset_reg,
1309 bblock_t *before_block,
1310 vec4_instruction *before_inst)
1311 {
1312 assert((before_inst == NULL && before_block == NULL) ||
1313 (before_inst && before_block));
1314
1315 vec4_instruction *pull;
1316
1317 if (devinfo->gen >= 9) {
1318 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1319 src_reg header(this, glsl_type::uvec4_type, 2);
1320
1321 pull = new(mem_ctx)
1322 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1323 dst_reg(header));
1324
1325 if (before_inst)
1326 emit_before(before_block, before_inst, pull);
1327 else
1328 emit(pull);
1329
1330 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1331 offset_reg.type);
1332 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1333
1334 if (before_inst)
1335 emit_before(before_block, before_inst, pull);
1336 else
1337 emit(pull);
1338
1339 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1340 dst,
1341 surf_index,
1342 header);
1343 pull->mlen = 2;
1344 pull->header_size = 1;
1345 } else if (devinfo->gen >= 7) {
1346 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1347
1348 grf_offset.type = offset_reg.type;
1349
1350 pull = MOV(grf_offset, offset_reg);
1351
1352 if (before_inst)
1353 emit_before(before_block, before_inst, pull);
1354 else
1355 emit(pull);
1356
1357 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1358 dst,
1359 surf_index,
1360 src_reg(grf_offset));
1361 pull->mlen = 1;
1362 } else {
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1364 dst,
1365 surf_index,
1366 offset_reg);
1367 pull->base_mrf = 14;
1368 pull->mlen = 1;
1369 }
1370
1371 if (before_inst)
1372 emit_before(before_block, before_inst, pull);
1373 else
1374 emit(pull);
1375 }
1376
1377 void
1378 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1379 {
1380 const src_reg chan_index(this, glsl_type::uint_type);
1381
1382 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1383 ->force_writemask_all = true;
1384 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1385 ->force_writemask_all = true;
1386 }
1387
1388 void
1389 vec4_visitor::visit(ir_expression *ir)
1390 {
1391 unsigned int operand;
1392 src_reg op[ARRAY_SIZE(ir->operands)];
1393 vec4_instruction *inst;
1394
1395 if (ir->operation == ir_binop_add) {
1396 if (try_emit_mad(ir))
1397 return;
1398 }
1399
1400 if (ir->operation == ir_unop_b2f) {
1401 if (try_emit_b2f_of_compare(ir))
1402 return;
1403 }
1404
1405 /* Storage for our result. Ideally for an assignment we'd be using
1406 * the actual storage for the result here, instead.
1407 */
1408 dst_reg result_dst(this, ir->type);
1409 src_reg result_src(result_dst);
1410
1411 if (ir->operation == ir_triop_csel) {
1412 ir->operands[1]->accept(this);
1413 op[1] = this->result;
1414 ir->operands[2]->accept(this);
1415 op[2] = this->result;
1416
1417 enum brw_predicate predicate;
1418 emit_bool_to_cond_code(ir->operands[0], &predicate);
1419 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1420 inst->predicate = predicate;
1421 this->result = result_src;
1422 return;
1423 }
1424
1425 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1426 this->result.file = BAD_FILE;
1427 ir->operands[operand]->accept(this);
1428 if (this->result.file == BAD_FILE) {
1429 fprintf(stderr, "Failed to get tree for expression operand:\n");
1430 ir->operands[operand]->fprint(stderr);
1431 exit(1);
1432 }
1433 op[operand] = this->result;
1434
1435 /* Matrix expression operands should have been broken down to vector
1436 * operations already.
1437 */
1438 assert(!ir->operands[operand]->type->is_matrix());
1439 }
1440
1441 /* If nothing special happens, this is the result. */
1442 this->result = result_src;
1443
1444 switch (ir->operation) {
1445 case ir_unop_logic_not:
1446 emit(NOT(result_dst, op[0]));
1447 break;
1448 case ir_unop_neg:
1449 op[0].negate = !op[0].negate;
1450 emit(MOV(result_dst, op[0]));
1451 break;
1452 case ir_unop_abs:
1453 op[0].abs = true;
1454 op[0].negate = false;
1455 emit(MOV(result_dst, op[0]));
1456 break;
1457
1458 case ir_unop_sign:
1459 if (ir->type->is_float()) {
1460 /* AND(val, 0x80000000) gives the sign bit.
1461 *
1462 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1463 * zero.
1464 */
1465 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1466
1467 op[0].type = BRW_REGISTER_TYPE_UD;
1468 result_dst.type = BRW_REGISTER_TYPE_UD;
1469 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1470
1471 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1472 inst->predicate = BRW_PREDICATE_NORMAL;
1473
1474 this->result.type = BRW_REGISTER_TYPE_F;
1475 } else {
1476 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1477 * -> non-negative val generates 0x00000000.
1478 * Predicated OR sets 1 if val is positive.
1479 */
1480 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1481
1482 emit(ASR(result_dst, op[0], src_reg(31)));
1483
1484 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1485 inst->predicate = BRW_PREDICATE_NORMAL;
1486 }
1487 break;
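
#if 0
      /* A worked example of the float bit trick above for val = -2.5f (bit
       * pattern 0xC0200000): the AND keeps only the sign bit, and the
       * predicated OR with 0x3F800000 (the bits of 1.0f) yields 0xBF800000,
       * i.e. -1.0f.  When val == 0.0f the CMP leaves the predicate false, the
       * OR is skipped and the result stays 0.  The integer branch works the
       * same way: ASR by 31 gives -1 for negative inputs and 0 otherwise, and
       * the predicated OR with 1 turns that 0 into +1 only when val > 0.
       * Illustrative only; never compiled.
       */
      {
         float val = -2.5f;
         uint32_t bits;
         memcpy(&bits, &val, sizeof(bits));           /* 0xC0200000 */
         bits = (bits & 0x80000000u) | 0x3f800000u;   /* 0xBF800000 == -1.0f */
         (void)bits;
      }
#endif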
1488
1489 case ir_unop_rcp:
1490 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1491 break;
1492
1493 case ir_unop_exp2:
1494 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1495 break;
1496 case ir_unop_log2:
1497 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1498 break;
1499 case ir_unop_exp:
1500 case ir_unop_log:
1501 unreachable("not reached: should be handled by ir_explog_to_explog2");
1502 case ir_unop_sin:
1503 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1504 break;
1505 case ir_unop_cos:
1506 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_dFdx:
1510 case ir_unop_dFdx_coarse:
1511 case ir_unop_dFdx_fine:
1512 case ir_unop_dFdy:
1513 case ir_unop_dFdy_coarse:
1514 case ir_unop_dFdy_fine:
1515 unreachable("derivatives not valid in vertex shader");
1516
1517 case ir_unop_bitfield_reverse:
1518 emit(BFREV(result_dst, op[0]));
1519 break;
1520 case ir_unop_bit_count:
1521 emit(CBIT(result_dst, op[0]));
1522 break;
1523 case ir_unop_find_msb: {
1524 src_reg temp = src_reg(this, glsl_type::uint_type);
1525
1526 inst = emit(FBH(dst_reg(temp), op[0]));
1527 inst->dst.writemask = WRITEMASK_XYZW;
1528
1529 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1530 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1531 * subtract the result from 31 to convert the MSB count into an LSB count.
1532 */
1533
1534 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1535 temp.swizzle = BRW_SWIZZLE_NOOP;
1536 emit(MOV(result_dst, temp));
1537
1538 src_reg src_tmp = src_reg(result_dst);
1539 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1540
1541 src_tmp.negate = true;
1542 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1543 inst->predicate = BRW_PREDICATE_NORMAL;
1544 break;
1545 }
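
#if 0
      {
         /* A worked example of the FBH fix-up above: FBH counts from the MSB
          * side (i.e. leading zeros), so for 0x00000100 it returns 23, and
          * 31 - 23 = 8 is the LSB-side index findMSB() expects.  For an input
          * of 0, FBH returns 0xFFFFFFFF; read back as signed that is -1, so
          * the CMP against -1 fails, the predicated ADD is skipped, and -1
          * falls through unchanged, which is GLSL's "no bits set" result.
          * Illustrative only; never compiled.
          */
         int fbh = 23;              /* FBH(0x00000100) */
         int find_msb = 31 - fbh;   /* == 8 */
         (void)find_msb;
      }
#endif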
1546 case ir_unop_find_lsb:
1547 emit(FBL(result_dst, op[0]));
1548 break;
1549 case ir_unop_saturate:
1550 inst = emit(MOV(result_dst, op[0]));
1551 inst->saturate = true;
1552 break;
1553
1554 case ir_unop_noise:
1555 unreachable("not reached: should be handled by lower_noise");
1556
1557 case ir_binop_add:
1558 emit(ADD(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_sub:
1561 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1562
1563 case ir_binop_mul:
1564 if (devinfo->gen < 8 && ir->type->is_integer()) {
1565 /* For integer multiplication, the MUL uses the low 16 bits of one of
1566 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1567 * accumulates the contribution of the upper 16 bits of that
1568 * operand. If we can determine that one of the args is in the low
1569 * 16 bits, though, we can just emit a single MUL.
1570 */
1571 if (ir->operands[0]->is_uint16_constant()) {
1572 if (devinfo->gen < 7)
1573 emit(MUL(result_dst, op[0], op[1]));
1574 else
1575 emit(MUL(result_dst, op[1], op[0]));
1576 } else if (ir->operands[1]->is_uint16_constant()) {
1577 if (devinfo->gen < 7)
1578 emit(MUL(result_dst, op[1], op[0]));
1579 else
1580 emit(MUL(result_dst, op[0], op[1]));
1581 } else {
1582 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1583
1584 emit(MUL(acc, op[0], op[1]));
1585 emit(MACH(dst_null_d(), op[0], op[1]));
1586 emit(MOV(result_dst, src_reg(acc)));
1587 }
1588 } else {
1589 emit(MUL(result_dst, op[0], op[1]));
1590 }
1591 break;
1592 case ir_binop_imul_high: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(result_dst, op[0], op[1]));
1597 break;
1598 }
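
#if 0
   {
      /* A scalar sketch of the MUL/MACH split used above (illustrative only,
       * never compiled): the full 32x32 product decomposes into a low-16-bit
       * partial product (what MUL computes) plus the upper-16-bit
       * contribution shifted up by 16 (what MACH adds in, leaving the high
       * 32 bits in its destination and the completed low 32 bits in the
       * accumulator).  When one operand is known to fit in 16 bits, the
       * second term is zero, which is why a lone MUL suffices in that case.
       */
      uint32_t a = 0x12345678u, b = 0x9abcdef0u;
      uint64_t low_part  = (uint64_t)a * (b & 0xffffu);
      uint64_t high_part = ((uint64_t)a * (b >> 16)) << 16;
      assert(low_part + high_part == (uint64_t)a * b);
   }
#endif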
1599 case ir_binop_div:
1600 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1601 assert(ir->type->is_integer());
1602 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1603 break;
1604
1605 case ir_binop_carry:
1606 unreachable("Should have been lowered by carry_to_arith().");
1607
1608 case ir_binop_borrow:
1609 unreachable("Should have been lowered by borrow_to_arith().");
1610
1611 case ir_binop_mod:
1612 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1613 assert(ir->type->is_integer());
1614 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1615 break;
1616
1617 case ir_binop_less:
1618 case ir_binop_greater:
1619 case ir_binop_lequal:
1620 case ir_binop_gequal:
1621 case ir_binop_equal:
1622 case ir_binop_nequal: {
1623 if (devinfo->gen <= 5) {
1624 resolve_bool_comparison(ir->operands[0], &op[0]);
1625 resolve_bool_comparison(ir->operands[1], &op[1]);
1626 }
1627 emit(CMP(result_dst, op[0], op[1],
1628 brw_conditional_for_comparison(ir->operation)));
1629 break;
1630 }
1631
1632 case ir_binop_all_equal:
1633 if (devinfo->gen <= 5) {
1634 resolve_bool_comparison(ir->operands[0], &op[0]);
1635 resolve_bool_comparison(ir->operands[1], &op[1]);
1636 }
1637
1638 /* "==" operator producing a scalar boolean. */
1639 if (ir->operands[0]->type->is_vector() ||
1640 ir->operands[1]->type->is_vector()) {
1641 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1642 emit(MOV(result_dst, src_reg(0)));
1643 inst = emit(MOV(result_dst, src_reg(~0)));
1644 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1645 } else {
1646 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1647 }
1648 break;
1649 case ir_binop_any_nequal:
1650 if (devinfo->gen <= 5) {
1651 resolve_bool_comparison(ir->operands[0], &op[0]);
1652 resolve_bool_comparison(ir->operands[1], &op[1]);
1653 }
1654
1655 /* "!=" operator producing a scalar boolean. */
1656 if (ir->operands[0]->type->is_vector() ||
1657 ir->operands[1]->type->is_vector()) {
1658 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1659
1660 emit(MOV(result_dst, src_reg(0)));
1661 inst = emit(MOV(result_dst, src_reg(~0)));
1662 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1663 } else {
1664 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1665 }
1666 break;
1667
1668 case ir_unop_any:
1669 if (devinfo->gen <= 5) {
1670 resolve_bool_comparison(ir->operands[0], &op[0]);
1671 }
1672 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1673 emit(MOV(result_dst, src_reg(0)));
1674
1675 inst = emit(MOV(result_dst, src_reg(~0)));
1676 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1677 break;
1678
1679 case ir_binop_logic_xor:
1680 emit(XOR(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_logic_or:
1684 emit(OR(result_dst, op[0], op[1]));
1685 break;
1686
1687 case ir_binop_logic_and:
1688 emit(AND(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_dot:
1692 assert(ir->operands[0]->type->is_vector());
1693 assert(ir->operands[0]->type == ir->operands[1]->type);
1694 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1695 break;
1696
1697 case ir_unop_sqrt:
1698 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1699 break;
1700 case ir_unop_rsq:
1701 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1702 break;
1703
1704 case ir_unop_bitcast_i2f:
1705 case ir_unop_bitcast_u2f:
1706 this->result = op[0];
1707 this->result.type = BRW_REGISTER_TYPE_F;
1708 break;
1709
1710 case ir_unop_bitcast_f2i:
1711 this->result = op[0];
1712 this->result.type = BRW_REGISTER_TYPE_D;
1713 break;
1714
1715 case ir_unop_bitcast_f2u:
1716 this->result = op[0];
1717 this->result.type = BRW_REGISTER_TYPE_UD;
1718 break;
1719
1720 case ir_unop_i2f:
1721 case ir_unop_i2u:
1722 case ir_unop_u2i:
1723 case ir_unop_u2f:
1724 case ir_unop_f2i:
1725 case ir_unop_f2u:
1726 emit(MOV(result_dst, op[0]));
1727 break;
1728 case ir_unop_b2i:
1729 case ir_unop_b2f:
1730 if (devinfo->gen <= 5) {
1731 resolve_bool_comparison(ir->operands[0], &op[0]);
1732 }
1733 emit(MOV(result_dst, negate(op[0])));
1734 break;
1735 case ir_unop_f2b:
1736 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1737 break;
1738 case ir_unop_i2b:
1739 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1740 break;
1741
1742 case ir_unop_trunc:
1743 emit(RNDZ(result_dst, op[0]));
1744 break;
1745 case ir_unop_ceil: {
1746 src_reg tmp = src_reg(this, ir->type);
1747 op[0].negate = !op[0].negate;
1748 emit(RNDD(dst_reg(tmp), op[0]));
1749 tmp.negate = true;
1750 emit(MOV(result_dst, tmp));
1751 }
1752 break;
1753 case ir_unop_floor:
1754 inst = emit(RNDD(result_dst, op[0]));
1755 break;
1756 case ir_unop_fract:
1757 inst = emit(FRC(result_dst, op[0]));
1758 break;
1759 case ir_unop_round_even:
1760 emit(RNDE(result_dst, op[0]));
1761 break;
1762
1763 case ir_binop_min:
1764 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1765 break;
1766 case ir_binop_max:
1767 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1768 break;
1769
1770 case ir_binop_pow:
1771 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1772 break;
1773
1774 case ir_unop_bit_not:
1775 inst = emit(NOT(result_dst, op[0]));
1776 break;
1777 case ir_binop_bit_and:
1778 inst = emit(AND(result_dst, op[0], op[1]));
1779 break;
1780 case ir_binop_bit_xor:
1781 inst = emit(XOR(result_dst, op[0], op[1]));
1782 break;
1783 case ir_binop_bit_or:
1784 inst = emit(OR(result_dst, op[0], op[1]));
1785 break;
1786
1787 case ir_binop_lshift:
1788 inst = emit(SHL(result_dst, op[0], op[1]));
1789 break;
1790
1791 case ir_binop_rshift:
1792 if (ir->type->base_type == GLSL_TYPE_INT)
1793 inst = emit(ASR(result_dst, op[0], op[1]));
1794 else
1795 inst = emit(SHR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_bfm:
1799 emit(BFI1(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_ubo_load: {
1803 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1804 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1805 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1806 src_reg offset;
1807
1808 /* Now, load the vector from that offset. */
1809 assert(ir->type->is_vector() || ir->type->is_scalar());
1810
1811 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1812 packed_consts.type = result.type;
1813 src_reg surf_index;
1814
1815 if (const_uniform_block) {
1816 /* The block index is a constant, so just emit the binding table entry
1817 * as an immediate.
1818 */
1819 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1820 const_uniform_block->value.u[0]);
1821 } else {
1822 /* The block index is not a constant. Evaluate the index expression
1823 * per-channel and add the base UBO index; we have to select a value
1824 * from any live channel.
1825 */
1826 surf_index = src_reg(this, glsl_type::uint_type);
1827 emit(ADD(dst_reg(surf_index), op[0],
1828 src_reg(prog_data->base.binding_table.ubo_start)));
1829 emit_uniformize(dst_reg(surf_index), surf_index);
1830
1831 /* Assume this may touch any UBO. It would be nice to provide
1832 * a tighter bound, but the array information is already lowered away.
1833 */
1834 brw_mark_surface_used(&prog_data->base,
1835 prog_data->base.binding_table.ubo_start +
1836 shader_prog->NumUniformBlocks - 1);
1837 }
1838
1839 if (const_offset_ir) {
1840 if (devinfo->gen >= 8) {
1841 /* Store the offset in a GRF so we can send-from-GRF. */
1842 offset = src_reg(this, glsl_type::int_type);
1843 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1844 } else {
1845 /* Immediates are fine on older generations since they'll be moved
1846 * to a (potentially fake) MRF at the generator level.
1847 */
1848 offset = src_reg(const_offset / 16);
1849 }
1850 } else {
1851 offset = src_reg(this, glsl_type::uint_type);
1852 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1853 }
1854
1855 emit_pull_constant_load_reg(dst_reg(packed_consts),
1856 surf_index,
1857 offset,
1858 NULL, NULL /* before_block/inst */);
1859
1860 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1861 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1862 const_offset % 16 / 4,
1863 const_offset % 16 / 4,
1864 const_offset % 16 / 4);
1865
1866 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1867 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1868 emit(CMP(result_dst, packed_consts, src_reg(0u),
1869 BRW_CONDITIONAL_NZ));
1870 } else {
1871 emit(MOV(result_dst, packed_consts));
1872 }
1873 break;
1874 }
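
#if 0
   {
      /* A sketch of the offset arithmetic above (illustrative only, never
       * compiled): the pull-constant load fetches whole 16-byte vec4s, so a
       * UBO byte offset splits into a vec4 index (offset / 16, or SHR by 4
       * in the dynamic case) and a starting component within that vec4
       * ((offset % 16) / 4), which is applied by rotating the swizzle.
       * For example, a vec2 at byte offset 40 reads vec4 #2 and starts at
       * the z component.
       */
      unsigned const_offset = 40;
      unsigned vec4_index = const_offset / 16;       /* 2 */
      unsigned first_comp = const_offset % 16 / 4;   /* 2, i.e. the z channel */
      assert(vec4_index == 2 && first_comp == 2);
   }
#endif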
1875
1876 case ir_binop_vector_extract:
1877 unreachable("should have been lowered by vec_index_to_cond_assign");
1878
1879 case ir_triop_fma:
1880 op[0] = fix_3src_operand(op[0]);
1881 op[1] = fix_3src_operand(op[1]);
1882 op[2] = fix_3src_operand(op[2]);
1883 /* Note that the instruction's argument order is reversed from GLSL
1884 * and the IR.
1885 */
1886 emit(MAD(result_dst, op[2], op[1], op[0]));
1887 break;
1888
1889 case ir_triop_lrp:
1890 emit_lrp(result_dst, op[0], op[1], op[2]);
1891 break;
1892
1893 case ir_triop_csel:
1894 unreachable("already handled above");
1895 break;
1896
1897 case ir_triop_bfi:
1898 op[0] = fix_3src_operand(op[0]);
1899 op[1] = fix_3src_operand(op[1]);
1900 op[2] = fix_3src_operand(op[2]);
1901 emit(BFI2(result_dst, op[0], op[1], op[2]));
1902 break;
1903
1904 case ir_triop_bitfield_extract:
1905 op[0] = fix_3src_operand(op[0]);
1906 op[1] = fix_3src_operand(op[1]);
1907 op[2] = fix_3src_operand(op[2]);
1908 /* Note that the instruction's argument order is reversed from GLSL
1909 * and the IR.
1910 */
1911 emit(BFE(result_dst, op[2], op[1], op[0]));
1912 break;
1913
1914 case ir_triop_vector_insert:
1915 unreachable("should have been lowered by lower_vector_insert");
1916
1917 case ir_quadop_bitfield_insert:
1918 unreachable("not reached: should be handled by "
1919 "bitfield_insert_to_bfm_bfi\n");
1920
1921 case ir_quadop_vector:
1922 unreachable("not reached: should be handled by lower_quadop_vector");
1923
1924 case ir_unop_pack_half_2x16:
1925 emit_pack_half_2x16(result_dst, op[0]);
1926 break;
1927 case ir_unop_unpack_half_2x16:
1928 emit_unpack_half_2x16(result_dst, op[0]);
1929 break;
1930 case ir_unop_unpack_unorm_4x8:
1931 emit_unpack_unorm_4x8(result_dst, op[0]);
1932 break;
1933 case ir_unop_unpack_snorm_4x8:
1934 emit_unpack_snorm_4x8(result_dst, op[0]);
1935 break;
1936 case ir_unop_pack_unorm_4x8:
1937 emit_pack_unorm_4x8(result_dst, op[0]);
1938 break;
1939 case ir_unop_pack_snorm_4x8:
1940 emit_pack_snorm_4x8(result_dst, op[0]);
1941 break;
1942 case ir_unop_pack_snorm_2x16:
1943 case ir_unop_pack_unorm_2x16:
1944 case ir_unop_unpack_snorm_2x16:
1945 case ir_unop_unpack_unorm_2x16:
1946 unreachable("not reached: should be handled by lower_packing_builtins");
1947 case ir_unop_unpack_half_2x16_split_x:
1948 case ir_unop_unpack_half_2x16_split_y:
1949 case ir_binop_pack_half_2x16_split:
1950 case ir_unop_interpolate_at_centroid:
1951 case ir_binop_interpolate_at_sample:
1952 case ir_binop_interpolate_at_offset:
1953 unreachable("not reached: should not occur in vertex shader");
1954 case ir_binop_ldexp:
1955 unreachable("not reached: should be handled by ldexp_to_arith()");
1956 case ir_unop_d2f:
1957 case ir_unop_f2d:
1958 case ir_unop_d2i:
1959 case ir_unop_i2d:
1960 case ir_unop_d2u:
1961 case ir_unop_u2d:
1962 case ir_unop_d2b:
1963 case ir_unop_pack_double_2x32:
1964 case ir_unop_unpack_double_2x32:
1965 case ir_unop_frexp_sig:
1966 case ir_unop_frexp_exp:
1967 unreachable("fp64 todo");
1968 }
1969 }
1970
1971
1972 void
1973 vec4_visitor::visit(ir_swizzle *ir)
1974 {
1975 /* Note that this is only swizzles in expressions, not those on the left
1976 * hand side of an assignment, which do write masking. See ir_assignment
1977 * for that.
1978 */
1979 const unsigned swz = brw_compose_swizzle(
1980 brw_swizzle_for_size(ir->type->vector_elements),
1981 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1982
1983 ir->val->accept(this);
1984 this->result = swizzle(this->result, swz);
1985 }
1986
1987 void
1988 vec4_visitor::visit(ir_dereference_variable *ir)
1989 {
1990 const struct glsl_type *type = ir->type;
1991 dst_reg *reg = variable_storage(ir->var);
1992
1993 if (!reg) {
1994 fail("Failed to find variable storage for %s\n", ir->var->name);
1995 this->result = src_reg(brw_null_reg());
1996 return;
1997 }
1998
1999 this->result = src_reg(*reg);
2000
2001 /* System values get their swizzle from the dst_reg writemask */
2002 if (ir->var->data.mode == ir_var_system_value)
2003 return;
2004
2005 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2006 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2007 }
2008
2009
2010 int
2011 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2012 {
2013 /* Under normal circumstances array elements are stored consecutively, so
2014 * the stride is equal to the size of the array element.
2015 */
2016 return type_size(ir->type);
2017 }
2018
2019
2020 void
2021 vec4_visitor::visit(ir_dereference_array *ir)
2022 {
2023 ir_constant *constant_index;
2024 src_reg src;
2025 int array_stride = compute_array_stride(ir);
2026
2027 constant_index = ir->array_index->constant_expression_value();
2028
2029 ir->array->accept(this);
2030 src = this->result;
2031
2032 if (constant_index) {
2033 src.reg_offset += constant_index->value.i[0] * array_stride;
2034 } else {
2035 /* Variable index array dereference. It eats the "vec4" of the
2036 * base of the array and an index that offsets the Mesa register
2037 * index.
2038 */
2039 ir->array_index->accept(this);
2040
2041 src_reg index_reg;
2042
2043 if (array_stride == 1) {
2044 index_reg = this->result;
2045 } else {
2046 index_reg = src_reg(this, glsl_type::int_type);
2047
2048 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2049 }
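/* For example, for an array of mat4, array_stride is 4 (registers per
 * element), so element i ends up 4 * i vec4 rows past the base of the
 * array.
 */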
2050
2051 if (src.reladdr) {
2052 src_reg temp = src_reg(this, glsl_type::int_type);
2053
2054 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2055
2056 index_reg = temp;
2057 }
2058
2059 src.reladdr = ralloc(mem_ctx, src_reg);
2060 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
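/* Hang a heap-allocated copy of the index off src.reladdr so later passes
 * (e.g. move_grf_array_access_to_scratch) can follow and rewrite the
 * reladdr chain.
 */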
2061 }
2062
2063 /* If the type is smaller than a vec4, replicate the last channel out. */
2064 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2065 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2066 else
2067 src.swizzle = BRW_SWIZZLE_NOOP;
2068 src.type = brw_type_for_base_type(ir->type);
2069
2070 this->result = src;
2071 }
2072
2073 void
2074 vec4_visitor::visit(ir_dereference_record *ir)
2075 {
2076 unsigned int i;
2077 const glsl_type *struct_type = ir->record->type;
2078 int offset = 0;
2079
2080 ir->record->accept(this);
2081
2082 for (i = 0; i < struct_type->length; i++) {
2083 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2084 break;
2085 offset += type_size(struct_type->fields.structure[i].type);
2086 }
2087
2088 /* If the type is smaller than a vec4, replicate the last channel out. */
2089 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2090 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2091 else
2092 this->result.swizzle = BRW_SWIZZLE_NOOP;
2093 this->result.type = brw_type_for_base_type(ir->type);
2094
2095 this->result.reg_offset += offset;
2096 }
2097
2098 /**
2099 * We want to be careful in assignment setup to hit the actual storage
2100 * instead of potentially using a temporary like we might with the
2101 * ir_dereference handler.
2102 */
2103 static dst_reg
2104 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2105 {
2106 /* The LHS must be a dereference. If the LHS is a variable indexed array
2107 * access of a vector, it must be separated into a series of conditional
2108 * moves before reaching this point (see ir_vec_index_to_cond_assign).
2109 */
2110 assert(ir->as_dereference());
2111 ir_dereference_array *deref_array = ir->as_dereference_array();
2112 if (deref_array) {
2113 assert(!deref_array->array->type->is_vector());
2114 }
2115
2116 /* Use the rvalue deref handler for the most part. We'll ignore
2117 * swizzles in it and write swizzles using writemask, though.
2118 */
2119 ir->accept(v);
2120 return dst_reg(v->result);
2121 }
2122
2123 void
2124 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2125 const struct glsl_type *type,
2126 enum brw_predicate predicate)
2127 {
2128 if (type->base_type == GLSL_TYPE_STRUCT) {
2129 for (unsigned int i = 0; i < type->length; i++) {
2130 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2131 }
2132 return;
2133 }
2134
2135 if (type->is_array()) {
2136 for (unsigned int i = 0; i < type->length; i++) {
2137 emit_block_move(dst, src, type->fields.array, predicate);
2138 }
2139 return;
2140 }
2141
2142 if (type->is_matrix()) {
2143 const struct glsl_type *vec_type;
2144
2145 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2146 type->vector_elements, 1);
2147
2148 for (int i = 0; i < type->matrix_columns; i++) {
2149 emit_block_move(dst, src, vec_type, predicate);
2150 }
2151 return;
2152 }
2153
2154 assert(type->is_scalar() || type->is_vector());
2155
2156 dst->type = brw_type_for_base_type(type);
2157 src->type = dst->type;
2158
2159 dst->writemask = (1 << type->vector_elements) - 1;
2160
2161 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2162
2163 vec4_instruction *inst = emit(MOV(*dst, *src));
2164 inst->predicate = predicate;
2165
2166 dst->reg_offset++;
2167 src->reg_offset++;
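/* dst and src are updated in place, so the struct/array/matrix loops
 * above continue with the next vec4 of the aggregate on the next call.
 */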
2168 }
2169
2170
2171 /* If the RHS processing resulted in an instruction generating a
2172 * temporary value, and it would be easy to rewrite the instruction to
2173 * generate its result right into the LHS instead, do so. This ends
2174 * up reliably removing instructions where it can be tricky to do so
2175 * later without real UD chain information.
2176 */
2177 bool
2178 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2179 dst_reg dst,
2180 src_reg src,
2181 vec4_instruction *pre_rhs_inst,
2182 vec4_instruction *last_rhs_inst)
2183 {
2184 /* This could be supported, but it would take more smarts. */
2185 if (ir->condition)
2186 return false;
2187
2188 if (pre_rhs_inst == last_rhs_inst)
2189 return false; /* No instructions generated to work with. */
2190
2191 /* Make sure the last instruction generated our source reg. */
2192 if (src.file != GRF ||
2193 src.file != last_rhs_inst->dst.file ||
2194 src.reg != last_rhs_inst->dst.reg ||
2195 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2196 src.reladdr ||
2197 src.abs ||
2198 src.negate ||
2199 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2200 return false;
2201
2202 /* Check that the last instruction fully initialized the channels
2203 * we want to use, in the order we want to use them. We could
2204 * potentially reswizzle the operands of many instructions so that
2205 * we could handle out of order channels, but don't yet.
2206 */
2207
2208 for (unsigned i = 0; i < 4; i++) {
2209 if (dst.writemask & (1 << i)) {
2210 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2211 return false;
2212
2213 if (BRW_GET_SWZ(src.swizzle, i) != i)
2214 return false;
2215 }
2216 }
2217
2218 /* Success! Rewrite the instruction. */
2219 last_rhs_inst->dst.file = dst.file;
2220 last_rhs_inst->dst.reg = dst.reg;
2221 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2222 last_rhs_inst->dst.reladdr = dst.reladdr;
2223 last_rhs_inst->dst.writemask &= dst.writemask;
2224
2225 return true;
2226 }
2227
2228 void
2229 vec4_visitor::visit(ir_assignment *ir)
2230 {
2231 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2232 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2233
2234 if (!ir->lhs->type->is_scalar() &&
2235 !ir->lhs->type->is_vector()) {
2236 ir->rhs->accept(this);
2237 src_reg src = this->result;
2238
2239 if (ir->condition) {
2240 emit_bool_to_cond_code(ir->condition, &predicate);
2241 }
2242
2243 /* emit_block_move doesn't account for swizzles in the source register.
2244 * This should be ok, since the source register is a structure or an
2245 * array, and those can't be swizzled. But double-check to be sure.
2246 */
2247 assert(src.swizzle ==
2248 (ir->rhs->type->is_matrix()
2249 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2250 : BRW_SWIZZLE_NOOP));
2251
2252 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2253 return;
2254 }
2255
2256 /* Now we're down to just a scalar/vector with writemasks. */
2257 int i;
2258
2259 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2260 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2261
2262 ir->rhs->accept(this);
2263
2264 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2265
2266 int swizzles[4];
2267 int src_chan = 0;
2268
2269 assert(ir->lhs->type->is_vector() ||
2270 ir->lhs->type->is_scalar());
2271 dst.writemask = ir->write_mask;
2272
2273 /* Swizzle a small RHS vector into the channels being written.
2274 *
2275 * GLSL IR treats write_mask as dictating how many channels are
2276 * present on the RHS, while our instructions need those channels to
2277 * appear in the slots of the vec4 they're written to.
2278 */
2279 for (int i = 0; i < 4; i++)
2280 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2281
2282 src_reg src = swizzle(this->result,
2283 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2284 swizzles[2], swizzles[3]));
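/* For example, a write mask of .yw turns a two-channel RHS into the
 * swizzle XXXY: the write to .y reads the RHS's first channel, the write
 * to .w reads its second, and the unwritten channels don't matter.
 */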
2285
2286 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2287 return;
2288 }
2289
2290 if (ir->condition) {
2291 emit_bool_to_cond_code(ir->condition, &predicate);
2292 }
2293
2294 for (i = 0; i < type_size(ir->lhs->type); i++) {
2295 vec4_instruction *inst = emit(MOV(dst, src));
2296 inst->predicate = predicate;
2297
2298 dst.reg_offset++;
2299 src.reg_offset++;
2300 }
2301 }
2302
2303 void
2304 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2305 {
2306 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2307 foreach_in_list(ir_constant, field_value, &ir->components) {
2308 emit_constant_values(dst, field_value);
2309 }
2310 return;
2311 }
2312
2313 if (ir->type->is_array()) {
2314 for (unsigned int i = 0; i < ir->type->length; i++) {
2315 emit_constant_values(dst, ir->array_elements[i]);
2316 }
2317 return;
2318 }
2319
2320 if (ir->type->is_matrix()) {
2321 for (int i = 0; i < ir->type->matrix_columns; i++) {
2322 float *vec = &ir->value.f[i * ir->type->vector_elements];
2323
2324 for (int j = 0; j < ir->type->vector_elements; j++) {
2325 dst->writemask = 1 << j;
2326 dst->type = BRW_REGISTER_TYPE_F;
2327
2328 emit(MOV(*dst, src_reg(vec[j])));
2329 }
2330 dst->reg_offset++;
2331 }
2332 return;
2333 }
2334
2335 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2336
2337 for (int i = 0; i < ir->type->vector_elements; i++) {
2338 if (!(remaining_writemask & (1 << i)))
2339 continue;
2340
2341 dst->writemask = 1 << i;
2342 dst->type = brw_type_for_base_type(ir->type);
2343
2344 /* Find other components that match the one we're about to
2345 * write. Emits fewer instructions for things like vec4(0.5,
2346 * 1.5, 1.5, 1.5).
2347 */
2348 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2349 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2350 if (ir->value.b[i] == ir->value.b[j])
2351 dst->writemask |= (1 << j);
2352 } else {
2353 /* u, i, and f storage all line up, so no need for a
2354 * switch case for comparing each type.
2355 */
2356 if (ir->value.u[i] == ir->value.u[j])
2357 dst->writemask |= (1 << j);
2358 }
2359 }
2360
2361 switch (ir->type->base_type) {
2362 case GLSL_TYPE_FLOAT:
2363 emit(MOV(*dst, src_reg(ir->value.f[i])));
2364 break;
2365 case GLSL_TYPE_INT:
2366 emit(MOV(*dst, src_reg(ir->value.i[i])));
2367 break;
2368 case GLSL_TYPE_UINT:
2369 emit(MOV(*dst, src_reg(ir->value.u[i])));
2370 break;
2371 case GLSL_TYPE_BOOL:
2372 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2373 break;
2374 default:
2375 unreachable("Non-float/uint/int/bool constant");
2376 }
2377
2378 remaining_writemask &= ~dst->writemask;
2379 }
2380 dst->reg_offset++;
2381 }
2382
2383 void
2384 vec4_visitor::visit(ir_constant *ir)
2385 {
2386 dst_reg dst = dst_reg(this, ir->type);
2387 this->result = src_reg(dst);
2388
2389 emit_constant_values(&dst, ir);
2390 }
2391
2392 void
2393 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2394 {
2395 ir_dereference *deref = static_cast<ir_dereference *>(
2396 ir->actual_parameters.get_head());
2397 ir_variable *location = deref->variable_referenced();
2398 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2399 location->data.binding);
2400
2401 /* Calculate the surface offset */
2402 src_reg offset(this, glsl_type::uint_type);
2403 ir_dereference_array *deref_array = deref->as_dereference_array();
2404 if (deref_array) {
2405 deref_array->array_index->accept(this);
2406
2407 src_reg tmp(this, glsl_type::uint_type);
2408 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2409 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2410 } else {
2411 offset = location->data.atomic.offset;
2412 }
2413
2414 /* Emit the appropriate machine instruction */
2415 const char *callee = ir->callee->function_name();
2416 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2417
2418 if (!strcmp("__intrinsic_atomic_read", callee)) {
2419 emit_untyped_surface_read(surf_index, dst, offset);
2420
2421 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2422 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2423 src_reg(), src_reg());
2424
2425 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2426 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2427 src_reg(), src_reg());
2428 }
2429 }
2430
2431 void
2432 vec4_visitor::visit(ir_call *ir)
2433 {
2434 const char *callee = ir->callee->function_name();
2435
2436 if (!strcmp("__intrinsic_atomic_read", callee) ||
2437 !strcmp("__intrinsic_atomic_increment", callee) ||
2438 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2439 visit_atomic_counter_intrinsic(ir);
2440 } else {
2441 unreachable("Unsupported intrinsic.");
2442 }
2443 }
2444
2445 src_reg
2446 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2447 {
2448 vec4_instruction *inst =
2449 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2450 dst_reg(this, glsl_type::uvec4_type));
2451 inst->base_mrf = 2;
2452 inst->src[1] = sampler;
2453
2454 int param_base;
2455
2456 if (devinfo->gen >= 9) {
2457 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2458 vec4_instruction *header_inst = new(mem_ctx)
2459 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2460 dst_reg(MRF, inst->base_mrf));
2461
2462 emit(header_inst);
2463
2464 inst->mlen = 2;
2465 inst->header_size = 1;
2466 param_base = inst->base_mrf + 1;
2467 } else {
2468 inst->mlen = 1;
2469 param_base = inst->base_mrf;
2470 }
2471
2472 /* The parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2473 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2474 int zero_mask = 0xf & ~coord_mask;
2475
2476 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2477 coordinate));
2478
2479 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2480 src_reg(0)));
2481
2482 emit(inst);
2483 return src_reg(inst->dst);
2484 }
2485
2486 static bool
2487 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2488 {
2489 if (devinfo->gen < 8 && !devinfo->is_haswell)
2490 return false;
2491
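/* Sampler indices above 15 don't fit in the 4-bit sampler index field of
 * the message descriptor, so the generator has to adjust the sampler
 * state pointer in the message header instead. A non-immediate
 * (dynamically indexed) sampler is treated conservatively the same way.
 */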
2492 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2493 }
2494
2495 void
2496 vec4_visitor::visit(ir_texture *ir)
2497 {
2498 uint32_t sampler =
2499 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2500
2501 ir_rvalue *nonconst_sampler_index =
2502 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2503
2504 /* Handle non-constant sampler array indexing */
2505 src_reg sampler_reg;
2506 if (nonconst_sampler_index) {
2507 /* The highest sampler which may be used by this operation is
2508 * the last element of the array. Mark it here, because the generator
2509 * doesn't have enough information to determine the bound.
2510 */
2511 uint32_t array_size = ir->sampler->as_dereference_array()
2512 ->array->type->array_size();
2513
2514 uint32_t max_used = sampler + array_size - 1;
2515 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2516 max_used += prog_data->base.binding_table.gather_texture_start;
2517 } else {
2518 max_used += prog_data->base.binding_table.texture_start;
2519 }
2520
2521 brw_mark_surface_used(&prog_data->base, max_used);
2522
2523 /* Emit code to evaluate the actual indexing expression */
2524 nonconst_sampler_index->accept(this);
2525 dst_reg temp(this, glsl_type::uint_type);
2526 emit(ADD(temp, this->result, src_reg(sampler)));
2527 emit_uniformize(temp, src_reg(temp));
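/* As with the UBO block index above, the sampler index applies to the
 * whole message, so emit_uniformize selects the value from a single live
 * channel.
 */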
2528
2529 sampler_reg = src_reg(temp);
2530 } else {
2531 /* Single sampler, or constant array index; the indexing expression
2532 * is just an immediate.
2533 */
2534 sampler_reg = src_reg(sampler);
2535 }
2536
2537 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2538 * emitting anything other than setting up the constant result.
2539 */
2540 if (ir->op == ir_tg4) {
2541 ir_constant *chan = ir->lod_info.component->as_constant();
2542 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2543 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2544 dst_reg result(this, ir->type);
2545 this->result = src_reg(result);
2546 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2547 return;
2548 }
2549 }
2550
2551 /* Should be lowered by do_lower_texture_projection */
2552 assert(!ir->projector);
2553
2554 /* Should be lowered */
2555 assert(!ir->offset || !ir->offset->type->is_array());
2556
2557 /* Generate code to compute all the subexpression trees. This has to be
2558 * done before loading any values into MRFs for the sampler message since
2559 * generating these values may involve SEND messages that need the MRFs.
2560 */
2561 src_reg coordinate;
2562 if (ir->coordinate) {
2563 ir->coordinate->accept(this);
2564 coordinate = this->result;
2565 }
2566
2567 src_reg shadow_comparitor;
2568 if (ir->shadow_comparitor) {
2569 ir->shadow_comparitor->accept(this);
2570 shadow_comparitor = this->result;
2571 }
2572
2573 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2574 src_reg offset_value;
2575 if (has_nonconstant_offset) {
2576 ir->offset->accept(this);
2577 offset_value = src_reg(this->result);
2578 }
2579
2580 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2581 src_reg lod, dPdx, dPdy, sample_index, mcs;
2582 switch (ir->op) {
2583 case ir_tex:
2584 lod = src_reg(0.0f);
2585 lod_type = glsl_type::float_type;
2586 break;
2587 case ir_txf:
2588 case ir_txl:
2589 case ir_txs:
2590 ir->lod_info.lod->accept(this);
2591 lod = this->result;
2592 lod_type = ir->lod_info.lod->type;
2593 break;
2594 case ir_query_levels:
2595 lod = src_reg(0);
2596 lod_type = glsl_type::int_type;
2597 break;
2598 case ir_txf_ms:
2599 ir->lod_info.sample_index->accept(this);
2600 sample_index = this->result;
2601 sample_index_type = ir->lod_info.sample_index->type;
2602
2603 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2604 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2605 else
2606 mcs = src_reg(0u);
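/* The compressed multisample fetch needs the MCS value loaded above;
 * passing a constant zero otherwise presumably matches what an
 * uncompressed surface's MCS would contain.
 */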
2607 break;
2608 case ir_txd:
2609 ir->lod_info.grad.dPdx->accept(this);
2610 dPdx = this->result;
2611
2612 ir->lod_info.grad.dPdy->accept(this);
2613 dPdy = this->result;
2614
2615 lod_type = ir->lod_info.grad.dPdx->type;
2616 break;
2617 case ir_txb:
2618 case ir_lod:
2619 case ir_tg4:
2620 break;
2621 }
2622
2623 enum opcode opcode;
2624 switch (ir->op) {
2625 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2626 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2627 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2628 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2629 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2630 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2631 case ir_tg4: opcode = has_nonconstant_offset
2632 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2633 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2634 case ir_txb:
2635 unreachable("TXB is not valid for vertex shaders.");
2636 case ir_lod:
2637 unreachable("LOD is not valid for vertex shaders.");
2638 default:
2639 unreachable("Unrecognized tex op");
2640 }
2641
2642 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2643 opcode, dst_reg(this, ir->type));
2644
2645 if (ir->offset != NULL && !has_nonconstant_offset) {
2646 inst->offset =
2647 brw_texture_offset(ir->offset->as_constant()->value.i,
2648 ir->offset->type->vector_elements);
2649 }
2650
2651 /* Stuff the channel select bits in the top of the texture offset */
2652 if (ir->op == ir_tg4)
2653 inst->offset |= gather_channel(ir, sampler) << 16;
2654
2655 /* The message header is necessary for:
2656 * - Gen4 (always)
2657 * - Gen9+ for selecting SIMD4x2
2658 * - Texel offsets
2659 * - Gather channel selection
2660 * - Sampler indices too large to fit in a 4-bit value.
2661 */
2662 inst->header_size =
2663 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2664 inst->offset != 0 || ir->op == ir_tg4 ||
2665 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2666 inst->base_mrf = 2;
2667 inst->mlen = inst->header_size + 1; /* always at least one */
2668 inst->dst.writemask = WRITEMASK_XYZW;
2669 inst->shadow_compare = ir->shadow_comparitor != NULL;
2670
2671 inst->src[1] = sampler_reg;
2672
2673 /* MRF for the first parameter */
2674 int param_base = inst->base_mrf + inst->header_size;
2675
2676 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2677 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2678 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2679 } else {
2680 /* Load the coordinate */
2681 /* FINISHME: gl_clamp_mask and saturate */
2682 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2683 int zero_mask = 0xf & ~coord_mask;
2684
2685 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2686 coordinate));
2687
2688 if (zero_mask != 0) {
2689 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2690 src_reg(0)));
2691 }
2692 /* Load the shadow comparitor */
2693 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2694 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2695 WRITEMASK_X),
2696 shadow_comparitor));
2697 inst->mlen++;
2698 }
2699
2700 /* Load the LOD info */
2701 if (ir->op == ir_tex || ir->op == ir_txl) {
2702 int mrf, writemask;
2703 if (devinfo->gen >= 5) {
2704 mrf = param_base + 1;
2705 if (ir->shadow_comparitor) {
2706 writemask = WRITEMASK_Y;
2707 /* mlen already incremented */
2708 } else {
2709 writemask = WRITEMASK_X;
2710 inst->mlen++;
2711 }
2712 } else /* devinfo->gen == 4 */ {
2713 mrf = param_base;
2714 writemask = WRITEMASK_W;
2715 }
2716 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2717 } else if (ir->op == ir_txf) {
2718 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2719 } else if (ir->op == ir_txf_ms) {
2720 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2721 sample_index));
2722 if (devinfo->gen >= 7) {
2723 /* MCS data is in the first channel of `mcs`, but we need to get it into
2724 * the .y channel of the second vec4 of params, so replicate .x across
2725 * the whole vec4 and then mask off everything except .y
2726 */
2727 mcs.swizzle = BRW_SWIZZLE_XXXX;
2728 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2729 mcs));
2730 }
2731 inst->mlen++;
2732 } else if (ir->op == ir_txd) {
2733 const glsl_type *type = lod_type;
2734
2735 if (devinfo->gen >= 5) {
2736 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2737 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2738 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2739 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2740 inst->mlen++;
2741
2742 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2743 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2744 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2745 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2746 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2747 inst->mlen++;
2748
2749 if (ir->shadow_comparitor) {
2750 emit(MOV(dst_reg(MRF, param_base + 2,
2751 ir->shadow_comparitor->type, WRITEMASK_Z),
2752 shadow_comparitor));
2753 }
2754 }
2755 } else /* devinfo->gen == 4 */ {
2756 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2758 inst->mlen += 2;
2759 }
2760 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2761 if (ir->shadow_comparitor) {
2762 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2763 shadow_comparitor));
2764 }
2765
2766 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2767 offset_value));
2768 inst->mlen++;
2769 }
2770 }
2771
2772 emit(inst);
2773
2774 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2775 * faces * layers, but the spec requires just the layer count.
2776 */
2777 if (ir->op == ir_txs) {
2778 glsl_type const *type = ir->sampler->type;
2779 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2780 type->sampler_array) {
2781 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2782 writemask(inst->dst, WRITEMASK_Z),
2783 src_reg(inst->dst), src_reg(6));
2784 }
2785 }
2786
2787 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2788 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2789 }
2790
2791 swizzle_result(ir, src_reg(inst->dst), sampler);
2792 }
2793
2794 /**
2795 * Apply workarounds for Gen6 gather with UINT/SINT
2796 */
2797 void
2798 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2799 {
2800 if (!wa)
2801 return;
2802
2803 int width = (wa & WA_8BIT) ? 8 : 16;
2804 dst_reg dst_f = dst;
2805 dst_f.type = BRW_REGISTER_TYPE_F;
2806
2807 /* Convert from UNORM to UINT */
2808 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2809 emit(MOV(dst, src_reg(dst_f)));
2810
2811 if (wa & WA_SIGN) {
2812 /* Reinterpret the UINT value as a signed INT value by
2813 * shifting the sign bit into place, then shifting back
2814 * preserving sign.
2815 */
2816 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2817 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
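/* For the 8-bit case this is a shift left by 24 followed by an arithmetic
 * shift right by 24, i.e. a plain sign extension of the low 8 bits.
 */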
2818 }
2819 }
2820
2821 /**
2822 * Set up the gather channel based on the swizzle, for gather4.
2823 */
2824 uint32_t
2825 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2826 {
2827 ir_constant *chan = ir->lod_info.component->as_constant();
2828 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2829 switch (swiz) {
2830 case SWIZZLE_X: return 0;
2831 case SWIZZLE_Y:
2832 /* gather4 sampler is broken for green channel on RG32F --
2833 * we must ask for blue instead.
2834 */
2835 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2836 return 2;
2837 return 1;
2838 case SWIZZLE_Z: return 2;
2839 case SWIZZLE_W: return 3;
2840 default:
2841 unreachable("Not reached"); /* zero, one swizzles handled already */
2842 }
2843 }
2844
2845 void
2846 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2847 {
2848 int s = key->tex.swizzles[sampler];
2849
2850 this->result = src_reg(this, ir->type);
2851 dst_reg swizzled_result(this->result);
2852
2853 if (ir->op == ir_query_levels) {
2854 /* # levels is in .w */
2855 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2856 emit(MOV(swizzled_result, orig_val));
2857 return;
2858 }
2859
2860 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2861 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2862 emit(MOV(swizzled_result, orig_val));
2863 return;
2864 }
2865
2866
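/* Split the GL texture swizzle into three writemasks: channels that copy
 * a component of the sampled result, channels forced to zero and channels
 * forced to one, so each group can be handled with a single MOV below.
 */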
2867 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2868 int swizzle[4] = {0};
2869
2870 for (int i = 0; i < 4; i++) {
2871 switch (GET_SWZ(s, i)) {
2872 case SWIZZLE_ZERO:
2873 zero_mask |= (1 << i);
2874 break;
2875 case SWIZZLE_ONE:
2876 one_mask |= (1 << i);
2877 break;
2878 default:
2879 copy_mask |= (1 << i);
2880 swizzle[i] = GET_SWZ(s, i);
2881 break;
2882 }
2883 }
2884
2885 if (copy_mask) {
2886 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2887 swizzled_result.writemask = copy_mask;
2888 emit(MOV(swizzled_result, orig_val));
2889 }
2890
2891 if (zero_mask) {
2892 swizzled_result.writemask = zero_mask;
2893 emit(MOV(swizzled_result, src_reg(0.0f)));
2894 }
2895
2896 if (one_mask) {
2897 swizzled_result.writemask = one_mask;
2898 emit(MOV(swizzled_result, src_reg(1.0f)));
2899 }
2900 }
2901
2902 void
2903 vec4_visitor::visit(ir_return *)
2904 {
2905 unreachable("not reached");
2906 }
2907
2908 void
2909 vec4_visitor::visit(ir_discard *)
2910 {
2911 unreachable("not reached");
2912 }
2913
2914 void
2915 vec4_visitor::visit(ir_if *ir)
2916 {
2917 /* Don't point the annotation at the if statement, because then it plus
2918 * the then and else blocks get printed.
2919 */
2920 this->base_ir = ir->condition;
2921
2922 if (devinfo->gen == 6) {
2923 emit_if_gen6(ir);
2924 } else {
2925 enum brw_predicate predicate;
2926 emit_bool_to_cond_code(ir->condition, &predicate);
2927 emit(IF(predicate));
2928 }
2929
2930 visit_instructions(&ir->then_instructions);
2931
2932 if (!ir->else_instructions.is_empty()) {
2933 this->base_ir = ir->condition;
2934 emit(BRW_OPCODE_ELSE);
2935
2936 visit_instructions(&ir->else_instructions);
2937 }
2938
2939 this->base_ir = ir->condition;
2940 emit(BRW_OPCODE_ENDIF);
2941 }
2942
2943 void
2944 vec4_visitor::visit(ir_emit_vertex *)
2945 {
2946 unreachable("not reached");
2947 }
2948
2949 void
2950 vec4_visitor::visit(ir_end_primitive *)
2951 {
2952 unreachable("not reached");
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_barrier *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2963 dst_reg dst, src_reg offset,
2964 src_reg src0, src_reg src1)
2965 {
2966 unsigned mlen = 0;
2967
2968 /* Set the atomic operation offset. */
2969 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2970 mlen++;
2971
2972 /* Set the atomic operation arguments. */
2973 if (src0.file != BAD_FILE) {
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2975 mlen++;
2976 }
2977
2978 if (src1.file != BAD_FILE) {
2979 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2980 mlen++;
2981 }
2982
2983 /* Emit the instruction. Note that this maps to the normal SIMD8
2984 * untyped atomic message on Ivy Bridge, but that's OK because
2985 * unused channels will be masked out.
2986 */
2987 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2988 brw_message_reg(0),
2989 src_reg(surf_index), src_reg(atomic_op));
2990 inst->mlen = mlen;
2991 }
2992
2993 void
2994 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2995 src_reg offset)
2996 {
2997 /* Set the surface read offset. */
2998 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2999
3000 /* Emit the instruction. Note that this maps to the normal SIMD8
3001 * untyped surface read message, but that's OK because unused
3002 * channels will be masked out.
3003 */
3004 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3005 brw_message_reg(0),
3006 src_reg(surf_index), src_reg(1));
3007 inst->mlen = 1;
3008 }
3009
3010 void
3011 vec4_visitor::emit_ndc_computation()
3012 {
3013 /* Get the position */
3014 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3015
3016 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3017 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3018 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3019
3020 current_annotation = "NDC";
3021 dst_reg ndc_w = ndc;
3022 ndc_w.writemask = WRITEMASK_W;
3023 src_reg pos_w = pos;
3024 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3025 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3026
3027 dst_reg ndc_xyz = ndc;
3028 ndc_xyz.writemask = WRITEMASK_XYZ;
3029
3030 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3031 }
3032
3033 void
3034 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3035 {
3036 if (devinfo->gen < 6 &&
3037 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3038 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3039 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3040 dst_reg header1_w = header1;
3041 header1_w.writemask = WRITEMASK_W;
3042
3043 emit(MOV(header1, 0u));
3044
3045 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3046 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3047
3048 current_annotation = "Point size";
3049 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3050 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
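/* Multiplying by 2^11 and masking with 0x7ff << 8 leaves the point width
 * as an 11-bit fixed-point value in bits 8..18 of the header dword, which
 * appears to be the U8.3 format the pre-gen6 VUE header expects.
 */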
3051 }
3052
3053 if (key->userclip_active) {
3054 current_annotation = "Clipping flags";
3055 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3056 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3057
3058 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3059 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3060 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3061
3062 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3063 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3064 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3065 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3066 }
3067
3068 /* i965 clipping workaround:
3069 * 1) Test for negative rhw
3070 * 2) If set,
3071 * set ndc = (0,0,0,0)
3072 * set ucp[6] = 1
3073 *
3074 * Later, clipping will detect ucp[6] and ensure the primitive is
3075 * clipped against all fixed planes.
3076 */
3077 if (devinfo->has_negative_rhw_bug) {
3078 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3079 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3080 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3081 vec4_instruction *inst;
3082 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3083 inst->predicate = BRW_PREDICATE_NORMAL;
3084 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3085 inst->predicate = BRW_PREDICATE_NORMAL;
3086 }
3087
3088 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3089 } else if (devinfo->gen < 6) {
3090 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3091 } else {
3092 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
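/* On gen6+ this header slot starts out zeroed and individual fields are
 * filled in below: layer in .y, viewport index in .z and point size in
 * .w, each only when the corresponding varying slot is valid.
 */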
3093 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3094 dst_reg reg_w = reg;
3095 reg_w.writemask = WRITEMASK_W;
3096 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3097 }
3098 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3099 dst_reg reg_y = reg;
3100 reg_y.writemask = WRITEMASK_Y;
3101 reg_y.type = BRW_REGISTER_TYPE_D;
3102 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3103 }
3104 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3105 dst_reg reg_z = reg;
3106 reg_z.writemask = WRITEMASK_Z;
3107 reg_z.type = BRW_REGISTER_TYPE_D;
3108 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3109 }
3110 }
3111 }
3112
3113 void
3114 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3115 {
3116 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3117 *
3118 * "If a linked set of shaders forming the vertex stage contains no
3119 * static write to gl_ClipVertex or gl_ClipDistance, but the
3120 * application has requested clipping against user clip planes through
3121 * the API, then the coordinate written to gl_Position is used for
3122 * comparison against the user clip planes."
3123 *
3124 * This function is only called if the shader didn't write to
3125 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3126 * if the user wrote to it; otherwise we use gl_Position.
3127 */
3128 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3129 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3130 clip_vertex = VARYING_SLOT_POS;
3131 }
3132
3133 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3134 ++i) {
3135 reg.writemask = 1 << i;
3136 emit(DP4(reg,
3137 src_reg(output_reg[clip_vertex]),
3138 src_reg(this->userplane[i + offset])));
3139 }
3140 }
3141
3142 vec4_instruction *
3143 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3144 {
3145 assert (varying < VARYING_SLOT_MAX);
3146 reg.type = output_reg[varying].type;
3147 current_annotation = output_reg_annotation[varying];
3148 /* Copy the register, saturating if necessary */
3149 return emit(MOV(reg, src_reg(output_reg[varying])));
3150 }
3151
3152 void
3153 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3154 {
3155 reg.type = BRW_REGISTER_TYPE_F;
3156
3157 switch (varying) {
3158 case VARYING_SLOT_PSIZ:
3159 {
3160 /* PSIZ is always in slot 0, and is coupled with other flags. */
3161 current_annotation = "indices, point width, clip flags";
3162 emit_psiz_and_flags(reg);
3163 break;
3164 }
3165 case BRW_VARYING_SLOT_NDC:
3166 current_annotation = "NDC";
3167 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3168 break;
3169 case VARYING_SLOT_POS:
3170 current_annotation = "gl_Position";
3171 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3172 break;
3173 case VARYING_SLOT_EDGE:
3174 /* This is present when doing unfilled polygons. We're supposed to copy
3175 * the edge flag from the user-provided vertex array
3176 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3177 * of that attribute (starts as 1.0f). This is then used in clipping to
3178 * determine which edges should be drawn as wireframe.
3179 */
3180 current_annotation = "edge flag";
3181 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3182 glsl_type::float_type, WRITEMASK_XYZW))));
3183 break;
3184 case BRW_VARYING_SLOT_PAD:
3185 /* No need to write to this slot */
3186 break;
3187 case VARYING_SLOT_COL0:
3188 case VARYING_SLOT_COL1:
3189 case VARYING_SLOT_BFC0:
3190 case VARYING_SLOT_BFC1: {
3191 /* These built-in varyings are only supported in compatibility mode,
3192 * and we only support GS in core profile. So, this must be a vertex
3193 * shader.
3194 */
3195 assert(stage == MESA_SHADER_VERTEX);
3196 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3197 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3198 inst->saturate = true;
3199 break;
3200 }
3201
3202 default:
3203 emit_generic_urb_slot(reg, varying);
3204 break;
3205 }
3206 }
3207
3208 static int
3209 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3210 {
3211 if (devinfo->gen >= 6) {
3212 /* URB data written (does not include the message header reg) must
3213 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3214 * section 5.4.3.2.2: URB_INTERLEAVED.
3215 *
3216 * URB entries are allocated on a multiple of 1024 bits, so an
3217 * extra 128 bits written here to make the end align to 256 is
3218 * no problem.
3219 */
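/* mlen includes the one-register message header, so the data portion is
 * mlen - 1 registers and must be even; an odd mlen is already aligned.
 * For example, a header plus 3 data registers (mlen 4) gets padded to a
 * header plus 4 data registers (mlen 5).
 */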
3220 if ((mlen % 2) != 1)
3221 mlen++;
3222 }
3223
3224 return mlen;
3225 }
3226
3227
3228 /**
3229 * Generates the VUE payload plus the necessary URB write instructions to
3230 * output it.
3231 *
3232 * The VUE layout is documented in Volume 2a.
3233 */
3234 void
3235 vec4_visitor::emit_vertex()
3236 {
3237 /* MRF 0 is reserved for the debugger, so start with message header
3238 * in MRF 1.
3239 */
3240 int base_mrf = 1;
3241 int mrf = base_mrf;
3242 /* In the process of generating our URB write message contents, we
3243 * may need to unspill a register or load from an array. Those
3244 * reads would use MRFs 14-15.
3245 */
3246 int max_usable_mrf = 13;
3247
3248 /* The following assertion verifies that max_usable_mrf causes an
3249 * even-numbered amount of URB write data, which will meet gen6's
3250 * requirements for length alignment.
3251 */
3252 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3253
3254 /* First mrf is the g0-based message header containing URB handles and
3255 * such.
3256 */
3257 emit_urb_write_header(mrf++);
3258
3259 if (devinfo->gen < 6) {
3260 emit_ndc_computation();
3261 }
3262
3263 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3264 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3265 current_annotation = "user clip distances";
3266
3267 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3268 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3269
3270 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3271 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3272 }
3273
3274 /* We may need to split this up into several URB writes, so do them in a
3275 * loop.
3276 */
3277 int slot = 0;
3278 bool complete = false;
3279 do {
3280 /* URB offset is in URB row increments, and each of our MRFs is half of
3281 * one of those, since we're doing interleaved writes.
3282 */
3283 int offset = slot / 2;
3284
3285 mrf = base_mrf + 1;
3286 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3287 emit_urb_slot(dst_reg(MRF, mrf++),
3288 prog_data->vue_map.slot_to_varying[slot]);
3289
3290 /* If this was max_usable_mrf, we can't fit anything more into this
3291 * URB WRITE.
3292 */
3293 if (mrf > max_usable_mrf) {
3294 slot++;
3295 break;
3296 }
3297 }
3298
3299 complete = slot >= prog_data->vue_map.num_slots;
3300 current_annotation = "URB write";
3301 vec4_instruction *inst = emit_urb_write_opcode(complete);
3302 inst->base_mrf = base_mrf;
3303 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3304 inst->offset += offset;
3305 } while(!complete);
3306 }
3307
3308
3309 src_reg
3310 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3311 src_reg *reladdr, int reg_offset)
3312 {
3313 /* Because we store the values to scratch interleaved like our
3314 * vertex data, we need to scale the vec4 index by 2.
3315 */
3316 int message_header_scale = 2;
3317
3318 /* Pre-gen6, the message header uses byte offsets instead of vec4
3319 * (16-byte) offset units.
3320 */
3321 if (devinfo->gen < 6)
3322 message_header_scale *= 16;
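/* For example, a vec4 at reg_offset 3 ends up at offset 6 in 16-byte
 * units on gen6+ (each vec4 occupies a full 256-bit scratch row because
 * of the interleaving), or at byte offset 96 on gen4-5.
 */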
3323
3324 if (reladdr) {
3325 src_reg index = src_reg(this, glsl_type::int_type);
3326
3327 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3328 src_reg(reg_offset)));
3329 emit_before(block, inst, MUL(dst_reg(index), index,
3330 src_reg(message_header_scale)));
3331
3332 return index;
3333 } else {
3334 return src_reg(reg_offset * message_header_scale);
3335 }
3336 }
3337
3338 src_reg
3339 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3340 src_reg *reladdr, int reg_offset)
3341 {
3342 if (reladdr) {
3343 src_reg index = src_reg(this, glsl_type::int_type);
3344
3345 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3346 src_reg(reg_offset)));
3347
3348 /* Pre-gen6, the message header uses byte offsets instead of vec4
3349 * (16-byte) offset units.
3350 */
3351 if (devinfo->gen < 6) {
3352 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3353 }
3354
3355 return index;
3356 } else if (devinfo->gen >= 8) {
3357 /* Store the offset in a GRF so we can send-from-GRF. */
3358 src_reg offset = src_reg(this, glsl_type::int_type);
3359 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3360 return offset;
3361 } else {
3362 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3363 return src_reg(reg_offset * message_header_scale);
3364 }
3365 }
3366
3367 /**
3368 * Emits an instruction before @inst to load the value named by @orig_src
3369 * from scratch space at @base_offset to @temp.
3370 *
3371 * @base_offset is measured in 32-byte units (the size of a register).
3372 */
3373 void
3374 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3375 dst_reg temp, src_reg orig_src,
3376 int base_offset)
3377 {
3378 int reg_offset = base_offset + orig_src.reg_offset;
3379 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3380 reg_offset);
3381
3382 emit_before(block, inst, SCRATCH_READ(temp, index));
3383 }
3384
3385 /**
3386 * Emits an instruction after @inst to store the value to be written
3387 * to @orig_dst to scratch space at @base_offset, from @temp.
3388 *
3389 * @base_offset is measured in 32-byte units (the size of a register).
3390 */
3391 void
3392 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3393 int base_offset)
3394 {
3395 int reg_offset = base_offset + inst->dst.reg_offset;
3396 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3397 reg_offset);
3398
3399 /* Create a temporary register to store *inst's result in.
3400 *
3401 * We have to be careful when MOVing from our temporary result register in
3402 * the scratch write. If we swizzle from channels of the temporary that
3403 * weren't initialized, it will confuse live interval analysis, which will
3404 * make spilling fail to make progress.
3405 */
3406 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3407 inst->dst.type),
3408 brw_swizzle_for_mask(inst->dst.writemask));
3409 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3410 inst->dst.writemask));
3411 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3412 write->predicate = inst->predicate;
3413 write->ir = inst->ir;
3414 write->annotation = inst->annotation;
3415 inst->insert_after(block, write);
3416
3417 inst->dst.file = temp.file;
3418 inst->dst.reg = temp.reg;
3419 inst->dst.reg_offset = temp.reg_offset;
3420 inst->dst.reladdr = NULL;
3421 }
3422
3423 /**
3424 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3425 * adds the scratch read(s) before \p inst. The function also checks for
3426 * recursive reladdr scratch accesses, issuing the corresponding scratch
3427 * loads and rewriting reladdr references accordingly.
3428 *
3429 * \return \p src if it did not require a scratch load, otherwise, the
3430 * register holding the result of the scratch load that the caller should
3431 * use to rewrite src.
3432 */
3433 src_reg
3434 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3435 vec4_instruction *inst, src_reg src)
3436 {
3437 /* Resolve recursive reladdr scratch access by calling ourselves
3438 * with src.reladdr
3439 */
3440 if (src.reladdr)
3441 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3442 *src.reladdr);
3443
3444 /* Now handle scratch access on src */
3445 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3446 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3447 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3448 src.reg = temp.reg;
3449 src.reg_offset = temp.reg_offset;
3450 src.reladdr = NULL;
3451 }
3452
3453 return src;
3454 }
3455
3456 /**
3457 * We can't generally support array access in GRF space, because a
3458 * single instruction's destination can only span 2 contiguous
3459 * registers. So, we send all GRF arrays that get variable index
3460 * access to scratch space.
3461 */
3462 void
3463 vec4_visitor::move_grf_array_access_to_scratch()
3464 {
3465 int scratch_loc[this->alloc.count];
3466 memset(scratch_loc, -1, sizeof(scratch_loc));
3467
3468 /* First, calculate the set of virtual GRFs that need to be punted
3469 * to scratch due to having any array access on them, and where in
3470 * scratch.
3471 */
3472 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3473 if (inst->dst.file == GRF && inst->dst.reladdr) {
3474 if (scratch_loc[inst->dst.reg] == -1) {
3475 scratch_loc[inst->dst.reg] = last_scratch;
3476 last_scratch += this->alloc.sizes[inst->dst.reg];
3477 }
3478
3479 for (src_reg *iter = inst->dst.reladdr;
3480 iter->reladdr;
3481 iter = iter->reladdr) {
3482 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3483 scratch_loc[iter->reg] = last_scratch;
3484 last_scratch += this->alloc.sizes[iter->reg];
3485 }
3486 }
3487 }
3488
3489 for (int i = 0 ; i < 3; i++) {
3490 for (src_reg *iter = &inst->src[i];
3491 iter->reladdr;
3492 iter = iter->reladdr) {
3493 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3494 scratch_loc[iter->reg] = last_scratch;
3495 last_scratch += this->alloc.sizes[iter->reg];
3496 }
3497 }
3498 }
3499 }
3500
3501 /* Now, for anything that will be accessed through scratch, rewrite
3502 * it to load/store. Note that this is a _safe list walk, because
3503 * we may generate a new scratch_write instruction after the one
3504 * we're processing.
3505 */
3506 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3507 /* Set up the annotation tracking for new generated instructions. */
3508 base_ir = inst->ir;
3509 current_annotation = inst->annotation;
3510
3511 /* First handle scratch access on the dst. Notice we have to handle
3512 * the case where the dst's reladdr also points to scratch space.
3513 */
3514 if (inst->dst.reladdr)
3515 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3516 *inst->dst.reladdr);
3517
3518 /* Now that we have handled any (possibly recursive) reladdr scratch
3519 * accesses for dst we can safely do the scratch write for dst itself
3520 */
3521 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3522 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3523
3524 /* Now handle scratch access on any src. In this case, since inst->src[i]
3525 * already is a src_reg, we can just call emit_resolve_reladdr with
3526 * inst->src[i] and it will take care of handling scratch loads for
3527 * both src and src.reladdr (recursively).
3528 */
3529 for (int i = 0 ; i < 3; i++) {
3530 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3531 inst->src[i]);
3532 }
3533 }
3534 }
3535
3536 /**
3537 * Emits an instruction before @inst to load the value named by @orig_src
3538 * from the pull constant buffer (surface) at @base_offset to @temp.
3539 */
3540 void
3541 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3542 dst_reg temp, src_reg orig_src,
3543 int base_offset)
3544 {
3545 int reg_offset = base_offset + orig_src.reg_offset;
3546 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3547 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3548 reg_offset);
3549
3550 emit_pull_constant_load_reg(temp,
3551 index,
3552 offset,
3553 block, inst);
3554 }
3555
3556 /**
3557 * Implements array access of uniforms by inserting a
3558 * PULL_CONSTANT_LOAD instruction.
3559 *
3560 * Unlike temporary GRF array access (which we don't support, due to
3561 * the difficulty of doing relative addressing on instruction
3562 * destinations), we could potentially do array access of uniforms
3563 * that were loaded in GRF space as push constants. In real-world
3564 * usage we've seen, though, the arrays being used are always larger
3565 * than we could load as push constants, so just always move all
3566 * uniform array access out to a pull constant buffer.
3567 */
3568 void
3569 vec4_visitor::move_uniform_array_access_to_pull_constants()
3570 {
3571 int pull_constant_loc[this->uniforms];
3572 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3573 bool nested_reladdr;
3574
3575 /* Walk through and find array access of uniforms. Put a copy of that
3576 * uniform in the pull constant buffer.
3577 *
3578 * Note that we don't move constant-indexed accesses to arrays. No
3579 * testing has been done of the performance impact of this choice.
3580 */
3581 do {
3582 nested_reladdr = false;
3583
3584 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3585 for (int i = 0 ; i < 3; i++) {
3586 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3587 continue;
3588
3589 int uniform = inst->src[i].reg;
3590
3591 if (inst->src[i].reladdr->reladdr)
3592 nested_reladdr = true; /* will need another pass */
3593
3594 /* If this array isn't already present in the pull constant buffer,
3595 * add it.
3596 */
3597 if (pull_constant_loc[uniform] == -1) {
3598 const gl_constant_value **values =
3599 &stage_prog_data->param[uniform * 4];
3600
3601 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3602
3603 assert(uniform < uniform_array_size);
3604 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3605 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3606 = values[j];
3607 }
3608 }
3609
3610 /* Set up the annotation tracking for new generated instructions. */
3611 base_ir = inst->ir;
3612 current_annotation = inst->annotation;
3613
3614 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3615
3616 emit_pull_constant_load(block, inst, temp, inst->src[i],
3617 pull_constant_loc[uniform]);
3618
3619 inst->src[i].file = temp.file;
3620 inst->src[i].reg = temp.reg;
3621 inst->src[i].reg_offset = temp.reg_offset;
3622 inst->src[i].reladdr = NULL;
3623 }
3624 }
3625 } while (nested_reladdr);
3626
3627 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3628 * no need to track them as larger-than-vec4 objects. This will be
3629 * relied on in cutting out unused uniform vectors from push
3630 * constants.
3631 */
3632 split_uniform_registers();
3633 }
3634
3635 void
3636 vec4_visitor::resolve_ud_negate(src_reg *reg)
3637 {
3638 if (reg->type != BRW_REGISTER_TYPE_UD ||
3639 !reg->negate)
3640 return;
3641
3642 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3643 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3644 *reg = temp;
3645 }
3646
3647 /**
3648 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3649 *
3650 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3651 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3652 */
3653 void
3654 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3655 {
3656 assert(devinfo->gen <= 5);
3657
3658 if (!rvalue->type->is_boolean())
3659 return;
3660
3661 src_reg and_result = src_reg(this, rvalue->type);
3662 src_reg neg_result = src_reg(this, rvalue->type);
3663 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3664 emit(MOV(dst_reg(neg_result), negate(and_result)));
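/* The AND isolates the defined LSB, and since the type is integral the
 * negate modifier turns 1 into -1 (all ones), so the result is exactly
 * 0 or ~0.
 */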
3665 *reg = neg_result;
3666 }
3667
3668 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3669 void *log_data,
3670 struct gl_program *prog,
3671 const struct brw_vue_prog_key *key,
3672 struct brw_vue_prog_data *prog_data,
3673 struct gl_shader_program *shader_prog,
3674 gl_shader_stage stage,
3675 void *mem_ctx,
3676 bool no_spills,
3677 int shader_time_index)
3678 : backend_shader(compiler, log_data, mem_ctx,
3679 shader_prog, prog, &prog_data->base, stage),
3680 key(key),
3681 prog_data(prog_data),
3682 sanity_param_count(0),
3683 fail_msg(NULL),
3684 first_non_payload_grf(0),
3685 need_all_constants_in_pull_buffer(false),
3686 no_spills(no_spills),
3687 shader_time_index(shader_time_index),
3688 last_scratch(0)
3689 {
3690 this->failed = false;
3691
3692 this->base_ir = NULL;
3693 this->current_annotation = NULL;
3694 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3695
3696 this->variable_ht = hash_table_ctor(0,
3697 hash_table_pointer_hash,
3698 hash_table_pointer_compare);
3699
3700 this->virtual_grf_start = NULL;
3701 this->virtual_grf_end = NULL;
3702 this->live_intervals = NULL;
3703
3704 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3705
3706 this->uniforms = 0;
3707
3708 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3709 * at least one. See setup_uniforms() in brw_vec4.cpp.
3710 */
3711 this->uniform_array_size = 1;
3712 if (prog_data) {
3713 this->uniform_array_size =
3714 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3715 }
3716
3717 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3718 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3719 }
3720
3721 vec4_visitor::~vec4_visitor()
3722 {
3723 hash_table_dtor(this->variable_ht);
3724 }
3725
3726
3727 void
3728 vec4_visitor::fail(const char *format, ...)
3729 {
3730 va_list va;
3731 char *msg;
3732
3733 if (failed)
3734 return;
3735
3736 failed = true;
3737
3738 va_start(va, format);
3739 msg = ralloc_vasprintf(mem_ctx, format, va);
3740 va_end(va);
3741 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3742
3743 this->fail_msg = msg;
3744
3745 if (debug_enabled) {
3746 fprintf(stderr, "%s", msg);
3747 }
3748 }
3749
3750 } /* namespace brw */