i965: Set CMP's destination type to src0's type.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(vec4_visitor *v,
32 enum opcode opcode, const dst_reg &dst,
33 const src_reg &src0, const src_reg &src1,
34 const src_reg &src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->ir = v->base_ir;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_present = false;
52 this->mlen = 0;
53 this->base_mrf = 0;
54 this->offset = 0;
55 this->annotation = v->current_annotation;
56 }
57
58 vec4_instruction *
59 vec4_visitor::emit(vec4_instruction *inst)
60 {
61 this->instructions.push_tail(inst);
62
63 return inst;
64 }
65
66 vec4_instruction *
67 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
68 vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(block, new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
80 const src_reg &src1, const src_reg &src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
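/* The ALU macros below define trivial builder helpers for one-, two- and
 * three-source opcodes (ALU2_ACC additionally marks the instruction as
 * writing the accumulator, and ALU3 asserts gen >= 6). They construct the
 * vec4_instruction without emitting it; callers pass the result to emit().
 */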
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of each destination channel to the result of the
216 * comparison (the upper bits are undefined), and updates the flag register
217 * with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* Take the instruction:
226 *
227 * CMP null<d> src0<f> src1<f>
228 *
229 * Original gen4 does type conversion to the destination type before
230 * comparison, producing garbage results for floating point comparisons.
231 *
232 * The destination type doesn't matter on newer generations, so we set the
233 * type to match src0 so we can compact the instruction.
234 */
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238
239 resolve_ud_negate(&src0);
240 resolve_ud_negate(&src1);
241
242 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
243 inst->conditional_mod = condition;
244
245 return inst;
246 }
247
248 vec4_instruction *
249 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
250 {
251 vec4_instruction *inst;
252
253 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
254 dst, index);
255 inst->base_mrf = 14;
256 inst->mlen = 2;
257
258 return inst;
259 }
260
261 vec4_instruction *
262 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
263 const src_reg &index)
264 {
265 vec4_instruction *inst;
266
267 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
268 dst, src, index);
269 inst->base_mrf = 13;
270 inst->mlen = 3;
271
272 return inst;
273 }
274
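/* Emit a dot product of the requested width; elements must be 2, 3 or 4,
 * selecting DP2, DP3 or DP4 respectively.
 */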
275 void
276 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
277 {
278 static enum opcode dot_opcodes[] = {
279 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
280 };
281
282 emit(dot_opcodes[elements - 2], dst, src0, src1);
283 }
284
285 src_reg
286 vec4_visitor::fix_3src_operand(src_reg src)
287 {
288 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
289 * able to use vertical stride of zero to replicate the vec4 uniform, like
290 *
291 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
292 *
293 * But you can't, since vertical stride is always four in three-source
294 * instructions. Instead, insert a MOV instruction to do the replication so
295 * that the three-source instruction can consume it.
296 */
297
298 /* The MOV is only needed if the source is a uniform or immediate. */
299 if (src.file != UNIFORM && src.file != IMM)
300 return src;
301
302 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
303 return src;
304
305 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
306 expanded.type = src.type;
307 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
308 return src_reg(expanded);
309 }
310
311 src_reg
312 vec4_visitor::fix_math_operand(src_reg src)
313 {
314 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
315 return src;
316
317 /* The gen6 math instruction ignores the source modifiers --
318 * swizzle, abs, negate, and at least some parts of the register
319 * region description.
320 *
321 * Rather than trying to enumerate all these cases, *always* expand the
322 * operand to a temp GRF for gen6.
323 *
324 * For gen7, keep the operand as-is, except if immediate, which gen7 still
325 * can't use.
326 */
327
328 if (brw->gen == 7 && src.file != IMM)
329 return src;
330
331 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
332 expanded.type = src.type;
333 emit(MOV(expanded, src));
334 return src_reg(expanded);
335 }
336
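/* Emit a math-unit instruction, working around per-generation restrictions:
 * operands are fixed up via fix_math_operand(), gen6 math can't use a
 * writemask (so the result goes through a temporary), and pre-gen6 math is a
 * send that needs an MRF and message length.
 */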
337 void
338 vec4_visitor::emit_math(enum opcode opcode,
339 const dst_reg &dst,
340 const src_reg &src0, const src_reg &src1)
341 {
342 vec4_instruction *math =
343 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
344
345 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
346 /* MATH on Gen6 must be align1, so we can't do writemasks. */
347 math->dst = dst_reg(this, glsl_type::vec4_type);
348 math->dst.type = dst.type;
349 emit(MOV(dst, src_reg(math->dst)));
350 } else if (brw->gen < 6) {
351 math->base_mrf = 1;
352 math->mlen = src1.file == BAD_FILE ? 1 : 2;
353 }
354 }
355
356 void
357 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
358 {
359 if (brw->gen < 7) {
360 unreachable("ir_unop_pack_half_2x16 should be lowered");
361 }
362
363 assert(dst.type == BRW_REGISTER_TYPE_UD);
364 assert(src0.type == BRW_REGISTER_TYPE_F);
365
366 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
367 *
368 * Because this instruction does not have a 16-bit floating-point type,
369 * the destination data type must be Word (W).
370 *
371 * The destination must be DWord-aligned and specify a horizontal stride
372 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
373 * each destination channel and the upper word is not modified.
374 *
375 * The above restriction implies that the f32to16 instruction must use
376 * align1 mode, because only in align1 mode is it possible to specify
377 * horizontal stride. We choose here to defy the hardware docs and emit
378 * align16 instructions.
379 *
380 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
381 * instructions. I was partially successful in that the code passed all
382 * tests. However, the code was dubiously correct and fragile, and the
383 * tests were not harsh enough to probe that frailty. Not trusting the
384 * code, I chose instead to remain in align16 mode in defiance of the hw
385 * docs).
386 *
387 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
388 * simulator, emitting a f32to16 in align16 mode with UD as destination
389 * data type is safe. The behavior differs from that specified in the PRM
390 * in that the upper word of each destination channel is cleared to 0.
391 */
392
393 dst_reg tmp_dst(this, glsl_type::uvec2_type);
394 src_reg tmp_src(tmp_dst);
395
396 #if 0
397 /* Verify the undocumented behavior on which the following instructions
398 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
399 * then the result of the bit-or instruction below will be incorrect.
400 *
401 * You should inspect the disasm output in order to verify that the MOV is
402 * not optimized away.
403 */
404 emit(MOV(tmp_dst, src_reg(0x12345678u)));
405 #endif
406
407 /* Give tmp the form below, where "." means untouched.
408 *
409 * w z y x w z y x
410 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
411 *
412 * The upper word of each write-channel must be 0 for the
413 * following bit-shift and bit-or instructions to work. Note that this
414 * relies on the undocumented hardware behavior mentioned above.
415 */
416 tmp_dst.writemask = WRITEMASK_XY;
417 emit(F32TO16(tmp_dst, src0));
418
419 /* Give the write-channels of dst the form:
420 * 0xhhhh0000
421 */
422 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
423 emit(SHL(dst, tmp_src, src_reg(16u)));
424
425 /* Finally, give the write-channels of dst the form of packHalf2x16's
426 * output:
427 * 0xhhhhllll
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
430 emit(OR(dst, src_reg(dst), tmp_src));
431 }
432
433 void
434 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
435 {
436 if (brw->gen < 7) {
437 unreachable("ir_unop_unpack_half_2x16 should be lowered");
438 }
439
440 assert(dst.type == BRW_REGISTER_TYPE_F);
441 assert(src0.type == BRW_REGISTER_TYPE_UD);
442
443 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
444 *
445 * Because this instruction does not have a 16-bit floating-point type,
446 * the source data type must be Word (W). The destination type must be
447 * F (Float).
448 *
449 * To use W as the source data type, we must adjust horizontal strides,
450 * which is only possible in align1 mode. All my [chadv] attempts at
451 * emitting align1 instructions for unpackHalf2x16 failed to pass the
452 * Piglit tests, so I gave up.
453 *
454 * I've verified that, on gen7 hardware and the simulator, it is safe to
455 * emit f16to32 in align16 mode with UD as source data type.
456 */
457
458 dst_reg tmp_dst(this, glsl_type::uvec2_type);
459 src_reg tmp_src(tmp_dst);
460
461 tmp_dst.writemask = WRITEMASK_X;
462 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
463
464 tmp_dst.writemask = WRITEMASK_Y;
465 emit(SHR(tmp_dst, src0, src_reg(16u)));
466
467 dst.writemask = WRITEMASK_XY;
468 emit(F16TO32(dst, tmp_src));
469 }
470
471 void
472 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
473 {
474 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
475 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
476 * is not suitable to generate the shift values, but we can use the packed
477 * vector float and a type-converting MOV.
478 */
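/* (If I read the 8-bit vector-float encoding right, the bytes 0x00, 0x60,
 * 0x70 and 0x78 below encode 0.0, 8.0, 16.0 and 24.0; the type-converting
 * MOV into the uvec4 then yields the integer shift counts <0, 8, 16, 24>.)
 */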
479 dst_reg shift(this, glsl_type::uvec4_type);
480 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
481
482 dst_reg shifted(this, glsl_type::uvec4_type);
483 src0.swizzle = BRW_SWIZZLE_XXXX;
484 emit(SHR(shifted, src0, src_reg(shift)));
485
486 shifted.type = BRW_REGISTER_TYPE_UB;
487 dst_reg f(this, glsl_type::vec4_type);
488 emit(MOV(f, src_reg(shifted)));
489
490 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
491 }
492
493 void
494 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
495 {
496 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
497 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
498 * is not suitable to generate the shift values, but we can use the packed
499 * vector float and a type-converting MOV.
500 */
501 dst_reg shift(this, glsl_type::uvec4_type);
502 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
503
504 dst_reg shifted(this, glsl_type::uvec4_type);
505 src0.swizzle = BRW_SWIZZLE_XXXX;
506 emit(SHR(shifted, src0, src_reg(shift)));
507
508 shifted.type = BRW_REGISTER_TYPE_B;
509 dst_reg f(this, glsl_type::vec4_type);
510 emit(MOV(f, src_reg(shifted)));
511
512 dst_reg scaled(this, glsl_type::vec4_type);
513 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
514
515 dst_reg max(this, glsl_type::vec4_type);
516 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
517 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
518 }
519
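/* packUnorm4x8: clamp to [0, 1] with a saturated MOV, scale by 255, round to
 * even, convert to unsigned, and pack the four bytes with
 * VEC4_OPCODE_PACK_BYTES.
 */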
520 void
521 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
522 {
523 dst_reg saturated(this, glsl_type::vec4_type);
524 vec4_instruction *inst = emit(MOV(saturated, src0));
525 inst->saturate = true;
526
527 dst_reg scaled(this, glsl_type::vec4_type);
528 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
529
530 dst_reg rounded(this, glsl_type::vec4_type);
531 emit(RNDE(rounded, src_reg(scaled)));
532
533 dst_reg u(this, glsl_type::uvec4_type);
534 emit(MOV(u, src_reg(rounded)));
535
536 src_reg bytes(u);
537 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
538 }
539
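/* packSnorm4x8: clamp to [-1, 1], scale by 127, round to even, convert to
 * signed, and pack the four bytes with VEC4_OPCODE_PACK_BYTES.
 */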
540 void
541 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
542 {
543 dst_reg max(this, glsl_type::vec4_type);
544 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
545
546 dst_reg min(this, glsl_type::vec4_type);
547 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
548
549 dst_reg scaled(this, glsl_type::vec4_type);
550 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
551
552 dst_reg rounded(this, glsl_type::vec4_type);
553 emit(RNDE(rounded, src_reg(scaled)));
554
555 dst_reg i(this, glsl_type::ivec4_type);
556 emit(MOV(i, src_reg(rounded)));
557
558 src_reg bytes(i);
559 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
560 }
561
562 void
563 vec4_visitor::visit_instructions(const exec_list *list)
564 {
565 foreach_in_list(ir_instruction, ir, list) {
566 base_ir = ir;
567 ir->accept(this);
568 }
569 }
570
571
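/* Return the size of a value of the given GLSL type, measured in vec4 slots:
 * one per matrix column, one for any scalar or vector, and zero for samplers
 * and atomic counters.
 */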
572 static int
573 type_size(const struct glsl_type *type)
574 {
575 unsigned int i;
576 int size;
577
578 switch (type->base_type) {
579 case GLSL_TYPE_UINT:
580 case GLSL_TYPE_INT:
581 case GLSL_TYPE_FLOAT:
582 case GLSL_TYPE_BOOL:
583 if (type->is_matrix()) {
584 return type->matrix_columns;
585 } else {
586 /* Regardless of the size of the vector, it gets a vec4. This is bad
587 * packing for things like floats, but otherwise arrays become a
588 * mess. Hopefully a later pass over the code can pack scalars
589 * down if appropriate.
590 */
591 return 1;
592 }
593 case GLSL_TYPE_ARRAY:
594 assert(type->length > 0);
595 return type_size(type->fields.array) * type->length;
596 case GLSL_TYPE_STRUCT:
597 size = 0;
598 for (i = 0; i < type->length; i++) {
599 size += type_size(type->fields.structure[i].type);
600 }
601 return size;
602 case GLSL_TYPE_SAMPLER:
603 /* Samplers take up no register space, since they're baked in at
604 * link time.
605 */
606 return 0;
607 case GLSL_TYPE_ATOMIC_UINT:
608 return 0;
609 case GLSL_TYPE_IMAGE:
610 case GLSL_TYPE_VOID:
611 case GLSL_TYPE_ERROR:
612 case GLSL_TYPE_INTERFACE:
613 unreachable("not reached");
614 }
615
616 return 0;
617 }
618
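/* Allocate a new virtual GRF of the given size, growing the size and
 * register-map bookkeeping arrays as needed, and return its index.
 */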
619 int
620 vec4_visitor::virtual_grf_alloc(int size)
621 {
622 if (virtual_grf_array_size <= virtual_grf_count) {
623 if (virtual_grf_array_size == 0)
624 virtual_grf_array_size = 16;
625 else
626 virtual_grf_array_size *= 2;
627 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
628 virtual_grf_array_size);
629 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
630 virtual_grf_array_size);
631 }
632 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
633 virtual_grf_reg_count += size;
634 virtual_grf_sizes[virtual_grf_count] = size;
635 return virtual_grf_count++;
636 }
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->virtual_grf_alloc(type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->virtual_grf_alloc(type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->virtual_grf_alloc(type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 /* Our support for uniforms is piggy-backed on the struct
685 * gl_program, because that's where the values actually
686 * get stored, rather than in some global gl_shader_program uniform
687 * store.
688 */
689 void
690 vec4_visitor::setup_uniform_values(ir_variable *ir)
691 {
692 int namelen = strlen(ir->name);
693
694 /* The data for our (non-builtin) uniforms is stored in a series of
695 * gl_uniform_driver_storage structs for each subcomponent that
696 * glGetUniformLocation() could name. We know it's been set up in the same
697 * order we'd walk the type, so walk the list of storage and find anything
698 * with our name, or the prefix of a component that starts with our name.
699 */
700 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
701 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
702
703 if (strncmp(ir->name, storage->name, namelen) != 0 ||
704 (storage->name[namelen] != 0 &&
705 storage->name[namelen] != '.' &&
706 storage->name[namelen] != '[')) {
707 continue;
708 }
709
710 gl_constant_value *components = storage->storage;
711 unsigned vector_count = (MAX2(storage->array_elements, 1) *
712 storage->type->matrix_columns);
713
714 for (unsigned s = 0; s < vector_count; s++) {
715 assert(uniforms < uniform_array_size);
716 uniform_vector_size[uniforms] = storage->type->vector_elements;
717
718 int i;
719 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
720 stage_prog_data->param[uniforms * 4 + i] = components;
721 components++;
722 }
723 for (; i < 4; i++) {
724 static gl_constant_value zero = { 0.0 };
725 stage_prog_data->param[uniforms * 4 + i] = &zero;
726 }
727
728 uniforms++;
729 }
730 }
731 }
732
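/* Lay out one vec4 uniform per user clip plane in the key, pointing the
 * param entries at the values returned by brw_select_clip_planes().
 */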
733 void
734 vec4_visitor::setup_uniform_clipplane_values()
735 {
736 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
737
738 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
739 assert(this->uniforms < uniform_array_size);
740 this->uniform_vector_size[this->uniforms] = 4;
741 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
742 this->userplane[i].type = BRW_REGISTER_TYPE_F;
743 for (int j = 0; j < 4; ++j) {
744 stage_prog_data->param[this->uniforms * 4 + j] =
745 (gl_constant_value *) &clip_planes[i][j];
746 }
747 ++this->uniforms;
748 }
749 }
750
751 /* Our support for builtin uniforms is even scarier than non-builtin.
752 * It sits on top of the PROG_STATE_VAR parameters that are
753 * automatically updated from GL context state.
754 */
755 void
756 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
757 {
758 const ir_state_slot *const slots = ir->get_state_slots();
759 assert(slots != NULL);
760
761 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
762 /* This state reference has already been set up by ir_to_mesa,
763 * but we'll get the same index back here. We can reference
764 * ParameterValues directly, since unlike brw_fs.cpp, we never
765 * add new state references during compile.
766 */
767 int index = _mesa_add_state_reference(this->prog->Parameters,
768 (gl_state_index *)slots[i].tokens);
769 gl_constant_value *values =
770 &this->prog->Parameters->ParameterValues[index][0];
771
772 assert(this->uniforms < uniform_array_size);
773 this->uniform_vector_size[this->uniforms] = 0;
774 /* Add each of the unique swizzled channels of the element.
775 * This will end up matching the size of the glsl_type of this field.
776 */
777 int last_swiz = -1;
778 for (unsigned int j = 0; j < 4; j++) {
779 int swiz = GET_SWZ(slots[i].swizzle, j);
780 last_swiz = swiz;
781
782 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
783 assert(this->uniforms < uniform_array_size);
784 if (swiz <= last_swiz)
785 this->uniform_vector_size[this->uniforms]++;
786 }
787 this->uniforms++;
788 }
789 }
790
791 dst_reg *
792 vec4_visitor::variable_storage(ir_variable *var)
793 {
794 return (dst_reg *)hash_table_find(this->variable_ht, var);
795 }
796
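/* Emit instructions that leave the flag register set according to the given
 * boolean rvalue, and return in *predicate the predicate the caller should
 * use: ALL4H/ANY4H for the vector comparisons, NORMAL otherwise.
 */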
797 void
798 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
799 enum brw_predicate *predicate)
800 {
801 ir_expression *expr = ir->as_expression();
802
803 *predicate = BRW_PREDICATE_NORMAL;
804
805 if (expr && expr->operation != ir_binop_ubo_load) {
806 src_reg op[3];
807 vec4_instruction *inst;
808
809 assert(expr->get_num_operands() <= 3);
810 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
811 expr->operands[i]->accept(this);
812 op[i] = this->result;
813
814 resolve_ud_negate(&op[i]);
815 }
816
817 switch (expr->operation) {
818 case ir_unop_logic_not:
819 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
820 inst->conditional_mod = BRW_CONDITIONAL_Z;
821 break;
822
823 case ir_binop_logic_xor:
824 if (brw->gen <= 5) {
825 src_reg temp = src_reg(this, ir->type);
826 emit(XOR(dst_reg(temp), op[0], op[1]));
827 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
828 } else {
829 inst = emit(XOR(dst_null_d(), op[0], op[1]));
830 }
831 inst->conditional_mod = BRW_CONDITIONAL_NZ;
832 break;
833
834 case ir_binop_logic_or:
835 if (brw->gen <= 5) {
836 src_reg temp = src_reg(this, ir->type);
837 emit(OR(dst_reg(temp), op[0], op[1]));
838 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
839 } else {
840 inst = emit(OR(dst_null_d(), op[0], op[1]));
841 }
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 break;
844
845 case ir_binop_logic_and:
846 if (brw->gen <= 5) {
847 src_reg temp = src_reg(this, ir->type);
848 emit(AND(dst_reg(temp), op[0], op[1]));
849 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
850 } else {
851 inst = emit(AND(dst_null_d(), op[0], op[1]));
852 }
853 inst->conditional_mod = BRW_CONDITIONAL_NZ;
854 break;
855
856 case ir_unop_f2b:
857 if (brw->gen >= 6) {
858 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
859 } else {
860 inst = emit(MOV(dst_null_f(), op[0]));
861 inst->conditional_mod = BRW_CONDITIONAL_NZ;
862 }
863 break;
864
865 case ir_unop_i2b:
866 if (brw->gen >= 6) {
867 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
868 } else {
869 inst = emit(MOV(dst_null_d(), op[0]));
870 inst->conditional_mod = BRW_CONDITIONAL_NZ;
871 }
872 break;
873
874 case ir_binop_all_equal:
875 if (brw->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 resolve_bool_comparison(expr->operands[1], &op[1]);
878 }
879 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
880 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
881 break;
882
883 case ir_binop_any_nequal:
884 if (brw->gen <= 5) {
885 resolve_bool_comparison(expr->operands[0], &op[0]);
886 resolve_bool_comparison(expr->operands[1], &op[1]);
887 }
888 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
889 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
890 break;
891
892 case ir_unop_any:
893 if (brw->gen <= 5) {
894 resolve_bool_comparison(expr->operands[0], &op[0]);
895 }
896 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
898 break;
899
900 case ir_binop_greater:
901 case ir_binop_gequal:
902 case ir_binop_less:
903 case ir_binop_lequal:
904 case ir_binop_equal:
905 case ir_binop_nequal:
906 if (brw->gen <= 5) {
907 resolve_bool_comparison(expr->operands[0], &op[0]);
908 resolve_bool_comparison(expr->operands[1], &op[1]);
909 }
910 emit(CMP(dst_null_d(), op[0], op[1],
911 brw_conditional_for_comparison(expr->operation)));
912 break;
913
914 case ir_triop_csel: {
915 /* Expand the boolean condition into the flag register. */
916 inst = emit(MOV(dst_null_d(), op[0]));
917 inst->conditional_mod = BRW_CONDITIONAL_NZ;
918
919 /* Select which boolean to return. */
920 dst_reg temp(this, expr->operands[1]->type);
921 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
922 inst->predicate = BRW_PREDICATE_NORMAL;
923
924 /* Expand the result to a condition code. */
925 inst = emit(MOV(dst_null_d(), src_reg(temp)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 break;
928 }
929
930 default:
931 unreachable("not reached");
932 }
933 return;
934 }
935
936 ir->accept(this);
937
938 resolve_ud_negate(&this->result);
939
940 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
941 inst->conditional_mod = BRW_CONDITIONAL_NZ;
942 }
943
944 /**
945 * Emit a gen6 IF statement with the comparison folded into the IF
946 * instruction.
947 */
948 void
949 vec4_visitor::emit_if_gen6(ir_if *ir)
950 {
951 ir_expression *expr = ir->condition->as_expression();
952
953 if (expr && expr->operation != ir_binop_ubo_load) {
954 src_reg op[3];
955 dst_reg temp;
956
957 assert(expr->get_num_operands() <= 3);
958 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
959 expr->operands[i]->accept(this);
960 op[i] = this->result;
961 }
962
963 switch (expr->operation) {
964 case ir_unop_logic_not:
965 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
966 return;
967
968 case ir_binop_logic_xor:
969 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
970 return;
971
972 case ir_binop_logic_or:
973 temp = dst_reg(this, glsl_type::bool_type);
974 emit(OR(temp, op[0], op[1]));
975 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_logic_and:
979 temp = dst_reg(this, glsl_type::bool_type);
980 emit(AND(temp, op[0], op[1]));
981 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_f2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_i2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_binop_greater:
993 case ir_binop_gequal:
994 case ir_binop_less:
995 case ir_binop_lequal:
996 case ir_binop_equal:
997 case ir_binop_nequal:
998 emit(IF(op[0], op[1],
999 brw_conditional_for_comparison(expr->operation)));
1000 return;
1001
1002 case ir_binop_all_equal:
1003 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1004 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1005 return;
1006
1007 case ir_binop_any_nequal:
1008 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1009 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1010 return;
1011
1012 case ir_unop_any:
1013 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1014 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1015 return;
1016
1017 case ir_triop_csel: {
1018 /* Expand the boolean condition into the flag register. */
1019 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1020 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1021
1022 /* Select which boolean to return. */
1023 dst_reg temp(this, expr->operands[1]->type);
1024 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1025 inst->predicate = BRW_PREDICATE_NORMAL;
1026
1027 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1028 return;
1029 }
1030
1031 default:
1032 unreachable("not reached");
1033 }
1034 return;
1035 }
1036
1037 ir->condition->accept(this);
1038
1039 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_variable *ir)
1044 {
1045 dst_reg *reg = NULL;
1046
1047 if (variable_storage(ir))
1048 return;
1049
1050 switch (ir->data.mode) {
1051 case ir_var_shader_in:
1052 assert(ir->data.location != -1);
1053 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1054 break;
1055
1056 case ir_var_shader_out:
1057 assert(ir->data.location != -1);
1058 reg = new(mem_ctx) dst_reg(this, ir->type);
1059
1060 for (int i = 0; i < type_size(ir->type); i++) {
1061 output_reg[ir->data.location + i] = *reg;
1062 output_reg[ir->data.location + i].reg_offset = i;
1063 output_reg[ir->data.location + i].type =
1064 brw_type_for_base_type(ir->type->get_scalar_type());
1065 output_reg_annotation[ir->data.location + i] = ir->name;
1066 }
1067 break;
1068
1069 case ir_var_auto:
1070 case ir_var_temporary:
1071 reg = new(mem_ctx) dst_reg(this, ir->type);
1072 break;
1073
1074 case ir_var_uniform:
1075 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1076
1077 /* Thanks to the lower_ubo_reference pass, we will see only
1078 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1079 * variables, so no need for them to be in variable_ht.
1080 *
1081 * Some uniforms, such as samplers and atomic counters, have no actual
1082 * storage, so we should ignore them.
1083 */
1084 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1085 return;
1086
1087 /* Track how big the whole uniform variable is, in case we need to put a
1088 * copy of its data into pull constants for array access.
1089 */
1090 assert(this->uniforms < uniform_array_size);
1091 this->uniform_size[this->uniforms] = type_size(ir->type);
1092
1093 if (!strncmp(ir->name, "gl_", 3)) {
1094 setup_builtin_uniform_values(ir);
1095 } else {
1096 setup_uniform_values(ir);
1097 }
1098 break;
1099
1100 case ir_var_system_value:
1101 reg = make_reg_for_system_value(ir);
1102 break;
1103
1104 default:
1105 unreachable("not reached");
1106 }
1107
1108 reg->type = brw_type_for_base_type(ir->type);
1109 hash_table_insert(this->variable_ht, reg, ir);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop *ir)
1114 {
1115 /* We don't want debugging output to print the whole body of the
1116 * loop as the annotation.
1117 */
1118 this->base_ir = NULL;
1119
1120 emit(BRW_OPCODE_DO);
1121
1122 visit_instructions(&ir->body_instructions);
1123
1124 emit(BRW_OPCODE_WHILE);
1125 }
1126
1127 void
1128 vec4_visitor::visit(ir_loop_jump *ir)
1129 {
1130 switch (ir->mode) {
1131 case ir_loop_jump::jump_break:
1132 emit(BRW_OPCODE_BREAK);
1133 break;
1134 case ir_loop_jump::jump_continue:
1135 emit(BRW_OPCODE_CONTINUE);
1136 break;
1137 }
1138 }
1139
1140
1141 void
1142 vec4_visitor::visit(ir_function_signature *)
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_function *ir)
1149 {
1150 /* Ignore function bodies other than main() -- we shouldn't see calls to
1151 * them since they should all be inlined.
1152 */
1153 if (strcmp(ir->name, "main") == 0) {
1154 const ir_function_signature *sig;
1155 exec_list empty;
1156
1157 sig = ir->matching_signature(NULL, &empty, false);
1158
1159 assert(sig);
1160
1161 visit_instructions(&sig->body);
1162 }
1163 }
1164
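/* Try to recognize an add with a multiply operand and emit it as a single
 * MAD. Returns false if the pattern doesn't match, the type isn't float, or
 * the hardware predates three-source instructions.
 */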
1165 bool
1166 vec4_visitor::try_emit_mad(ir_expression *ir)
1167 {
1168 /* 3-src instructions were introduced in gen6. */
1169 if (brw->gen < 6)
1170 return false;
1171
1172 /* MAD can only handle floating-point data. */
1173 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1174 return false;
1175
1176 ir_rvalue *nonmul = ir->operands[1];
1177 ir_expression *mul = ir->operands[0]->as_expression();
1178
1179 if (!mul || mul->operation != ir_binop_mul) {
1180 nonmul = ir->operands[0];
1181 mul = ir->operands[1]->as_expression();
1182
1183 if (!mul || mul->operation != ir_binop_mul)
1184 return false;
1185 }
1186
1187 nonmul->accept(this);
1188 src_reg src0 = fix_3src_operand(this->result);
1189
1190 mul->operands[0]->accept(this);
1191 src_reg src1 = fix_3src_operand(this->result);
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195
1196 this->result = src_reg(this, ir->type);
1197 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1198
1199 return true;
1200 }
1201
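/* Try to emit b2f of a comparison as a CMP that writes 0.0 on failure plus a
 * predicated SEL of 1.0, avoiding a separate boolean temporary. Returns
 * false if the operand isn't a comparison or the hardware can't rely on
 * CMP's zero-on-false behavior.
 */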
1202 bool
1203 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1204 {
1205 /* This optimization relies on CMP setting the destination to 0 when
1206 * false. Early hardware only sets the least significant bit, and
1207 * leaves the other bits undefined. So we can't use it.
1208 */
1209 if (brw->gen < 6)
1210 return false;
1211
1212 ir_expression *const cmp = ir->operands[0]->as_expression();
1213
1214 if (cmp == NULL)
1215 return false;
1216
1217 switch (cmp->operation) {
1218 case ir_binop_less:
1219 case ir_binop_greater:
1220 case ir_binop_lequal:
1221 case ir_binop_gequal:
1222 case ir_binop_equal:
1223 case ir_binop_nequal:
1224 break;
1225
1226 default:
1227 return false;
1228 }
1229
1230 cmp->operands[0]->accept(this);
1231 const src_reg cmp_src0 = this->result;
1232
1233 cmp->operands[1]->accept(this);
1234 const src_reg cmp_src1 = this->result;
1235
1236 this->result = src_reg(this, ir->type);
1237
1238 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1239 brw_conditional_for_comparison(cmp->operation)));
1240
1241 /* If the comparison is false, this->result will just happen to be zero.
1242 */
1243 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1244 this->result, src_reg(1.0f));
1245 inst->predicate = BRW_PREDICATE_NORMAL;
1246 inst->predicate_inverse = true;
1247
1248 return true;
1249 }
1250
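/* Emit a MIN/MAX: on gen6+ a SEL with the L/G conditional mod does it in a
 * single instruction; earlier generations need an explicit CMP followed by a
 * predicated SEL.
 */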
1251 void
1252 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1253 src_reg src0, src_reg src1)
1254 {
1255 vec4_instruction *inst;
1256
1257 if (brw->gen >= 6) {
1258 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1259 inst->conditional_mod = conditionalmod;
1260 } else {
1261 emit(CMP(dst, src0, src1, conditionalmod));
1262
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265 }
1266 }
1267
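/* Emit mix(x, y, a): a single LRP on gen6+, or the expanded
 * x * (1 - a) + y * a sequence on older hardware.
 */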
1268 void
1269 vec4_visitor::emit_lrp(const dst_reg &dst,
1270 const src_reg &x, const src_reg &y, const src_reg &a)
1271 {
1272 if (brw->gen >= 6) {
1273 /* Note that the instruction's argument order is reversed from GLSL
1274 * and the IR.
1275 */
1276 emit(LRP(dst,
1277 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1278 } else {
1279 /* Earlier generations don't support three source operations, so we
1280 * need to emit x*(1-a) + y*a.
1281 */
1282 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1283 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1284 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1285 y_times_a.writemask = dst.writemask;
1286 one_minus_a.writemask = dst.writemask;
1287 x_times_one_minus_a.writemask = dst.writemask;
1288
1289 emit(MUL(y_times_a, y, a));
1290 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1291 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1292 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1293 }
1294 }
1295
1296 void
1297 vec4_visitor::visit(ir_expression *ir)
1298 {
1299 unsigned int operand;
1300 src_reg op[Elements(ir->operands)];
1301 vec4_instruction *inst;
1302
1303 if (ir->operation == ir_binop_add) {
1304 if (try_emit_mad(ir))
1305 return;
1306 }
1307
1308 if (ir->operation == ir_unop_b2f) {
1309 if (try_emit_b2f_of_compare(ir))
1310 return;
1311 }
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 dst_reg result_dst(this, ir->type);
1317 src_reg result_src(result_dst);
1318
1319 if (ir->operation == ir_triop_csel) {
1320 ir->operands[1]->accept(this);
1321 op[1] = this->result;
1322 ir->operands[2]->accept(this);
1323 op[2] = this->result;
1324
1325 enum brw_predicate predicate;
1326 emit_bool_to_cond_code(ir->operands[0], &predicate);
1327 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1328 inst->predicate = predicate;
1329 this->result = result_src;
1330 return;
1331 }
1332
1333 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1334 this->result.file = BAD_FILE;
1335 ir->operands[operand]->accept(this);
1336 if (this->result.file == BAD_FILE) {
1337 fprintf(stderr, "Failed to get tree for expression operand:\n");
1338 ir->operands[operand]->fprint(stderr);
1339 exit(1);
1340 }
1341 op[operand] = this->result;
1342
1343 /* Matrix expression operands should have been broken down to vector
1344 * operations already.
1345 */
1346 assert(!ir->operands[operand]->type->is_matrix());
1347 }
1348
1349 /* If nothing special happens, this is the result. */
1350 this->result = result_src;
1351
1352 switch (ir->operation) {
1353 case ir_unop_logic_not:
1354 emit(NOT(result_dst, op[0]));
1355 break;
1356 case ir_unop_neg:
1357 op[0].negate = !op[0].negate;
1358 emit(MOV(result_dst, op[0]));
1359 break;
1360 case ir_unop_abs:
1361 op[0].abs = true;
1362 op[0].negate = false;
1363 emit(MOV(result_dst, op[0]));
1364 break;
1365
1366 case ir_unop_sign:
1367 if (ir->type->is_float()) {
1368 /* AND(val, 0x80000000) gives the sign bit.
1369 *
1370 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1371 * zero.
1372 */
1373 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1374
1375 op[0].type = BRW_REGISTER_TYPE_UD;
1376 result_dst.type = BRW_REGISTER_TYPE_UD;
1377 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1378
1379 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1380 inst->predicate = BRW_PREDICATE_NORMAL;
1381
1382 this->result.type = BRW_REGISTER_TYPE_F;
1383 } else {
1384 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1385 * -> non-negative val generates 0x00000000.
1386 * Predicated OR sets 1 if val is positive.
1387 */
1388 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1389
1390 emit(ASR(result_dst, op[0], src_reg(31)));
1391
1392 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1393 inst->predicate = BRW_PREDICATE_NORMAL;
1394 }
1395 break;
1396
1397 case ir_unop_rcp:
1398 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1399 break;
1400
1401 case ir_unop_exp2:
1402 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1403 break;
1404 case ir_unop_log2:
1405 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1406 break;
1407 case ir_unop_exp:
1408 case ir_unop_log:
1409 unreachable("not reached: should be handled by ir_explog_to_explog2");
1410 case ir_unop_sin:
1411 case ir_unop_sin_reduced:
1412 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1413 break;
1414 case ir_unop_cos:
1415 case ir_unop_cos_reduced:
1416 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1417 break;
1418
1419 case ir_unop_dFdx:
1420 case ir_unop_dFdx_coarse:
1421 case ir_unop_dFdx_fine:
1422 case ir_unop_dFdy:
1423 case ir_unop_dFdy_coarse:
1424 case ir_unop_dFdy_fine:
1425 unreachable("derivatives not valid in vertex shader");
1426
1427 case ir_unop_bitfield_reverse:
1428 emit(BFREV(result_dst, op[0]));
1429 break;
1430 case ir_unop_bit_count:
1431 emit(CBIT(result_dst, op[0]));
1432 break;
1433 case ir_unop_find_msb: {
1434 src_reg temp = src_reg(this, glsl_type::uint_type);
1435
1436 inst = emit(FBH(dst_reg(temp), op[0]));
1437 inst->dst.writemask = WRITEMASK_XYZW;
1438
1439 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1440 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1441 * subtract the result from 31 to convert the MSB count into an LSB count.
1442 */
1443
1444 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1445 temp.swizzle = BRW_SWIZZLE_NOOP;
1446 emit(MOV(result_dst, temp));
1447
1448 src_reg src_tmp = src_reg(result_dst);
1449 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1450
1451 src_tmp.negate = true;
1452 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1453 inst->predicate = BRW_PREDICATE_NORMAL;
1454 break;
1455 }
1456 case ir_unop_find_lsb:
1457 emit(FBL(result_dst, op[0]));
1458 break;
1459 case ir_unop_saturate:
1460 inst = emit(MOV(result_dst, op[0]));
1461 inst->saturate = true;
1462 break;
1463
1464 case ir_unop_noise:
1465 unreachable("not reached: should be handled by lower_noise");
1466
1467 case ir_binop_add:
1468 emit(ADD(result_dst, op[0], op[1]));
1469 break;
1470 case ir_binop_sub:
1471 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1472
1473 case ir_binop_mul:
1474 if (brw->gen < 8 && ir->type->is_integer()) {
1475 /* For integer multiplication, the MUL uses the low 16 bits of one of
1476 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1477 * then accumulates the contribution of the upper 16 bits of that
1478 * operand. If we can determine that one of the args is in the low
1479 * 16 bits, though, we can just emit a single MUL.
1480 */
1481 if (ir->operands[0]->is_uint16_constant()) {
1482 if (brw->gen < 7)
1483 emit(MUL(result_dst, op[0], op[1]));
1484 else
1485 emit(MUL(result_dst, op[1], op[0]));
1486 } else if (ir->operands[1]->is_uint16_constant()) {
1487 if (brw->gen < 7)
1488 emit(MUL(result_dst, op[1], op[0]));
1489 else
1490 emit(MUL(result_dst, op[0], op[1]));
1491 } else {
1492 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1493
1494 emit(MUL(acc, op[0], op[1]));
1495 emit(MACH(dst_null_d(), op[0], op[1]));
1496 emit(MOV(result_dst, src_reg(acc)));
1497 }
1498 } else {
1499 emit(MUL(result_dst, op[0], op[1]));
1500 }
1501 break;
1502 case ir_binop_imul_high: {
1503 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1504
1505 emit(MUL(acc, op[0], op[1]));
1506 emit(MACH(result_dst, op[0], op[1]));
1507 break;
1508 }
1509 case ir_binop_div:
1510 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1511 assert(ir->type->is_integer());
1512 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1513 break;
1514 case ir_binop_carry: {
1515 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1516
1517 emit(ADDC(dst_null_ud(), op[0], op[1]));
1518 emit(MOV(result_dst, src_reg(acc)));
1519 break;
1520 }
1521 case ir_binop_borrow: {
1522 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1523
1524 emit(SUBB(dst_null_ud(), op[0], op[1]));
1525 emit(MOV(result_dst, src_reg(acc)));
1526 break;
1527 }
1528 case ir_binop_mod:
1529 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1530 assert(ir->type->is_integer());
1531 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1532 break;
1533
1534 case ir_binop_less:
1535 case ir_binop_greater:
1536 case ir_binop_lequal:
1537 case ir_binop_gequal:
1538 case ir_binop_equal:
1539 case ir_binop_nequal: {
1540 if (brw->gen <= 5) {
1541 resolve_bool_comparison(ir->operands[0], &op[0]);
1542 resolve_bool_comparison(ir->operands[1], &op[1]);
1543 }
1544 emit(CMP(result_dst, op[0], op[1],
1545 brw_conditional_for_comparison(ir->operation)));
1546 break;
1547 }
1548
1549 case ir_binop_all_equal:
1550 /* "==" operator producing a scalar boolean. */
1551 if (ir->operands[0]->type->is_vector() ||
1552 ir->operands[1]->type->is_vector()) {
1553 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1554 emit(MOV(result_dst, src_reg(0)));
1555 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1556 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1557 } else {
1558 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1559 }
1560 break;
1561 case ir_binop_any_nequal:
1562 /* "!=" operator producing a scalar boolean. */
1563 if (ir->operands[0]->type->is_vector() ||
1564 ir->operands[1]->type->is_vector()) {
1565 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1566
1567 emit(MOV(result_dst, src_reg(0)));
1568 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1569 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1570 } else {
1571 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1572 }
1573 break;
1574
1575 case ir_unop_any:
1576 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1577 emit(MOV(result_dst, src_reg(0)));
1578
1579 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1580 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1581 break;
1582
1583 case ir_binop_logic_xor:
1584 emit(XOR(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_logic_or:
1588 emit(OR(result_dst, op[0], op[1]));
1589 break;
1590
1591 case ir_binop_logic_and:
1592 emit(AND(result_dst, op[0], op[1]));
1593 break;
1594
1595 case ir_binop_dot:
1596 assert(ir->operands[0]->type->is_vector());
1597 assert(ir->operands[0]->type == ir->operands[1]->type);
1598 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1599 break;
1600
1601 case ir_unop_sqrt:
1602 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1603 break;
1604 case ir_unop_rsq:
1605 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1606 break;
1607
1608 case ir_unop_bitcast_i2f:
1609 case ir_unop_bitcast_u2f:
1610 this->result = op[0];
1611 this->result.type = BRW_REGISTER_TYPE_F;
1612 break;
1613
1614 case ir_unop_bitcast_f2i:
1615 this->result = op[0];
1616 this->result.type = BRW_REGISTER_TYPE_D;
1617 break;
1618
1619 case ir_unop_bitcast_f2u:
1620 this->result = op[0];
1621 this->result.type = BRW_REGISTER_TYPE_UD;
1622 break;
1623
1624 case ir_unop_i2f:
1625 case ir_unop_i2u:
1626 case ir_unop_u2i:
1627 case ir_unop_u2f:
1628 case ir_unop_f2i:
1629 case ir_unop_f2u:
1630 emit(MOV(result_dst, op[0]));
1631 break;
1632 case ir_unop_b2i:
1633 emit(AND(result_dst, op[0], src_reg(1)));
1634 break;
1635 case ir_unop_b2f:
1636 if (brw->gen <= 5) {
1637 resolve_bool_comparison(ir->operands[0], &op[0]);
1638 }
1639 op[0].type = BRW_REGISTER_TYPE_D;
1640 result_dst.type = BRW_REGISTER_TYPE_D;
1641 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1642 result_dst.type = BRW_REGISTER_TYPE_F;
1643 break;
1644 case ir_unop_f2b:
1645 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1646 break;
1647 case ir_unop_i2b:
1648 emit(AND(result_dst, op[0], src_reg(1)));
1649 break;
1650
1651 case ir_unop_trunc:
1652 emit(RNDZ(result_dst, op[0]));
1653 break;
1654 case ir_unop_ceil: {
1655 src_reg tmp = src_reg(this, ir->type);
1656 op[0].negate = !op[0].negate;
1657 emit(RNDD(dst_reg(tmp), op[0]));
1658 tmp.negate = true;
1659 emit(MOV(result_dst, tmp));
1660 }
1661 break;
1662 case ir_unop_floor:
1663 inst = emit(RNDD(result_dst, op[0]));
1664 break;
1665 case ir_unop_fract:
1666 inst = emit(FRC(result_dst, op[0]));
1667 break;
1668 case ir_unop_round_even:
1669 emit(RNDE(result_dst, op[0]));
1670 break;
1671
1672 case ir_binop_min:
1673 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1674 break;
1675 case ir_binop_max:
1676 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1677 break;
1678
1679 case ir_binop_pow:
1680 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1681 break;
1682
1683 case ir_unop_bit_not:
1684 inst = emit(NOT(result_dst, op[0]));
1685 break;
1686 case ir_binop_bit_and:
1687 inst = emit(AND(result_dst, op[0], op[1]));
1688 break;
1689 case ir_binop_bit_xor:
1690 inst = emit(XOR(result_dst, op[0], op[1]));
1691 break;
1692 case ir_binop_bit_or:
1693 inst = emit(OR(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_lshift:
1697 inst = emit(SHL(result_dst, op[0], op[1]));
1698 break;
1699
1700 case ir_binop_rshift:
1701 if (ir->type->base_type == GLSL_TYPE_INT)
1702 inst = emit(ASR(result_dst, op[0], op[1]));
1703 else
1704 inst = emit(SHR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_bfm:
1708 emit(BFI1(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_ubo_load: {
1712 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1713 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1714 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1715 src_reg offset;
1716
1717 /* Now, load the vector from that offset. */
1718 assert(ir->type->is_vector() || ir->type->is_scalar());
1719
1720 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1721 packed_consts.type = result.type;
1722 src_reg surf_index;
1723
1724 if (const_uniform_block) {
1725 /* The block index is a constant, so just emit the binding table entry
1726 * as an immediate.
1727 */
1728 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1729 const_uniform_block->value.u[0]);
1730 } else {
1731 /* The block index is not a constant. Evaluate the index expression
1732 * per-channel and add the base UBO index; the generator will select
1733 * a value from any live channel.
1734 */
1735 surf_index = src_reg(this, glsl_type::uint_type);
1736 emit(ADD(dst_reg(surf_index), op[0],
1737 src_reg(prog_data->base.binding_table.ubo_start)));
1738
1739 /* Assume this may touch any UBO. It would be nice to provide
1740 * a tighter bound, but the array information is already lowered away.
1741 */
1742 brw_mark_surface_used(&prog_data->base,
1743 prog_data->base.binding_table.ubo_start +
1744 shader_prog->NumUniformBlocks - 1);
1745 }
1746
1747 if (const_offset_ir) {
1748 if (brw->gen >= 8) {
1749 /* Store the offset in a GRF so we can send-from-GRF. */
1750 offset = src_reg(this, glsl_type::int_type);
1751 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1752 } else {
1753 /* Immediates are fine on older generations since they'll be moved
1754 * to a (potentially fake) MRF at the generator level.
1755 */
1756 offset = src_reg(const_offset / 16);
1757 }
1758 } else {
1759 offset = src_reg(this, glsl_type::uint_type);
1760 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1761 }
1762
1763 if (brw->gen >= 7) {
1764 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1765 grf_offset.type = offset.type;
1766
1767 emit(MOV(grf_offset, offset));
1768
1769 emit(new(mem_ctx) vec4_instruction(this,
1770 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1771 dst_reg(packed_consts),
1772 surf_index,
1773 src_reg(grf_offset)));
1774 } else {
1775 vec4_instruction *pull =
1776 emit(new(mem_ctx) vec4_instruction(this,
1777 VS_OPCODE_PULL_CONSTANT_LOAD,
1778 dst_reg(packed_consts),
1779 surf_index,
1780 offset));
1781 pull->base_mrf = 14;
1782 pull->mlen = 1;
1783 }
1784
1785 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1786 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1787 const_offset % 16 / 4,
1788 const_offset % 16 / 4,
1789 const_offset % 16 / 4);
1790
1791 /* UBO bools are any nonzero int. We need to convert them to use the
1792 * value of true stored in ctx->Const.UniformBooleanTrue.
1793 */
1794 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1795 emit(CMP(result_dst, packed_consts, src_reg(0u),
1796 BRW_CONDITIONAL_NZ));
1797 } else {
1798 emit(MOV(result_dst, packed_consts));
1799 }
1800 break;
1801 }
1802
1803 case ir_binop_vector_extract:
1804 unreachable("should have been lowered by vec_index_to_cond_assign");
1805
1806 case ir_triop_fma:
1807 op[0] = fix_3src_operand(op[0]);
1808 op[1] = fix_3src_operand(op[1]);
1809 op[2] = fix_3src_operand(op[2]);
1810 /* Note that the instruction's argument order is reversed from GLSL
1811 * and the IR.
1812 */
1813 emit(MAD(result_dst, op[2], op[1], op[0]));
1814 break;
1815
1816 case ir_triop_lrp:
1817 emit_lrp(result_dst, op[0], op[1], op[2]);
1818 break;
1819
1820 case ir_triop_csel:
1821 unreachable("already handled above");
1822 break;
1823
1824 case ir_triop_bfi:
1825 op[0] = fix_3src_operand(op[0]);
1826 op[1] = fix_3src_operand(op[1]);
1827 op[2] = fix_3src_operand(op[2]);
1828 emit(BFI2(result_dst, op[0], op[1], op[2]));
1829 break;
1830
1831 case ir_triop_bitfield_extract:
1832 op[0] = fix_3src_operand(op[0]);
1833 op[1] = fix_3src_operand(op[1]);
1834 op[2] = fix_3src_operand(op[2]);
1835 /* Note that the instruction's argument order is reversed from GLSL
1836 * and the IR.
1837 */
1838 emit(BFE(result_dst, op[2], op[1], op[0]));
1839 break;
1840
1841 case ir_triop_vector_insert:
1842 unreachable("should have been lowered by lower_vector_insert");
1843
1844 case ir_quadop_bitfield_insert:
1845 unreachable("not reached: should be handled by "
1846 "bitfield_insert_to_bfm_bfi\n");
1847
1848 case ir_quadop_vector:
1849 unreachable("not reached: should be handled by lower_quadop_vector");
1850
1851 case ir_unop_pack_half_2x16:
1852 emit_pack_half_2x16(result_dst, op[0]);
1853 break;
1854 case ir_unop_unpack_half_2x16:
1855 emit_unpack_half_2x16(result_dst, op[0]);
1856 break;
1857 case ir_unop_unpack_unorm_4x8:
1858 emit_unpack_unorm_4x8(result_dst, op[0]);
1859 break;
1860 case ir_unop_unpack_snorm_4x8:
1861 emit_unpack_snorm_4x8(result_dst, op[0]);
1862 break;
1863 case ir_unop_pack_unorm_4x8:
1864 emit_pack_unorm_4x8(result_dst, op[0]);
1865 break;
1866 case ir_unop_pack_snorm_4x8:
1867 emit_pack_snorm_4x8(result_dst, op[0]);
1868 break;
1869 case ir_unop_pack_snorm_2x16:
1870 case ir_unop_pack_unorm_2x16:
1871 case ir_unop_unpack_snorm_2x16:
1872 case ir_unop_unpack_unorm_2x16:
1873 unreachable("not reached: should be handled by lower_packing_builtins");
1874 case ir_unop_unpack_half_2x16_split_x:
1875 case ir_unop_unpack_half_2x16_split_y:
1876 case ir_binop_pack_half_2x16_split:
1877 case ir_unop_interpolate_at_centroid:
1878 case ir_binop_interpolate_at_sample:
1879 case ir_binop_interpolate_at_offset:
1880 unreachable("not reached: should not occur in vertex shader");
1881 case ir_binop_ldexp:
1882 unreachable("not reached: should be handled by ldexp_to_arith()");
1883 }
1884 }
1885
1886
1887 void
1888 vec4_visitor::visit(ir_swizzle *ir)
1889 {
1890 src_reg src;
1891 int i = 0;
1892 int swizzle[4];
1893
1894 /* Note that this only handles swizzles in expressions, not those on the
1895 * left-hand side of an assignment, which use write masking instead. See
1896 * ir_assignment for that.
1897 */
1898
1899 ir->val->accept(this);
1900 src = this->result;
1901 assert(src.file != BAD_FILE);
1902
1903 for (i = 0; i < ir->type->vector_elements; i++) {
1904 switch (i) {
1905 case 0:
1906 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1907 break;
1908 case 1:
1909 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1910 break;
1911 case 2:
1912 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1913 break;
1914 case 3:
1915 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1916 break;
1917 }
1918 }
1919 for (; i < 4; i++) {
1920 /* Replicate the last channel out. */
1921 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1922 }
1923
1924 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1925
1926 this->result = src;
1927 }
1928
1929 void
1930 vec4_visitor::visit(ir_dereference_variable *ir)
1931 {
1932 const struct glsl_type *type = ir->type;
1933 dst_reg *reg = variable_storage(ir->var);
1934
1935 if (!reg) {
1936 fail("Failed to find variable storage for %s\n", ir->var->name);
1937 this->result = src_reg(brw_null_reg());
1938 return;
1939 }
1940
1941 this->result = src_reg(*reg);
1942
1943 /* System values get their swizzle from the dst_reg writemask */
1944 if (ir->var->data.mode == ir_var_system_value)
1945 return;
1946
1947 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1948 this->result.swizzle = swizzle_for_size(type->vector_elements);
1949 }
1950
1951
1952 int
1953 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1954 {
1955 /* Under normal circumstances array elements are stored consecutively, so
1956 * the stride is equal to the size of the array element.
1957 */
1958 return type_size(ir->type);
1959 }
1960
1961
1962 void
1963 vec4_visitor::visit(ir_dereference_array *ir)
1964 {
1965 ir_constant *constant_index;
1966 src_reg src;
1967 int array_stride = compute_array_stride(ir);
1968
1969 constant_index = ir->array_index->constant_expression_value();
1970
1971 ir->array->accept(this);
1972 src = this->result;
1973
1974 if (constant_index) {
1975 src.reg_offset += constant_index->value.i[0] * array_stride;
1976 } else {
1977 /* Variable index array dereference. It eats the "vec4" of the
1978 * base of the array and an index that offsets the Mesa register
1979 * index.
1980 */
1981 ir->array_index->accept(this);
1982
1983 src_reg index_reg;
1984
1985 if (array_stride == 1) {
1986 index_reg = this->result;
1987 } else {
1988 index_reg = src_reg(this, glsl_type::int_type);
1989
1990 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1991 }
1992
1993 if (src.reladdr) {
1994 src_reg temp = src_reg(this, glsl_type::int_type);
1995
1996 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1997
1998 index_reg = temp;
1999 }
2000
2001 src.reladdr = ralloc(mem_ctx, src_reg);
2002 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2003 }
2004
2005 /* If the type is smaller than a vec4, replicate the last channel out. */
2006 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2007 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2008 else
2009 src.swizzle = BRW_SWIZZLE_NOOP;
2010 src.type = brw_type_for_base_type(ir->type);
2011
2012 this->result = src;
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_dereference_record *ir)
2017 {
2018 unsigned int i;
2019 const glsl_type *struct_type = ir->record->type;
2020 int offset = 0;
2021
2022 ir->record->accept(this);
2023
2024 for (i = 0; i < struct_type->length; i++) {
2025 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2026 break;
2027 offset += type_size(struct_type->fields.structure[i].type);
2028 }
2029
2030 /* If the type is smaller than a vec4, replicate the last channel out. */
2031 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2032 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2033 else
2034 this->result.swizzle = BRW_SWIZZLE_NOOP;
2035 this->result.type = brw_type_for_base_type(ir->type);
2036
2037 this->result.reg_offset += offset;
2038 }
2039
2040 /**
2041 * We want to be careful in assignment setup to hit the actual storage
2042 * instead of potentially using a temporary like we might with the
2043 * ir_dereference handler.
2044 */
2045 static dst_reg
2046 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2047 {
2048 /* The LHS must be a dereference. If the LHS is a variable indexed array
2049 * access of a vector, it must be separated into a series of conditional moves
2050 * before reaching this point (see ir_vec_index_to_cond_assign).
2051 */
2052 assert(ir->as_dereference());
2053 ir_dereference_array *deref_array = ir->as_dereference_array();
2054 if (deref_array) {
2055 assert(!deref_array->array->type->is_vector());
2056 }
2057
2058 /* Use the rvalue deref handler for the most part. We'll ignore
2059 * swizzles in it and write swizzles using writemask, though.
2060 */
2061 ir->accept(v);
2062 return dst_reg(v->result);
2063 }
2064
2065 void
2066 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2067 const struct glsl_type *type,
2068 enum brw_predicate predicate)
2069 {
2070 if (type->base_type == GLSL_TYPE_STRUCT) {
2071 for (unsigned int i = 0; i < type->length; i++) {
2072 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2073 }
2074 return;
2075 }
2076
2077 if (type->is_array()) {
2078 for (unsigned int i = 0; i < type->length; i++) {
2079 emit_block_move(dst, src, type->fields.array, predicate);
2080 }
2081 return;
2082 }
2083
2084 if (type->is_matrix()) {
2085 const struct glsl_type *vec_type;
2086
2087 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2088 type->vector_elements, 1);
2089
2090 for (int i = 0; i < type->matrix_columns; i++) {
2091 emit_block_move(dst, src, vec_type, predicate);
2092 }
2093 return;
2094 }
2095
2096 assert(type->is_scalar() || type->is_vector());
2097
2098 dst->type = brw_type_for_base_type(type);
2099 src->type = dst->type;
2100
2101 dst->writemask = (1 << type->vector_elements) - 1;
2102
2103 src->swizzle = swizzle_for_size(type->vector_elements);
2104
2105 vec4_instruction *inst = emit(MOV(*dst, *src));
2106 inst->predicate = predicate;
2107
2108 dst->reg_offset++;
2109 src->reg_offset++;
2110 }
2111
2112
2113 /* If the RHS processing resulted in an instruction generating a
2114 * temporary value, and it would be easy to rewrite the instruction to
2115 * generate its result right into the LHS instead, do so. This ends
2116 * up reliably removing instructions where it can be tricky to do so
2117 * later without real UD chain information.
2118 */
2119 bool
2120 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2121 dst_reg dst,
2122 src_reg src,
2123 vec4_instruction *pre_rhs_inst,
2124 vec4_instruction *last_rhs_inst)
2125 {
2126 /* This could be supported, but it would take more smarts. */
2127 if (ir->condition)
2128 return false;
2129
2130 if (pre_rhs_inst == last_rhs_inst)
2131 return false; /* No instructions generated to work with. */
2132
2133 /* Make sure the last instruction generated our source reg. */
2134 if (src.file != GRF ||
2135 src.file != last_rhs_inst->dst.file ||
2136 src.reg != last_rhs_inst->dst.reg ||
2137 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2138 src.reladdr ||
2139 src.abs ||
2140 src.negate ||
2141 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2142 return false;
2143
2144 /* Check that the last instruction fully initialized the channels
2145 * we want to use, in the order we want to use them. We could
2146 * potentially reswizzle the operands of many instructions so that
2147 * we could handle out of order channels, but don't yet.
2148 */
2149
2150 for (unsigned i = 0; i < 4; i++) {
2151 if (dst.writemask & (1 << i)) {
2152 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2153 return false;
2154
2155 if (BRW_GET_SWZ(src.swizzle, i) != i)
2156 return false;
2157 }
2158 }
2159
2160 /* Success! Rewrite the instruction. */
2161 last_rhs_inst->dst.file = dst.file;
2162 last_rhs_inst->dst.reg = dst.reg;
2163 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2164 last_rhs_inst->dst.reladdr = dst.reladdr;
2165 last_rhs_inst->dst.writemask &= dst.writemask;
2166
2167 return true;
2168 }
2169
2170 void
2171 vec4_visitor::visit(ir_assignment *ir)
2172 {
2173 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2174 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2175
2176 if (!ir->lhs->type->is_scalar() &&
2177 !ir->lhs->type->is_vector()) {
2178 ir->rhs->accept(this);
2179 src_reg src = this->result;
2180
2181 if (ir->condition) {
2182 emit_bool_to_cond_code(ir->condition, &predicate);
2183 }
2184
2185 /* emit_block_move doesn't account for swizzles in the source register.
2186 * This should be ok, since the source register is a structure or an
2187 * array, and those can't be swizzled. But double-check to be sure.
2188 */
2189 assert(src.swizzle ==
2190 (ir->rhs->type->is_matrix()
2191 ? swizzle_for_size(ir->rhs->type->vector_elements)
2192 : BRW_SWIZZLE_NOOP));
2193
2194 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2195 return;
2196 }
2197
2198 /* Now we're down to just a scalar/vector with writemasks. */
2199 int i;
2200
2201 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2202 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2203
2204 ir->rhs->accept(this);
2205
2206 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2207
2208 src_reg src = this->result;
2209
2210 int swizzles[4];
2211 int first_enabled_chan = 0;
2212 int src_chan = 0;
2213
2214 assert(ir->lhs->type->is_vector() ||
2215 ir->lhs->type->is_scalar());
2216 dst.writemask = ir->write_mask;
2217
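/* Find the source component that feeds the first enabled destination
 * channel; the channels outside the writemask reuse it below, keeping every
 * swizzle component pointed at a channel the RHS actually wrote.
 */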
2218 for (int i = 0; i < 4; i++) {
2219 if (dst.writemask & (1 << i)) {
2220 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2221 break;
2222 }
2223 }
2224
2225 /* Swizzle a small RHS vector into the channels being written.
2226 *
2227 * GLSL IR treats write_mask as dictating how many channels are
2228 * present on the RHS, while in our instructions we need to make
2229 * those channels appear in the slots of the vec4 they're written to.
2230 */
2231 for (int i = 0; i < 4; i++) {
2232 if (dst.writemask & (1 << i))
2233 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2234 else
2235 swizzles[i] = first_enabled_chan;
2236 }
2237 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2238 swizzles[2], swizzles[3]);
2239
2240 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2241 return;
2242 }
2243
2244 if (ir->condition) {
2245 emit_bool_to_cond_code(ir->condition, &predicate);
2246 }
2247
2248 for (i = 0; i < type_size(ir->lhs->type); i++) {
2249 vec4_instruction *inst = emit(MOV(dst, src));
2250 inst->predicate = predicate;
2251
2252 dst.reg_offset++;
2253 src.reg_offset++;
2254 }
2255 }
2256
2257 void
2258 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2259 {
2260 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2261 foreach_in_list(ir_constant, field_value, &ir->components) {
2262 emit_constant_values(dst, field_value);
2263 }
2264 return;
2265 }
2266
2267 if (ir->type->is_array()) {
2268 for (unsigned int i = 0; i < ir->type->length; i++) {
2269 emit_constant_values(dst, ir->array_elements[i]);
2270 }
2271 return;
2272 }
2273
2274 if (ir->type->is_matrix()) {
2275 for (int i = 0; i < ir->type->matrix_columns; i++) {
2276 float *vec = &ir->value.f[i * ir->type->vector_elements];
2277
2278 for (int j = 0; j < ir->type->vector_elements; j++) {
2279 dst->writemask = 1 << j;
2280 dst->type = BRW_REGISTER_TYPE_F;
2281
2282 emit(MOV(*dst, src_reg(vec[j])));
2283 }
2284 dst->reg_offset++;
2285 }
2286 return;
2287 }
2288
2289 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2290
2291 for (int i = 0; i < ir->type->vector_elements; i++) {
2292 if (!(remaining_writemask & (1 << i)))
2293 continue;
2294
2295 dst->writemask = 1 << i;
2296 dst->type = brw_type_for_base_type(ir->type);
2297
2298 /* Find other components that match the one we're about to
2299 * write. Emits fewer instructions for things like vec4(0.5,
2300 * 1.5, 1.5, 1.5).
2301 */
2302 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2303 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2304 if (ir->value.b[i] == ir->value.b[j])
2305 dst->writemask |= (1 << j);
2306 } else {
2307 /* u, i, and f storage all line up, so no need for a
2308 * switch case for comparing each type.
2309 */
2310 if (ir->value.u[i] == ir->value.u[j])
2311 dst->writemask |= (1 << j);
2312 }
2313 }
2314
2315 switch (ir->type->base_type) {
2316 case GLSL_TYPE_FLOAT:
2317 emit(MOV(*dst, src_reg(ir->value.f[i])));
2318 break;
2319 case GLSL_TYPE_INT:
2320 emit(MOV(*dst, src_reg(ir->value.i[i])));
2321 break;
2322 case GLSL_TYPE_UINT:
2323 emit(MOV(*dst, src_reg(ir->value.u[i])));
2324 break;
2325 case GLSL_TYPE_BOOL:
2326 emit(MOV(*dst,
2327 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2328 : 0)));
2329 break;
2330 default:
2331 unreachable("Non-float/uint/int/bool constant");
2332 }
2333
2334 remaining_writemask &= ~dst->writemask;
2335 }
2336 dst->reg_offset++;
2337 }
2338
2339 void
2340 vec4_visitor::visit(ir_constant *ir)
2341 {
2342 dst_reg dst = dst_reg(this, ir->type);
2343 this->result = src_reg(dst);
2344
2345 emit_constant_values(&dst, ir);
2346 }
2347
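/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read, _increment,
 * _predecrement) to untyped surface read/atomic messages against the
 * counter's atomic buffer object surface.
 */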
2348 void
2349 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2350 {
2351 ir_dereference *deref = static_cast<ir_dereference *>(
2352 ir->actual_parameters.get_head());
2353 ir_variable *location = deref->variable_referenced();
2354 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2355 location->data.binding);
2356
2357 /* Calculate the surface offset */
2358 src_reg offset(this, glsl_type::uint_type);
2359 ir_dereference_array *deref_array = deref->as_dereference_array();
2360 if (deref_array) {
2361 deref_array->array_index->accept(this);
2362
2363 src_reg tmp(this, glsl_type::uint_type);
2364 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2365 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2366 } else {
2367 offset = location->data.atomic.offset;
2368 }
2369
2370 /* Emit the appropriate machine instruction */
2371 const char *callee = ir->callee->function_name();
2372 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2373
2374 if (!strcmp("__intrinsic_atomic_read", callee)) {
2375 emit_untyped_surface_read(surf_index, dst, offset);
2376
2377 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2378 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2379 src_reg(), src_reg());
2380
2381 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2382 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2383 src_reg(), src_reg());
2384 }
2385 }
2386
2387 void
2388 vec4_visitor::visit(ir_call *ir)
2389 {
2390 const char *callee = ir->callee->function_name();
2391
2392 if (!strcmp("__intrinsic_atomic_read", callee) ||
2393 !strcmp("__intrinsic_atomic_increment", callee) ||
2394 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2395 visit_atomic_counter_intrinsic(ir);
2396 } else {
2397 unreachable("Unsupported intrinsic.");
2398 }
2399 }
2400
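/**
 * Fetch the MCS (multisample control surface) data for a compressed
 * multisample texture, so it can be passed as an extra parameter to the
 * TXF_CMS message.
 */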
2401 src_reg
2402 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2403 {
2404 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2405 inst->base_mrf = 2;
2406 inst->mlen = 1;
2407 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2408 inst->dst.writemask = WRITEMASK_XYZW;
2409
2410 inst->src[1] = sampler;
2411
2412 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2413 int param_base = inst->base_mrf;
2414 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2415 int zero_mask = 0xf & ~coord_mask;
2416
2417 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2418 coordinate));
2419
2420 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2421 src_reg(0)));
2422
2423 emit(inst);
2424 return src_reg(inst->dst);
2425 }
2426
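/**
 * Returns true when the sampler index can't be encoded in the 4-bit sampler
 * field of the message descriptor (indices 16 and up, or indices not known
 * at compile time), so the message header must carry the sampler selection.
 * Pre-Haswell parts never take this path.
 */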
2427 static bool
2428 is_high_sampler(struct brw_context *brw, src_reg sampler)
2429 {
2430 if (brw->gen < 8 && !brw->is_haswell)
2431 return false;
2432
2433 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2434 }
2435
2436 void
2437 vec4_visitor::visit(ir_texture *ir)
2438 {
2439 uint32_t sampler =
2440 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2441
2442 ir_rvalue *nonconst_sampler_index =
2443 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2444
2445 /* Handle non-constant sampler array indexing */
2446 src_reg sampler_reg;
2447 if (nonconst_sampler_index) {
2448 /* The highest sampler which may be used by this operation is
2449 * the last element of the array. Mark it here, because the generator
2450 * doesn't have enough information to determine the bound.
2451 */
2452 uint32_t array_size = ir->sampler->as_dereference_array()
2453 ->array->type->array_size();
2454
2455 uint32_t max_used = sampler + array_size - 1;
2456 if (ir->op == ir_tg4 && brw->gen < 8) {
2457 max_used += prog_data->base.binding_table.gather_texture_start;
2458 } else {
2459 max_used += prog_data->base.binding_table.texture_start;
2460 }
2461
2462 brw_mark_surface_used(&prog_data->base, max_used);
2463
2464 /* Emit code to evaluate the actual indexing expression */
2465 nonconst_sampler_index->accept(this);
2466 dst_reg temp(this, glsl_type::uint_type);
2467 emit(ADD(temp, this->result, src_reg(sampler)))
2468 ->force_writemask_all = true;
2469 sampler_reg = src_reg(temp);
2470 } else {
2471 /* Single sampler, or constant array index; the indexing expression
2472 * is just an immediate.
2473 */
2474 sampler_reg = src_reg(sampler);
2475 }
2476
2477 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2478 * emitting anything other than setting up the constant result.
2479 */
2480 if (ir->op == ir_tg4) {
2481 ir_constant *chan = ir->lod_info.component->as_constant();
2482 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2483 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2484 dst_reg result(this, ir->type);
2485 this->result = src_reg(result);
2486 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2487 return;
2488 }
2489 }
2490
2491 /* Should be lowered by do_lower_texture_projection */
2492 assert(!ir->projector);
2493
2494 /* Should be lowered */
2495 assert(!ir->offset || !ir->offset->type->is_array());
2496
2497 /* Generate code to compute all the subexpression trees. This has to be
2498 * done before loading any values into MRFs for the sampler message since
2499 * generating these values may involve SEND messages that need the MRFs.
2500 */
2501 src_reg coordinate;
2502 if (ir->coordinate) {
2503 ir->coordinate->accept(this);
2504 coordinate = this->result;
2505 }
2506
2507 src_reg shadow_comparitor;
2508 if (ir->shadow_comparitor) {
2509 ir->shadow_comparitor->accept(this);
2510 shadow_comparitor = this->result;
2511 }
2512
2513 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2514 src_reg offset_value;
2515 if (has_nonconstant_offset) {
2516 ir->offset->accept(this);
2517 offset_value = src_reg(this->result);
2518 }
2519
2520 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2521 src_reg lod, dPdx, dPdy, sample_index, mcs;
2522 switch (ir->op) {
2523 case ir_tex:
2524 lod = src_reg(0.0f);
2525 lod_type = glsl_type::float_type;
2526 break;
2527 case ir_txf:
2528 case ir_txl:
2529 case ir_txs:
2530 ir->lod_info.lod->accept(this);
2531 lod = this->result;
2532 lod_type = ir->lod_info.lod->type;
2533 break;
2534 case ir_query_levels:
2535 lod = src_reg(0);
2536 lod_type = glsl_type::int_type;
2537 break;
2538 case ir_txf_ms:
2539 ir->lod_info.sample_index->accept(this);
2540 sample_index = this->result;
2541 sample_index_type = ir->lod_info.sample_index->type;
2542
2543 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2544 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2545 else
2546 mcs = src_reg(0u);
2547 break;
2548 case ir_txd:
2549 ir->lod_info.grad.dPdx->accept(this);
2550 dPdx = this->result;
2551
2552 ir->lod_info.grad.dPdy->accept(this);
2553 dPdy = this->result;
2554
2555 lod_type = ir->lod_info.grad.dPdx->type;
2556 break;
2557 case ir_txb:
2558 case ir_lod:
2559 case ir_tg4:
2560 break;
2561 }
2562
2563 enum opcode opcode;
2564 switch (ir->op) {
2565 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2566 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2567 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2568 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2569 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2570 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2571 case ir_tg4: opcode = has_nonconstant_offset
2572 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2573 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2574 case ir_txb:
2575 unreachable("TXB is not valid for vertex shaders.");
2576 case ir_lod:
2577 unreachable("LOD is not valid for vertex shaders.");
2578 default:
2579 unreachable("Unrecognized tex op");
2580 }
2581
2582 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2583
2584 if (ir->offset != NULL && !has_nonconstant_offset) {
2585 inst->offset =
2586 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2587 ir->offset->type->vector_elements);
2588 }
2589
2590 /* Stuff the channel select bits in the top of the texture offset */
2591 if (ir->op == ir_tg4)
2592 inst->offset |= gather_channel(ir, sampler) << 16;
2593
2594 /* The message header is necessary for:
2595 * - Gen4 (always)
2596 * - Gen9+ for selecting SIMD4x2
2597 * - Texel offsets
2598 * - Gather channel selection
2599 * - Sampler indices too large to fit in a 4-bit value.
2600 */
2601 inst->header_present =
2602 brw->gen < 5 || brw->gen >= 9 ||
2603 inst->offset != 0 || ir->op == ir_tg4 ||
2604 is_high_sampler(brw, sampler_reg);
2605 inst->base_mrf = 2;
2606 inst->mlen = inst->header_present + 1; /* always at least one */
2607 inst->dst = dst_reg(this, ir->type);
2608 inst->dst.writemask = WRITEMASK_XYZW;
2609 inst->shadow_compare = ir->shadow_comparitor != NULL;
2610
2611 inst->src[1] = sampler_reg;
2612
2613 /* MRF for the first parameter */
2614 int param_base = inst->base_mrf + inst->header_present;
2615
2616 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2617 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2618 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2619 } else {
2620 /* Load the coordinate */
2621 /* FINISHME: gl_clamp_mask and saturate */
2622 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2623 int zero_mask = 0xf & ~coord_mask;
2624
2625 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2626 coordinate));
2627
2628 if (zero_mask != 0) {
2629 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2630 src_reg(0)));
2631 }
2632 /* Load the shadow comparator */
2633 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2634 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2635 WRITEMASK_X),
2636 shadow_comparitor));
2637 inst->mlen++;
2638 }
2639
2640 /* Load the LOD info */
2641 if (ir->op == ir_tex || ir->op == ir_txl) {
2642 int mrf, writemask;
2643 if (brw->gen >= 5) {
2644 mrf = param_base + 1;
2645 if (ir->shadow_comparitor) {
2646 writemask = WRITEMASK_Y;
2647 /* mlen already incremented */
2648 } else {
2649 writemask = WRITEMASK_X;
2650 inst->mlen++;
2651 }
2652 } else /* brw->gen == 4 */ {
2653 mrf = param_base;
2654 writemask = WRITEMASK_W;
2655 }
2656 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2657 } else if (ir->op == ir_txf) {
2658 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2659 } else if (ir->op == ir_txf_ms) {
2660 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2661 sample_index));
2662 if (brw->gen >= 7) {
2663 /* MCS data is in the first channel of `mcs`, but we need to get it into
2664 * the .y channel of the second vec4 of params, so replicate .x across
2665 * the whole vec4 and then mask off everything except .y
2666 */
2667 mcs.swizzle = BRW_SWIZZLE_XXXX;
2668 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2669 mcs));
2670 }
2671 inst->mlen++;
2672 } else if (ir->op == ir_txd) {
2673 const glsl_type *type = lod_type;
2674
2675 if (brw->gen >= 5) {
2676 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2677 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2678 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2680 inst->mlen++;
2681
2682 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2683 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2684 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2685 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2686 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2687 inst->mlen++;
2688
2689 if (ir->shadow_comparitor) {
2690 emit(MOV(dst_reg(MRF, param_base + 2,
2691 ir->shadow_comparitor->type, WRITEMASK_Z),
2692 shadow_comparitor));
2693 }
2694 }
2695 } else /* brw->gen == 4 */ {
2696 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2697 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2698 inst->mlen += 2;
2699 }
2700 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2701 if (ir->shadow_comparitor) {
2702 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2703 shadow_comparitor));
2704 }
2705
2706 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2707 offset_value));
2708 inst->mlen++;
2709 }
2710 }
2711
2712 emit(inst);
2713
2714 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2715 * faces * layers, but the spec requires just layers.
2716 */
2717 if (ir->op == ir_txs) {
2718 glsl_type const *type = ir->sampler->type;
2719 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2720 type->sampler_array) {
2721 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2722 writemask(inst->dst, WRITEMASK_Z),
2723 src_reg(inst->dst), src_reg(6));
2724 }
2725 }
2726
2727 if (brw->gen == 6 && ir->op == ir_tg4) {
2728 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2729 }
2730
2731 swizzle_result(ir, src_reg(inst->dst), sampler);
2732 }
2733
2734 /**
2735 * Apply workarounds for Gen6 gather with UINT/SINT
2736 */
2737 void
2738 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2739 {
2740 if (!wa)
2741 return;
2742
2743 int width = (wa & WA_8BIT) ? 8 : 16;
2744 dst_reg dst_f = dst;
2745 dst_f.type = BRW_REGISTER_TYPE_F;
2746
2747 /* Convert from UNORM to UINT */
2748 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2749 emit(MOV(dst, src_reg(dst_f)));
2750
2751 if (wa & WA_SIGN) {
2752 /* Reinterpret the UINT value as a signed INT value by
2753 * shifting the sign bit into place, then shifting back
2754 * preserving sign.
2755 */
2756 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2757 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2758 }
2759 }
2760
2761 /**
2762 * Set up the gather channel based on the swizzle, for gather4.
2763 */
2764 uint32_t
2765 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2766 {
2767 ir_constant *chan = ir->lod_info.component->as_constant();
2768 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2769 switch (swiz) {
2770 case SWIZZLE_X: return 0;
2771 case SWIZZLE_Y:
2772 /* gather4 sampler is broken for green channel on RG32F --
2773 * we must ask for blue instead.
2774 */
2775 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2776 return 2;
2777 return 1;
2778 case SWIZZLE_Z: return 2;
2779 case SWIZZLE_W: return 3;
2780 default:
2781 unreachable("Not reached"); /* zero, one swizzles handled already */
2782 }
2783 }
2784
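/**
 * Apply the GL texture swizzle (key->tex.swizzles[sampler]) to the raw
 * sampler result: ZERO/ONE components become immediate 0.0f/1.0f writes and
 * the remaining components become one swizzled MOV.  txs, tg4, float-typed
 * results and no-op swizzles are copied through unchanged.
 */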
2785 void
2786 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2787 {
2788 int s = key->tex.swizzles[sampler];
2789
2790 this->result = src_reg(this, ir->type);
2791 dst_reg swizzled_result(this->result);
2792
2793 if (ir->op == ir_query_levels) {
2794 /* # levels is in .w */
2795 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2796 emit(MOV(swizzled_result, orig_val));
2797 return;
2798 }
2799
2800 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2801 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2802 emit(MOV(swizzled_result, orig_val));
2803 return;
2804 }
2805
2806
2807 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2808 int swizzle[4] = {0};
2809
2810 for (int i = 0; i < 4; i++) {
2811 switch (GET_SWZ(s, i)) {
2812 case SWIZZLE_ZERO:
2813 zero_mask |= (1 << i);
2814 break;
2815 case SWIZZLE_ONE:
2816 one_mask |= (1 << i);
2817 break;
2818 default:
2819 copy_mask |= (1 << i);
2820 swizzle[i] = GET_SWZ(s, i);
2821 break;
2822 }
2823 }
2824
2825 if (copy_mask) {
2826 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2827 swizzled_result.writemask = copy_mask;
2828 emit(MOV(swizzled_result, orig_val));
2829 }
2830
2831 if (zero_mask) {
2832 swizzled_result.writemask = zero_mask;
2833 emit(MOV(swizzled_result, src_reg(0.0f)));
2834 }
2835
2836 if (one_mask) {
2837 swizzled_result.writemask = one_mask;
2838 emit(MOV(swizzled_result, src_reg(1.0f)));
2839 }
2840 }
2841
2842 void
2843 vec4_visitor::visit(ir_return *)
2844 {
2845 unreachable("not reached");
2846 }
2847
2848 void
2849 vec4_visitor::visit(ir_discard *)
2850 {
2851 unreachable("not reached");
2852 }
2853
2854 void
2855 vec4_visitor::visit(ir_if *ir)
2856 {
2857 /* Don't point the annotation at the if statement, because then it plus
2858 * the then and else blocks get printed.
2859 */
2860 this->base_ir = ir->condition;
2861
2862 if (brw->gen == 6) {
2863 emit_if_gen6(ir);
2864 } else {
2865 enum brw_predicate predicate;
2866 emit_bool_to_cond_code(ir->condition, &predicate);
2867 emit(IF(predicate));
2868 }
2869
2870 visit_instructions(&ir->then_instructions);
2871
2872 if (!ir->else_instructions.is_empty()) {
2873 this->base_ir = ir->condition;
2874 emit(BRW_OPCODE_ELSE);
2875
2876 visit_instructions(&ir->else_instructions);
2877 }
2878
2879 this->base_ir = ir->condition;
2880 emit(BRW_OPCODE_ENDIF);
2881 }
2882
2883 void
2884 vec4_visitor::visit(ir_emit_vertex *)
2885 {
2886 unreachable("not reached");
2887 }
2888
2889 void
2890 vec4_visitor::visit(ir_end_primitive *)
2891 {
2892 unreachable("not reached");
2893 }
2894
2895 void
2896 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2897 dst_reg dst, src_reg offset,
2898 src_reg src0, src_reg src1)
2899 {
2900 unsigned mlen = 0;
2901
2902 /* Set the atomic operation offset. */
2903 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2904 mlen++;
2905
2906 /* Set the atomic operation arguments. */
2907 if (src0.file != BAD_FILE) {
2908 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2909 mlen++;
2910 }
2911
2912 if (src1.file != BAD_FILE) {
2913 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2914 mlen++;
2915 }
2916
2917 /* Emit the instruction. Note that this maps to the normal SIMD8
2918 * untyped atomic message on Ivy Bridge, but that's OK because
2919 * unused channels will be masked out.
2920 */
2921 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2922 src_reg(atomic_op), src_reg(surf_index));
2923 inst->base_mrf = 0;
2924 inst->mlen = mlen;
2925 }
2926
2927 void
2928 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2929 src_reg offset)
2930 {
2931 /* Set the surface read offset. */
2932 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2933
2934 /* Emit the instruction. Note that this maps to the normal SIMD8
2935 * untyped surface read message, but that's OK because unused
2936 * channels will be masked out.
2937 */
2938 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2939 dst, src_reg(surf_index));
2940 inst->base_mrf = 0;
2941 inst->mlen = 1;
2942 }
2943
2944 void
2945 vec4_visitor::emit_ndc_computation()
2946 {
2947 /* Get the position */
2948 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2949
2950 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2951 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2952 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2953
2954 current_annotation = "NDC";
2955 dst_reg ndc_w = ndc;
2956 ndc_w.writemask = WRITEMASK_W;
2957 src_reg pos_w = pos;
2958 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2959 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2960
2961 dst_reg ndc_xyz = ndc;
2962 ndc_xyz.writemask = WRITEMASK_XYZ;
2963
2964 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2965 }
2966
2967 void
2968 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2969 {
2970 if (brw->gen < 6 &&
2971 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2972 key->userclip_active || brw->has_negative_rhw_bug)) {
2973 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2974 dst_reg header1_w = header1;
2975 header1_w.writemask = WRITEMASK_W;
2976
2977 emit(MOV(header1, 0u));
2978
2979 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2980 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2981
2982 current_annotation = "Point size";
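/* psiz * 2^11, truncated to an integer and masked to bits 8..18, stores the
 * point size as an unsigned fixed-point value with 3 fractional bits in the
 * header's point-width field.
 */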
2983 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2984 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2985 }
2986
2987 if (key->userclip_active) {
2988 current_annotation = "Clipping flags";
2989 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2990 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2991
2992 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2993 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2994 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2995
2996 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2997 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2998 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2999 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3000 }
3001
3002 /* i965 clipping workaround:
3003 * 1) Test for -ve rhw
3004 * 2) If set,
3005 * set ndc = (0,0,0,0)
3006 * set ucp[6] = 1
3007 *
3008 * Later, clipping will detect ucp[6] and ensure the primitive is
3009 * clipped against all fixed planes.
3010 */
3011 if (brw->has_negative_rhw_bug) {
3012 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3013 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3014 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3015 vec4_instruction *inst;
3016 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3017 inst->predicate = BRW_PREDICATE_NORMAL;
3018 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3019 inst->predicate = BRW_PREDICATE_NORMAL;
3020 }
3021
3022 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3023 } else if (brw->gen < 6) {
3024 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3025 } else {
3026 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3027 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3028 dst_reg reg_w = reg;
3029 reg_w.writemask = WRITEMASK_W;
3030 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3031 }
3032 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3033 dst_reg reg_y = reg;
3034 reg_y.writemask = WRITEMASK_Y;
3035 reg_y.type = BRW_REGISTER_TYPE_D;
3036 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3037 }
3038 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3039 dst_reg reg_z = reg;
3040 reg_z.writemask = WRITEMASK_Z;
3041 reg_z.type = BRW_REGISTER_TYPE_D;
3042 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3043 }
3044 }
3045 }
3046
3047 void
3048 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3049 {
3050 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3051 *
3052 * "If a linked set of shaders forming the vertex stage contains no
3053 * static write to gl_ClipVertex or gl_ClipDistance, but the
3054 * application has requested clipping against user clip planes through
3055 * the API, then the coordinate written to gl_Position is used for
3056 * comparison against the user clip planes."
3057 *
3058 * This function is only called if the shader didn't write to
3059 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3060 * if the user wrote to it; otherwise we use gl_Position.
3061 */
3062 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3063 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3064 clip_vertex = VARYING_SLOT_POS;
3065 }
3066
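/* Each DP4 below evaluates one user clip plane equation against the clip
 * vertex, writing the result to the corresponding component of this slot
 * (four planes per clip-distance vec4).
 */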
3067 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3068 ++i) {
3069 reg.writemask = 1 << i;
3070 emit(DP4(reg,
3071 src_reg(output_reg[clip_vertex]),
3072 src_reg(this->userplane[i + offset])));
3073 }
3074 }
3075
3076 vec4_instruction *
3077 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3078 {
3079 assert (varying < VARYING_SLOT_MAX);
3080 reg.type = output_reg[varying].type;
3081 current_annotation = output_reg_annotation[varying];
3082 /* Copy the register, saturating if necessary */
3083 return emit(MOV(reg, src_reg(output_reg[varying])));
3084 }
3085
3086 void
3087 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3088 {
3089 reg.type = BRW_REGISTER_TYPE_F;
3090
3091 switch (varying) {
3092 case VARYING_SLOT_PSIZ:
3093 {
3094 /* PSIZ is always in slot 0, and is coupled with other flags. */
3095 current_annotation = "indices, point width, clip flags";
3096 emit_psiz_and_flags(reg);
3097 break;
3098 }
3099 case BRW_VARYING_SLOT_NDC:
3100 current_annotation = "NDC";
3101 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3102 break;
3103 case VARYING_SLOT_POS:
3104 current_annotation = "gl_Position";
3105 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3106 break;
3107 case VARYING_SLOT_EDGE:
3108 /* This is present when doing unfilled polygons. We're supposed to copy
3109 * the edge flag from the user-provided vertex array
3110 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3111 * of that attribute (starts as 1.0f). This is then used in clipping to
3112 * determine which edges should be drawn as wireframe.
3113 */
3114 current_annotation = "edge flag";
3115 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3116 glsl_type::float_type, WRITEMASK_XYZW))));
3117 break;
3118 case BRW_VARYING_SLOT_PAD:
3119 /* No need to write to this slot */
3120 break;
3121 case VARYING_SLOT_COL0:
3122 case VARYING_SLOT_COL1:
3123 case VARYING_SLOT_BFC0:
3124 case VARYING_SLOT_BFC1: {
3125 /* These built-in varyings are only supported in compatibility mode,
3126 * and we only support GS in core profile. So, this must be a vertex
3127 * shader.
3128 */
3129 assert(stage == MESA_SHADER_VERTEX);
3130 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3131 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3132 inst->saturate = true;
3133 break;
3134 }
3135
3136 default:
3137 emit_generic_urb_slot(reg, varying);
3138 break;
3139 }
3140 }
3141
3142 static int
3143 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3144 {
3145 if (brw->gen >= 6) {
3146 /* URB data written (does not include the message header reg) must
3147 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3148 * section 5.4.3.2.2: URB_INTERLEAVED.
3149 *
3150 * URB entries are allocated on a multiple of 1024 bits, so an
3151 * extra 128 bits written here to make the end align to 256 is
3152 * no problem.
3153 */
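/* mlen includes the message header register, so the data portion is
 * mlen - 1; rounding mlen up to an odd value keeps that data portion an
 * even number of registers.
 */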
3154 if ((mlen % 2) != 1)
3155 mlen++;
3156 }
3157
3158 return mlen;
3159 }
3160
3161
3162 /**
3163 * Generates the VUE payload plus the necessary URB write instructions to
3164 * output it.
3165 *
3166 * The VUE layout is documented in Volume 2a.
3167 */
3168 void
3169 vec4_visitor::emit_vertex()
3170 {
3171 /* MRF 0 is reserved for the debugger, so start with message header
3172 * in MRF 1.
3173 */
3174 int base_mrf = 1;
3175 int mrf = base_mrf;
3176 /* In the process of generating our URB write message contents, we
3177 * may need to unspill a register or load from an array. Those
3178 * reads would use MRFs 14-15.
3179 */
3180 int max_usable_mrf = 13;
3181
3182 /* The following assertion verifies that max_usable_mrf causes an
3183 * even-numbered amount of URB write data, which will meet gen6's
3184 * requirements for length alignment.
3185 */
3186 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3187
3188 /* First mrf is the g0-based message header containing URB handles and
3189 * such.
3190 */
3191 emit_urb_write_header(mrf++);
3192
3193 if (brw->gen < 6) {
3194 emit_ndc_computation();
3195 }
3196
3197 /* Lower legacy ff and ClipVertex clipping to clip distances */
3198 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3199 current_annotation = "user clip distances";
3200
3201 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3202 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3203
3204 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3205 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3206 }
3207
3208 /* We may need to split this up into several URB writes, so do them in a
3209 * loop.
3210 */
3211 int slot = 0;
3212 bool complete = false;
3213 do {
3214 /* URB offset is in URB row increments, and each of our MRFs is half of
3215 * one of those, since we're doing interleaved writes.
3216 */
3217 int offset = slot / 2;
3218
3219 mrf = base_mrf + 1;
3220 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3221 emit_urb_slot(dst_reg(MRF, mrf++),
3222 prog_data->vue_map.slot_to_varying[slot]);
3223
3224 /* If this was max_usable_mrf, we can't fit anything more into this
3225 * URB WRITE.
3226 */
3227 if (mrf > max_usable_mrf) {
3228 slot++;
3229 break;
3230 }
3231 }
3232
3233 complete = slot >= prog_data->vue_map.num_slots;
3234 current_annotation = "URB write";
3235 vec4_instruction *inst = emit_urb_write_opcode(complete);
3236 inst->base_mrf = base_mrf;
3237 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3238 inst->offset += offset;
3239 } while (!complete);
3240 }
3241
3242
3243 src_reg
3244 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3245 src_reg *reladdr, int reg_offset)
3246 {
3247 /* Because we store the values to scratch interleaved like our
3248 * vertex data, we need to scale the vec4 index by 2.
3249 */
3250 int message_header_scale = 2;
3251
3252 /* Pre-gen6, the message header uses byte offsets instead of vec4
3253 * (16-byte) offset units.
3254 */
3255 if (brw->gen < 6)
3256 message_header_scale *= 16;
3257
3258 if (reladdr) {
3259 src_reg index = src_reg(this, glsl_type::int_type);
3260
3261 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3262 src_reg(reg_offset)));
3263 emit_before(block, inst, MUL(dst_reg(index), index,
3264 src_reg(message_header_scale)));
3265
3266 return index;
3267 } else {
3268 return src_reg(reg_offset * message_header_scale);
3269 }
3270 }
3271
3272 src_reg
3273 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3274 src_reg *reladdr, int reg_offset)
3275 {
3276 if (reladdr) {
3277 src_reg index = src_reg(this, glsl_type::int_type);
3278
3279 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3280 src_reg(reg_offset)));
3281
3282 /* Pre-gen6, the message header uses byte offsets instead of vec4
3283 * (16-byte) offset units.
3284 */
3285 if (brw->gen < 6) {
3286 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3287 }
3288
3289 return index;
3290 } else if (brw->gen >= 8) {
3291 /* Store the offset in a GRF so we can send-from-GRF. */
3292 src_reg offset = src_reg(this, glsl_type::int_type);
3293 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3294 return offset;
3295 } else {
3296 int message_header_scale = brw->gen < 6 ? 16 : 1;
3297 return src_reg(reg_offset * message_header_scale);
3298 }
3299 }
3300
3301 /**
3302 * Emits an instruction before @inst to load the value named by @orig_src
3303 * from scratch space at @base_offset to @temp.
3304 *
3305 * @base_offset is measured in 32-byte units (the size of a register).
3306 */
3307 void
3308 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3309 dst_reg temp, src_reg orig_src,
3310 int base_offset)
3311 {
3312 int reg_offset = base_offset + orig_src.reg_offset;
3313 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3314 reg_offset);
3315
3316 emit_before(block, inst, SCRATCH_READ(temp, index));
3317 }
3318
3319 /**
3320 * Emits an instruction after @inst to store the value to be written
3321 * to @orig_dst to scratch space at @base_offset, from @temp.
3322 *
3323 * @base_offset is measured in 32-byte units (the size of a register).
3324 */
3325 void
3326 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3327 int base_offset)
3328 {
3329 int reg_offset = base_offset + inst->dst.reg_offset;
3330 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3331 reg_offset);
3332
3333 /* Create a temporary register to store *inst's result in.
3334 *
3335 * We have to be careful in MOVing from our temporary result register in
3336 * the scratch write. If we swizzle from channels of the temporary that
3337 * weren't initialized, it will confuse live interval analysis, which will
3338 * make spilling fail to make progress.
3339 */
3340 src_reg temp = src_reg(this, glsl_type::vec4_type);
3341 temp.type = inst->dst.type;
3342 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3343 int swizzles[4];
3344 for (int i = 0; i < 4; i++)
3345 if (inst->dst.writemask & (1 << i))
3346 swizzles[i] = i;
3347 else
3348 swizzles[i] = first_writemask_chan;
3349 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3350 swizzles[2], swizzles[3]);
3351
3352 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3353 inst->dst.writemask));
3354 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3355 write->predicate = inst->predicate;
3356 write->ir = inst->ir;
3357 write->annotation = inst->annotation;
3358 inst->insert_after(block, write);
3359
3360 inst->dst.file = temp.file;
3361 inst->dst.reg = temp.reg;
3362 inst->dst.reg_offset = temp.reg_offset;
3363 inst->dst.reladdr = NULL;
3364 }
3365
3366 /**
3367 * We can't generally support array access in GRF space, because a
3368 * single instruction's destination can only span 2 contiguous
3369 * registers. So, we send all GRF arrays that get variable index
3370 * access to scratch space.
3371 */
3372 void
3373 vec4_visitor::move_grf_array_access_to_scratch()
3374 {
3375 int scratch_loc[this->virtual_grf_count];
3376 memset(scratch_loc, -1, sizeof(scratch_loc));
3377
3378 /* First, calculate the set of virtual GRFs that need to be punted
3379 * to scratch due to having any array access on them, and where in
3380 * scratch.
3381 */
3382 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3383 if (inst->dst.file == GRF && inst->dst.reladdr &&
3384 scratch_loc[inst->dst.reg] == -1) {
3385 scratch_loc[inst->dst.reg] = c->last_scratch;
3386 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3387 }
3388
3389 for (int i = 0 ; i < 3; i++) {
3390 src_reg *src = &inst->src[i];
3391
3392 if (src->file == GRF && src->reladdr &&
3393 scratch_loc[src->reg] == -1) {
3394 scratch_loc[src->reg] = c->last_scratch;
3395 c->last_scratch += this->virtual_grf_sizes[src->reg];
3396 }
3397 }
3398 }
3399
3400 /* Now, for anything that will be accessed through scratch, rewrite
3401 * it to load/store. Note that this is a _safe list walk, because
3402 * we may generate a new scratch_write instruction after the one
3403 * we're processing.
3404 */
3405 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3406 /* Set up the annotation tracking for new generated instructions. */
3407 base_ir = inst->ir;
3408 current_annotation = inst->annotation;
3409
3410 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3411 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3412 }
3413
3414 for (int i = 0 ; i < 3; i++) {
3415 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3416 continue;
3417
3418 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3419
3420 emit_scratch_read(block, inst, temp, inst->src[i],
3421 scratch_loc[inst->src[i].reg]);
3422
3423 inst->src[i].file = temp.file;
3424 inst->src[i].reg = temp.reg;
3425 inst->src[i].reg_offset = temp.reg_offset;
3426 inst->src[i].reladdr = NULL;
3427 }
3428 }
3429 }
3430
3431 /**
3432 * Emits an instruction before @inst to load the value named by @orig_src
3433 * from the pull constant buffer (surface) at @base_offset to @temp.
3434 */
3435 void
3436 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3437 dst_reg temp, src_reg orig_src,
3438 int base_offset)
3439 {
3440 int reg_offset = base_offset + orig_src.reg_offset;
3441 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3442 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3443 reg_offset);
3444 vec4_instruction *load;
3445
3446 if (brw->gen >= 7) {
3447 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3448 grf_offset.type = offset.type;
3449 emit_before(block, inst, MOV(grf_offset, offset));
3450
3451 load = new(mem_ctx) vec4_instruction(this,
3452 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3453 temp, index, src_reg(grf_offset));
3454 } else {
3455 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3456 temp, index, offset);
3457 load->base_mrf = 14;
3458 load->mlen = 1;
3459 }
3460 emit_before(block, inst, load);
3461 }
3462
3463 /**
3464 * Implements array access of uniforms by inserting a
3465 * PULL_CONSTANT_LOAD instruction.
3466 *
3467 * Unlike temporary GRF array access (where we don't support it due to
3468 * the difficulty of doing relative addressing on instruction
3469 * destinations), we could potentially do array access of uniforms
3470 * that were loaded in GRF space as push constants. In real-world
3471 * usage we've seen, though, the arrays being used are always larger
3472 * than we could load as push constants, so just always move all
3473 * uniform array access out to a pull constant buffer.
3474 */
3475 void
3476 vec4_visitor::move_uniform_array_access_to_pull_constants()
3477 {
3478 int pull_constant_loc[this->uniforms];
3479 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3480 bool nested_reladdr;
3481
3482 /* Walk through and find array access of uniforms. Put a copy of that
3483 * uniform in the pull constant buffer.
3484 *
3485 * Note that we don't move constant-indexed accesses to arrays. No
3486 * testing has been done of the performance impact of this choice.
3487 */
3488 do {
3489 nested_reladdr = false;
3490
3491 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3492 for (int i = 0 ; i < 3; i++) {
3493 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3494 continue;
3495
3496 int uniform = inst->src[i].reg;
3497
3498 if (inst->src[i].reladdr->reladdr)
3499 nested_reladdr = true; /* will need another pass */
3500
3501 /* If this array isn't already present in the pull constant buffer,
3502 * add it.
3503 */
3504 if (pull_constant_loc[uniform] == -1) {
3505 const gl_constant_value **values =
3506 &stage_prog_data->param[uniform * 4];
3507
3508 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3509
3510 assert(uniform < uniform_array_size);
3511 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3512 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3513 = values[j];
3514 }
3515 }
3516
3517 /* Set up the annotation tracking for new generated instructions. */
3518 base_ir = inst->ir;
3519 current_annotation = inst->annotation;
3520
3521 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3522
3523 emit_pull_constant_load(block, inst, temp, inst->src[i],
3524 pull_constant_loc[uniform]);
3525
3526 inst->src[i].file = temp.file;
3527 inst->src[i].reg = temp.reg;
3528 inst->src[i].reg_offset = temp.reg_offset;
3529 inst->src[i].reladdr = NULL;
3530 }
3531 }
3532 } while (nested_reladdr);
3533
3534 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3535 * no need to track them as larger-than-vec4 objects. This will be
3536 * relied on in cutting out unused uniform vectors from push
3537 * constants.
3538 */
3539 split_uniform_registers();
3540 }
3541
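/**
 * If a UD source register carries a negate modifier, apply the negation
 * through an explicit MOV into a fresh unsigned temporary so that later
 * instructions see a plain, unmodified UD operand.
 */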
3542 void
3543 vec4_visitor::resolve_ud_negate(src_reg *reg)
3544 {
3545 if (reg->type != BRW_REGISTER_TYPE_UD ||
3546 !reg->negate)
3547 return;
3548
3549 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3550 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3551 *reg = temp;
3552 }
3553
3554 /**
3555 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3556 *
3557 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3558 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3559 */
3560 void
3561 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3562 {
3563 assert(brw->gen <= 5);
3564
3565 if (!rvalue->type->is_boolean())
3566 return;
3567
3568 src_reg and_result = src_reg(this, rvalue->type);
3569 src_reg neg_result = src_reg(this, rvalue->type);
3570 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3571 emit(MOV(dst_reg(neg_result), negate(and_result)));
3572 *reg = neg_result;
3573 }
3574
3575 vec4_visitor::vec4_visitor(struct brw_context *brw,
3576 struct brw_vec4_compile *c,
3577 struct gl_program *prog,
3578 const struct brw_vue_prog_key *key,
3579 struct brw_vue_prog_data *prog_data,
3580 struct gl_shader_program *shader_prog,
3581 gl_shader_stage stage,
3582 void *mem_ctx,
3583 bool debug_flag,
3584 bool no_spills,
3585 shader_time_shader_type st_base,
3586 shader_time_shader_type st_written,
3587 shader_time_shader_type st_reset)
3588 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3589 c(c),
3590 key(key),
3591 prog_data(prog_data),
3592 sanity_param_count(0),
3593 fail_msg(NULL),
3594 first_non_payload_grf(0),
3595 need_all_constants_in_pull_buffer(false),
3596 debug_flag(debug_flag),
3597 no_spills(no_spills),
3598 st_base(st_base),
3599 st_written(st_written),
3600 st_reset(st_reset)
3601 {
3602 this->mem_ctx = mem_ctx;
3603 this->failed = false;
3604
3605 this->base_ir = NULL;
3606 this->current_annotation = NULL;
3607 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3608
3609 this->variable_ht = hash_table_ctor(0,
3610 hash_table_pointer_hash,
3611 hash_table_pointer_compare);
3612
3613 this->virtual_grf_start = NULL;
3614 this->virtual_grf_end = NULL;
3615 this->virtual_grf_sizes = NULL;
3616 this->virtual_grf_count = 0;
3617 this->virtual_grf_reg_map = NULL;
3618 this->virtual_grf_reg_count = 0;
3619 this->virtual_grf_array_size = 0;
3620 this->live_intervals = NULL;
3621
3622 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3623
3624 this->uniforms = 0;
3625
3626 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3627 * at least one. See setup_uniforms() in brw_vec4.cpp.
3628 */
3629 this->uniform_array_size = 1;
3630 if (prog_data) {
3631 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3632 }
3633
3634 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3635 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3636 }
3637
3638 vec4_visitor::~vec4_visitor()
3639 {
3640 hash_table_dtor(this->variable_ht);
3641 }
3642
3643
3644 void
3645 vec4_visitor::fail(const char *format, ...)
3646 {
3647 va_list va;
3648 char *msg;
3649
3650 if (failed)
3651 return;
3652
3653 failed = true;
3654
3655 va_start(va, format);
3656 msg = ralloc_vasprintf(mem_ctx, format, va);
3657 va_end(va);
3658 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3659
3660 this->fail_msg = msg;
3661
3662 if (debug_flag) {
3663 fprintf(stderr, "%s", msg);
3664 }
3665 }
3666
3667 } /* namespace brw */