i965/vec4: Add and use byte-MOV instruction for unpack 4x8.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
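   /* 0x00, 0x60, 0x70 and 0x78 are the vector-float (VF) encodings of
    * 0.0, 8.0, 16.0 and 24.0; with the integer-typed destination this is the
    * type-converting MOV mentioned above, producing the shift counts
    * <0, 8, 16, 24>.
    */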
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
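   /* Broadcast the packed dword across all four channels so that each
    * channel is shifted by a different amount from the vector above.
    */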
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
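   /* Retyping the result to UB makes the byte-MOV read only the low byte of
    * each 32-bit channel -- the byte selected by the shift above -- and
    * convert it to float in f.
    */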
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
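   /* As in the unorm case, but with a signed byte (B) source the byte-MOV
    * sign-extends, giving values in [-128, 127]; the scale by 1/127 and the
    * clamp below map them into [-1, 1].
    */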
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
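   /* VEC4_OPCODE_PACK_BYTES takes the low byte of each 32-bit channel of the
    * uint source and packs the four bytes into a single dword, the packed
    * 4x8 result.
    */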
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms] = 0;
760 /* Add each of the unique swizzled channels of the element.
761 * This will end up matching the size of the glsl_type of this field.
762 */
763 int last_swiz = -1;
764 for (unsigned int j = 0; j < 4; j++) {
765 int swiz = GET_SWZ(slots[i].swizzle, j);
766 last_swiz = swiz;
767
768 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
769 assert(this->uniforms < uniform_array_size);
770 if (swiz <= last_swiz)
771 this->uniform_vector_size[this->uniforms]++;
772 }
773 this->uniforms++;
774 }
775 }
776
777 dst_reg *
778 vec4_visitor::variable_storage(ir_variable *var)
779 {
780 return (dst_reg *)hash_table_find(this->variable_ht, var);
781 }
782
783 void
784 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
785 enum brw_predicate *predicate)
786 {
787 ir_expression *expr = ir->as_expression();
788
789 *predicate = BRW_PREDICATE_NORMAL;
790
791 if (expr && expr->operation != ir_binop_ubo_load) {
792 src_reg op[3];
793 vec4_instruction *inst;
794
795 assert(expr->get_num_operands() <= 3);
796 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
797 expr->operands[i]->accept(this);
798 op[i] = this->result;
799
800 resolve_ud_negate(&op[i]);
801 }
802
803 switch (expr->operation) {
804 case ir_unop_logic_not:
805 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
806 inst->conditional_mod = BRW_CONDITIONAL_Z;
807 break;
808
809 case ir_binop_logic_xor:
810 if (brw->gen <= 5) {
811 src_reg temp = src_reg(this, ir->type);
812 emit(XOR(dst_reg(temp), op[0], op[1]));
813 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
814 } else {
815 inst = emit(XOR(dst_null_d(), op[0], op[1]));
816 }
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 break;
819
820 case ir_binop_logic_or:
821 if (brw->gen <= 5) {
822 src_reg temp = src_reg(this, ir->type);
823 emit(OR(dst_reg(temp), op[0], op[1]));
824 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
825 } else {
826 inst = emit(OR(dst_null_d(), op[0], op[1]));
827 }
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 if (brw->gen <= 5) {
833 src_reg temp = src_reg(this, ir->type);
834 emit(AND(dst_reg(temp), op[0], op[1]));
835 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
836 } else {
837 inst = emit(AND(dst_null_d(), op[0], op[1]));
838 }
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 break;
841
842 case ir_unop_f2b:
843 if (brw->gen >= 6) {
844 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
845 } else {
846 inst = emit(MOV(dst_null_f(), op[0]));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 break;
850
851 case ir_unop_i2b:
852 if (brw->gen >= 6) {
853 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
854 } else {
855 inst = emit(MOV(dst_null_d(), op[0]));
856 inst->conditional_mod = BRW_CONDITIONAL_NZ;
857 }
858 break;
859
860 case ir_binop_all_equal:
861 if (brw->gen <= 5) {
862 resolve_bool_comparison(expr->operands[0], &op[0]);
863 resolve_bool_comparison(expr->operands[1], &op[1]);
864 }
865 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
867 break;
868
869 case ir_binop_any_nequal:
870 if (brw->gen <= 5) {
871 resolve_bool_comparison(expr->operands[0], &op[0]);
872 resolve_bool_comparison(expr->operands[1], &op[1]);
873 }
874 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
876 break;
877
878 case ir_unop_any:
879 if (brw->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 }
882 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
883 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
884 break;
885
886 case ir_binop_greater:
887 case ir_binop_gequal:
888 case ir_binop_less:
889 case ir_binop_lequal:
890 case ir_binop_equal:
891 case ir_binop_nequal:
892 if (brw->gen <= 5) {
893 resolve_bool_comparison(expr->operands[0], &op[0]);
894 resolve_bool_comparison(expr->operands[1], &op[1]);
895 }
896 emit(CMP(dst_null_d(), op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 break;
899
900 case ir_triop_csel: {
901 /* Expand the boolean condition into the flag register. */
902 inst = emit(MOV(dst_null_d(), op[0]));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904
905 /* Select which boolean to return. */
906 dst_reg temp(this, expr->operands[1]->type);
907 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
908 inst->predicate = BRW_PREDICATE_NORMAL;
909
910 /* Expand the result to a condition code. */
911 inst = emit(MOV(dst_null_d(), src_reg(temp)));
912 inst->conditional_mod = BRW_CONDITIONAL_NZ;
913 break;
914 }
915
916 default:
917 unreachable("not reached");
918 }
919 return;
920 }
921
922 ir->accept(this);
923
924 resolve_ud_negate(&this->result);
925
926 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
927 inst->conditional_mod = BRW_CONDITIONAL_NZ;
928 }
929
930 /**
931 * Emit a gen6 IF statement with the comparison folded into the IF
932 * instruction.
933 */
934 void
935 vec4_visitor::emit_if_gen6(ir_if *ir)
936 {
937 ir_expression *expr = ir->condition->as_expression();
938
939 if (expr && expr->operation != ir_binop_ubo_load) {
940 src_reg op[3];
941 dst_reg temp;
942
943 assert(expr->get_num_operands() <= 3);
944 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
945 expr->operands[i]->accept(this);
946 op[i] = this->result;
947 }
948
949 switch (expr->operation) {
950 case ir_unop_logic_not:
951 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
952 return;
953
954 case ir_binop_logic_xor:
955 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
956 return;
957
958 case ir_binop_logic_or:
959 temp = dst_reg(this, glsl_type::bool_type);
960 emit(OR(temp, op[0], op[1]));
961 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
962 return;
963
964 case ir_binop_logic_and:
965 temp = dst_reg(this, glsl_type::bool_type);
966 emit(AND(temp, op[0], op[1]));
967 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_f2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_unop_i2b:
975 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_greater:
979 case ir_binop_gequal:
980 case ir_binop_less:
981 case ir_binop_lequal:
982 case ir_binop_equal:
983 case ir_binop_nequal:
984 emit(IF(op[0], op[1],
985 brw_conditional_for_comparison(expr->operation)));
986 return;
987
988 case ir_binop_all_equal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
990 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
991 return;
992
993 case ir_binop_any_nequal:
994 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_unop_any:
999 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1001 return;
1002
1003 case ir_triop_csel: {
1004 /* Expand the boolean condition into the flag register. */
1005 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1006 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1007
1008 /* Select which boolean to return. */
1009 dst_reg temp(this, expr->operands[1]->type);
1010 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1011 inst->predicate = BRW_PREDICATE_NORMAL;
1012
1013 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1014 return;
1015 }
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020 return;
1021 }
1022
1023 ir->condition->accept(this);
1024
1025 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_variable *ir)
1030 {
1031 dst_reg *reg = NULL;
1032
1033 if (variable_storage(ir))
1034 return;
1035
1036 switch (ir->data.mode) {
1037 case ir_var_shader_in:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1040 break;
1041
1042 case ir_var_shader_out:
1043 assert(ir->data.location != -1);
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045
1046 for (int i = 0; i < type_size(ir->type); i++) {
1047 output_reg[ir->data.location + i] = *reg;
1048 output_reg[ir->data.location + i].reg_offset = i;
1049 output_reg[ir->data.location + i].type =
1050 brw_type_for_base_type(ir->type->get_scalar_type());
1051 output_reg_annotation[ir->data.location + i] = ir->name;
1052 }
1053 break;
1054
1055 case ir_var_auto:
1056 case ir_var_temporary:
1057 reg = new(mem_ctx) dst_reg(this, ir->type);
1058 break;
1059
1060 case ir_var_uniform:
1061 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1062
1063 /* Thanks to the lower_ubo_reference pass, we will see only
1064 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1065 * variables, so no need for them to be in variable_ht.
1066 *
1067 * Some uniforms, such as samplers and atomic counters, have no actual
1068 * storage, so we should ignore them.
1069 */
1070 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1071 return;
1072
1073 /* Track how big the whole uniform variable is, in case we need to put a
1074 * copy of its data into pull constants for array access.
1075 */
1076 assert(this->uniforms < uniform_array_size);
1077 this->uniform_size[this->uniforms] = type_size(ir->type);
1078
1079 if (!strncmp(ir->name, "gl_", 3)) {
1080 setup_builtin_uniform_values(ir);
1081 } else {
1082 setup_uniform_values(ir);
1083 }
1084 break;
1085
1086 case ir_var_system_value:
1087 reg = make_reg_for_system_value(ir);
1088 break;
1089
1090 default:
1091 unreachable("not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 /* We don't want debugging output to print the whole body of the
1102 * loop as the annotation.
1103 */
1104 this->base_ir = NULL;
1105
1106 emit(BRW_OPCODE_DO);
1107
1108 visit_instructions(&ir->body_instructions);
1109
1110 emit(BRW_OPCODE_WHILE);
1111 }
1112
1113 void
1114 vec4_visitor::visit(ir_loop_jump *ir)
1115 {
1116 switch (ir->mode) {
1117 case ir_loop_jump::jump_break:
1118 emit(BRW_OPCODE_BREAK);
1119 break;
1120 case ir_loop_jump::jump_continue:
1121 emit(BRW_OPCODE_CONTINUE);
1122 break;
1123 }
1124 }
1125
1126
1127 void
1128 vec4_visitor::visit(ir_function_signature *)
1129 {
1130 unreachable("not reached");
1131 }
1132
1133 void
1134 vec4_visitor::visit(ir_function *ir)
1135 {
1136 /* Ignore function bodies other than main() -- we shouldn't see calls to
1137 * them since they should all be inlined.
1138 */
1139 if (strcmp(ir->name, "main") == 0) {
1140 const ir_function_signature *sig;
1141 exec_list empty;
1142
1143 sig = ir->matching_signature(NULL, &empty, false);
1144
1145 assert(sig);
1146
1147 visit_instructions(&sig->body);
1148 }
1149 }
1150
1151 bool
1152 vec4_visitor::try_emit_mad(ir_expression *ir)
1153 {
1154 /* 3-src instructions were introduced in gen6. */
1155 if (brw->gen < 6)
1156 return false;
1157
1158 /* MAD can only handle floating-point data. */
1159 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1160 return false;
1161
1162 ir_rvalue *nonmul = ir->operands[1];
1163 ir_expression *mul = ir->operands[0]->as_expression();
1164
1165 bool mul_negate = false, mul_abs = false;
1166 if (mul && mul->operation == ir_unop_abs) {
1167 mul = mul->operands[0]->as_expression();
1168 mul_abs = true;
1169 } else if (mul && mul->operation == ir_unop_neg) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_negate = true;
1172 }
1173
1174 if (!mul || mul->operation != ir_binop_mul) {
1175 nonmul = ir->operands[0];
1176 mul = ir->operands[1]->as_expression();
1177
1178 if (mul && mul->operation == ir_unop_abs) {
1179 mul = mul->operands[0]->as_expression();
1180 mul_abs = true;
1181 } else if (mul && mul->operation == ir_unop_neg) {
1182 mul = mul->operands[0]->as_expression();
1183 mul_negate = true;
1184 }
1185
1186 if (!mul || mul->operation != ir_binop_mul)
1187 return false;
1188 }
1189
1190 nonmul->accept(this);
1191 src_reg src0 = fix_3src_operand(this->result);
1192
1193 mul->operands[0]->accept(this);
1194 src_reg src1 = fix_3src_operand(this->result);
1195 src1.negate ^= mul_negate;
1196 src1.abs = mul_abs;
1197 if (mul_abs)
1198 src1.negate = false;
1199
1200 mul->operands[1]->accept(this);
1201 src_reg src2 = fix_3src_operand(this->result);
1202 src2.abs = mul_abs;
1203 if (mul_abs)
1204 src2.negate = false;
1205
1206 this->result = src_reg(this, ir->type);
1207 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1208
1209 return true;
1210 }
1211
1212 bool
1213 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1214 {
1215 /* This optimization relies on CMP setting the destination to 0 when
1216 * false. Early hardware only sets the least significant bit, and
1217 * leaves the other bits undefined. So we can't use it.
1218 */
1219 if (brw->gen < 6)
1220 return false;
1221
1222 ir_expression *const cmp = ir->operands[0]->as_expression();
1223
1224 if (cmp == NULL)
1225 return false;
1226
1227 switch (cmp->operation) {
1228 case ir_binop_less:
1229 case ir_binop_greater:
1230 case ir_binop_lequal:
1231 case ir_binop_gequal:
1232 case ir_binop_equal:
1233 case ir_binop_nequal:
1234 break;
1235
1236 default:
1237 return false;
1238 }
1239
1240 cmp->operands[0]->accept(this);
1241 const src_reg cmp_src0 = this->result;
1242
1243 cmp->operands[1]->accept(this);
1244 const src_reg cmp_src1 = this->result;
1245
1246 this->result = src_reg(this, ir->type);
1247
1248 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1249 brw_conditional_for_comparison(cmp->operation)));
1250
1251 /* If the comparison is false, this->result will just happen to be zero.
1252 */
1253 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1254 this->result, src_reg(1.0f));
1255 inst->predicate = BRW_PREDICATE_NORMAL;
1256 inst->predicate_inverse = true;
1257
1258 return true;
1259 }
1260
1261 void
1262 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1263 src_reg src0, src_reg src1)
1264 {
1265 vec4_instruction *inst;
1266
1267 if (brw->gen >= 6) {
1268 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1269 inst->conditional_mod = conditionalmod;
1270 } else {
1271 emit(CMP(dst, src0, src1, conditionalmod));
1272
1273 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275 }
1276 }
1277
1278 void
1279 vec4_visitor::emit_lrp(const dst_reg &dst,
1280 const src_reg &x, const src_reg &y, const src_reg &a)
1281 {
1282 if (brw->gen >= 6) {
1283 /* Note that the instruction's argument order is reversed from GLSL
1284 * and the IR.
1285 */
1286 emit(LRP(dst,
1287 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1288 } else {
1289 /* Earlier generations don't support three source operations, so we
1290 * need to emit x*(1-a) + y*a.
1291 */
1292 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1293 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1294 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1295 y_times_a.writemask = dst.writemask;
1296 one_minus_a.writemask = dst.writemask;
1297 x_times_one_minus_a.writemask = dst.writemask;
1298
1299 emit(MUL(y_times_a, y, a));
1300 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1301 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1302 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1303 }
1304 }
1305
1306 void
1307 vec4_visitor::visit(ir_expression *ir)
1308 {
1309 unsigned int operand;
1310 src_reg op[Elements(ir->operands)];
1311 vec4_instruction *inst;
1312
1313 if (ir->operation == ir_binop_add) {
1314 if (try_emit_mad(ir))
1315 return;
1316 }
1317
1318 if (ir->operation == ir_unop_b2f) {
1319 if (try_emit_b2f_of_compare(ir))
1320 return;
1321 }
1322
1323 /* Storage for our result. Ideally for an assignment we'd be using
1324 * the actual storage for the result here, instead.
1325 */
1326 dst_reg result_dst(this, ir->type);
1327 src_reg result_src(result_dst);
1328
1329 if (ir->operation == ir_triop_csel) {
1330 ir->operands[1]->accept(this);
1331 op[1] = this->result;
1332 ir->operands[2]->accept(this);
1333 op[2] = this->result;
1334
1335 enum brw_predicate predicate;
1336 emit_bool_to_cond_code(ir->operands[0], &predicate);
1337 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1338 inst->predicate = predicate;
1339 this->result = result_src;
1340 return;
1341 }
1342
1343 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1344 this->result.file = BAD_FILE;
1345 ir->operands[operand]->accept(this);
1346 if (this->result.file == BAD_FILE) {
1347 fprintf(stderr, "Failed to get tree for expression operand:\n");
1348 ir->operands[operand]->fprint(stderr);
1349 exit(1);
1350 }
1351 op[operand] = this->result;
1352
1353 /* Matrix expression operands should have been broken down to vector
1354 * operations already.
1355 */
1356 assert(!ir->operands[operand]->type->is_matrix());
1357 }
1358
1359 /* If nothing special happens, this is the result. */
1360 this->result = result_src;
1361
1362 switch (ir->operation) {
1363 case ir_unop_logic_not:
1364 emit(NOT(result_dst, op[0]));
1365 break;
1366 case ir_unop_neg:
1367 op[0].negate = !op[0].negate;
1368 emit(MOV(result_dst, op[0]));
1369 break;
1370 case ir_unop_abs:
1371 op[0].abs = true;
1372 op[0].negate = false;
1373 emit(MOV(result_dst, op[0]));
1374 break;
1375
1376 case ir_unop_sign:
1377 if (ir->type->is_float()) {
1378 /* AND(val, 0x80000000) gives the sign bit.
1379 *
1380 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1381 * zero.
1382 */
1383 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1384
1385 op[0].type = BRW_REGISTER_TYPE_UD;
1386 result_dst.type = BRW_REGISTER_TYPE_UD;
1387 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1388
1389 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1390 inst->predicate = BRW_PREDICATE_NORMAL;
1391
1392 this->result.type = BRW_REGISTER_TYPE_F;
1393 } else {
1394 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1395 * -> non-negative val generates 0x00000000.
1396 * Predicated OR sets 1 if val is positive.
1397 */
1398 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1399
1400 emit(ASR(result_dst, op[0], src_reg(31)));
1401
1402 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1403 inst->predicate = BRW_PREDICATE_NORMAL;
1404 }
1405 break;
1406
1407 case ir_unop_rcp:
1408 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1409 break;
1410
1411 case ir_unop_exp2:
1412 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1413 break;
1414 case ir_unop_log2:
1415 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1416 break;
1417 case ir_unop_exp:
1418 case ir_unop_log:
1419 unreachable("not reached: should be handled by ir_explog_to_explog2");
1420 case ir_unop_sin:
1421 case ir_unop_sin_reduced:
1422 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1423 break;
1424 case ir_unop_cos:
1425 case ir_unop_cos_reduced:
1426 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1427 break;
1428
1429 case ir_unop_dFdx:
1430 case ir_unop_dFdx_coarse:
1431 case ir_unop_dFdx_fine:
1432 case ir_unop_dFdy:
1433 case ir_unop_dFdy_coarse:
1434 case ir_unop_dFdy_fine:
1435 unreachable("derivatives not valid in vertex shader");
1436
1437 case ir_unop_bitfield_reverse:
1438 emit(BFREV(result_dst, op[0]));
1439 break;
1440 case ir_unop_bit_count:
1441 emit(CBIT(result_dst, op[0]));
1442 break;
1443 case ir_unop_find_msb: {
1444 src_reg temp = src_reg(this, glsl_type::uint_type);
1445
1446 inst = emit(FBH(dst_reg(temp), op[0]));
1447 inst->dst.writemask = WRITEMASK_XYZW;
1448
1449 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1450 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1451 * subtract the result from 31 to convert the MSB count into an LSB count.
1452 */
1453
1454 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1455 temp.swizzle = BRW_SWIZZLE_NOOP;
1456 emit(MOV(result_dst, temp));
1457
1458 src_reg src_tmp = src_reg(result_dst);
1459 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1460
1461 src_tmp.negate = true;
1462 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1463 inst->predicate = BRW_PREDICATE_NORMAL;
1464 break;
1465 }
1466 case ir_unop_find_lsb:
1467 emit(FBL(result_dst, op[0]));
1468 break;
1469 case ir_unop_saturate:
1470 inst = emit(MOV(result_dst, op[0]));
1471 inst->saturate = true;
1472 break;
1473
1474 case ir_unop_noise:
1475 unreachable("not reached: should be handled by lower_noise");
1476
1477 case ir_binop_add:
1478 emit(ADD(result_dst, op[0], op[1]));
1479 break;
1480 case ir_binop_sub:
1481 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1482
1483 case ir_binop_mul:
1484 if (brw->gen < 8 && ir->type->is_integer()) {
1485 /* For integer multiplication, the MUL uses the low 16 bits of one of
1486 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1487 * accumulates in the contribution of the upper 16 bits of that
1488 * operand. If we can determine that one of the args is in the low
1489 * 16 bits, though, we can just emit a single MUL.
1490 */
1491 if (ir->operands[0]->is_uint16_constant()) {
1492 if (brw->gen < 7)
1493 emit(MUL(result_dst, op[0], op[1]));
1494 else
1495 emit(MUL(result_dst, op[1], op[0]));
1496 } else if (ir->operands[1]->is_uint16_constant()) {
1497 if (brw->gen < 7)
1498 emit(MUL(result_dst, op[1], op[0]));
1499 else
1500 emit(MUL(result_dst, op[0], op[1]));
1501 } else {
1502 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1503
1504 emit(MUL(acc, op[0], op[1]));
1505 emit(MACH(dst_null_d(), op[0], op[1]));
1506 emit(MOV(result_dst, src_reg(acc)));
1507 }
1508 } else {
1509 emit(MUL(result_dst, op[0], op[1]));
1510 }
1511 break;
1512 case ir_binop_imul_high: {
1513 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1514
1515 emit(MUL(acc, op[0], op[1]));
1516 emit(MACH(result_dst, op[0], op[1]));
1517 break;
1518 }
1519 case ir_binop_div:
1520 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1521 assert(ir->type->is_integer());
1522 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1523 break;
1524 case ir_binop_carry: {
1525 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1526
1527 emit(ADDC(dst_null_ud(), op[0], op[1]));
1528 emit(MOV(result_dst, src_reg(acc)));
1529 break;
1530 }
1531 case ir_binop_borrow: {
1532 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1533
1534 emit(SUBB(dst_null_ud(), op[0], op[1]));
1535 emit(MOV(result_dst, src_reg(acc)));
1536 break;
1537 }
1538 case ir_binop_mod:
1539 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1540 assert(ir->type->is_integer());
1541 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1542 break;
1543
1544 case ir_binop_less:
1545 case ir_binop_greater:
1546 case ir_binop_lequal:
1547 case ir_binop_gequal:
1548 case ir_binop_equal:
1549 case ir_binop_nequal: {
1550 if (brw->gen <= 5) {
1551 resolve_bool_comparison(ir->operands[0], &op[0]);
1552 resolve_bool_comparison(ir->operands[1], &op[1]);
1553 }
1554 emit(CMP(result_dst, op[0], op[1],
1555 brw_conditional_for_comparison(ir->operation)));
1556 break;
1557 }
1558
1559 case ir_binop_all_equal:
1560 /* "==" operator producing a scalar boolean. */
1561 if (ir->operands[0]->type->is_vector() ||
1562 ir->operands[1]->type->is_vector()) {
1563 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1564 emit(MOV(result_dst, src_reg(0)));
1565 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1566 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1567 } else {
1568 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1569 }
1570 break;
1571 case ir_binop_any_nequal:
1572 /* "!=" operator producing a scalar boolean. */
1573 if (ir->operands[0]->type->is_vector() ||
1574 ir->operands[1]->type->is_vector()) {
1575 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1576
1577 emit(MOV(result_dst, src_reg(0)));
1578 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1579 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1580 } else {
1581 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1582 }
1583 break;
1584
1585 case ir_unop_any:
1586 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1587 emit(MOV(result_dst, src_reg(0)));
1588
1589 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1590 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1591 break;
1592
1593 case ir_binop_logic_xor:
1594 emit(XOR(result_dst, op[0], op[1]));
1595 break;
1596
1597 case ir_binop_logic_or:
1598 emit(OR(result_dst, op[0], op[1]));
1599 break;
1600
1601 case ir_binop_logic_and:
1602 emit(AND(result_dst, op[0], op[1]));
1603 break;
1604
1605 case ir_binop_dot:
1606 assert(ir->operands[0]->type->is_vector());
1607 assert(ir->operands[0]->type == ir->operands[1]->type);
1608 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1609 break;
1610
1611 case ir_unop_sqrt:
1612 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1613 break;
1614 case ir_unop_rsq:
1615 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1616 break;
1617
1618 case ir_unop_bitcast_i2f:
1619 case ir_unop_bitcast_u2f:
1620 this->result = op[0];
1621 this->result.type = BRW_REGISTER_TYPE_F;
1622 break;
1623
1624 case ir_unop_bitcast_f2i:
1625 this->result = op[0];
1626 this->result.type = BRW_REGISTER_TYPE_D;
1627 break;
1628
1629 case ir_unop_bitcast_f2u:
1630 this->result = op[0];
1631 this->result.type = BRW_REGISTER_TYPE_UD;
1632 break;
1633
1634 case ir_unop_i2f:
1635 case ir_unop_i2u:
1636 case ir_unop_u2i:
1637 case ir_unop_u2f:
1638 case ir_unop_f2i:
1639 case ir_unop_f2u:
1640 emit(MOV(result_dst, op[0]));
1641 break;
1642 case ir_unop_b2i:
1643 emit(AND(result_dst, op[0], src_reg(1)));
1644 break;
1645 case ir_unop_b2f:
1646 if (brw->gen <= 5) {
1647 resolve_bool_comparison(ir->operands[0], &op[0]);
1648 }
1649 op[0].type = BRW_REGISTER_TYPE_D;
1650 result_dst.type = BRW_REGISTER_TYPE_D;
1651 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1652 result_dst.type = BRW_REGISTER_TYPE_F;
1653 break;
1654 case ir_unop_f2b:
1655 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1656 break;
1657 case ir_unop_i2b:
1658 emit(AND(result_dst, op[0], src_reg(1)));
1659 break;
1660
1661 case ir_unop_trunc:
1662 emit(RNDZ(result_dst, op[0]));
1663 break;
1664 case ir_unop_ceil: {
1665 src_reg tmp = src_reg(this, ir->type);
1666 op[0].negate = !op[0].negate;
1667 emit(RNDD(dst_reg(tmp), op[0]));
1668 tmp.negate = true;
1669 emit(MOV(result_dst, tmp));
1670 }
1671 break;
1672 case ir_unop_floor:
1673 inst = emit(RNDD(result_dst, op[0]));
1674 break;
1675 case ir_unop_fract:
1676 inst = emit(FRC(result_dst, op[0]));
1677 break;
1678 case ir_unop_round_even:
1679 emit(RNDE(result_dst, op[0]));
1680 break;
1681
1682 case ir_binop_min:
1683 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1684 break;
1685 case ir_binop_max:
1686 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1687 break;
1688
1689 case ir_binop_pow:
1690 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1691 break;
1692
1693 case ir_unop_bit_not:
1694 inst = emit(NOT(result_dst, op[0]));
1695 break;
1696 case ir_binop_bit_and:
1697 inst = emit(AND(result_dst, op[0], op[1]));
1698 break;
1699 case ir_binop_bit_xor:
1700 inst = emit(XOR(result_dst, op[0], op[1]));
1701 break;
1702 case ir_binop_bit_or:
1703 inst = emit(OR(result_dst, op[0], op[1]));
1704 break;
1705
1706 case ir_binop_lshift:
1707 inst = emit(SHL(result_dst, op[0], op[1]));
1708 break;
1709
1710 case ir_binop_rshift:
1711 if (ir->type->base_type == GLSL_TYPE_INT)
1712 inst = emit(ASR(result_dst, op[0], op[1]));
1713 else
1714 inst = emit(SHR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_bfm:
1718 emit(BFI1(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_ubo_load: {
1722 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1723 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1724 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1725 src_reg offset;
1726
1727 /* Now, load the vector from that offset. */
1728 assert(ir->type->is_vector() || ir->type->is_scalar());
1729
1730 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1731 packed_consts.type = result.type;
1732 src_reg surf_index;
1733
1734 if (const_uniform_block) {
1735 /* The block index is a constant, so just emit the binding table entry
1736 * as an immediate.
1737 */
1738 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1739 const_uniform_block->value.u[0]);
1740 } else {
1741 /* The block index is not a constant. Evaluate the index expression
1742 * per-channel and add the base UBO index; the generator will select
1743 * a value from any live channel.
1744 */
1745 surf_index = src_reg(this, glsl_type::uint_type);
1746 emit(ADD(dst_reg(surf_index), op[0],
1747 src_reg(prog_data->base.binding_table.ubo_start)));
1748
1749 /* Assume this may touch any UBO. It would be nice to provide
1750 * a tighter bound, but the array information is already lowered away.
1751 */
1752 brw_mark_surface_used(&prog_data->base,
1753 prog_data->base.binding_table.ubo_start +
1754 shader_prog->NumUniformBlocks - 1);
1755 }
1756
1757 if (const_offset_ir) {
1758 if (brw->gen >= 8) {
1759 /* Store the offset in a GRF so we can send-from-GRF. */
1760 offset = src_reg(this, glsl_type::int_type);
1761 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1762 } else {
1763 /* Immediates are fine on older generations since they'll be moved
1764 * to a (potentially fake) MRF at the generator level.
1765 */
1766 offset = src_reg(const_offset / 16);
1767 }
1768 } else {
1769 offset = src_reg(this, glsl_type::uint_type);
1770 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1771 }
1772
1773 if (brw->gen >= 7) {
1774 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1775 grf_offset.type = offset.type;
1776
1777 emit(MOV(grf_offset, offset));
1778
1779 vec4_instruction *pull =
1780 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1781 dst_reg(packed_consts),
1782 surf_index,
1783 src_reg(grf_offset)));
1784 pull->mlen = 1;
1785 } else {
1786 vec4_instruction *pull =
1787 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1788 dst_reg(packed_consts),
1789 surf_index,
1790 offset));
1791 pull->base_mrf = 14;
1792 pull->mlen = 1;
1793 }
1794
1795 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1796 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1797 const_offset % 16 / 4,
1798 const_offset % 16 / 4,
1799 const_offset % 16 / 4);
1800
1801 /* UBO bools are any nonzero int. We need to convert them to use the
1802 * value of true stored in ctx->Const.UniformBooleanTrue.
1803 */
1804 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1805 emit(CMP(result_dst, packed_consts, src_reg(0u),
1806 BRW_CONDITIONAL_NZ));
1807 } else {
1808 emit(MOV(result_dst, packed_consts));
1809 }
1810 break;
1811 }
1812
1813 case ir_binop_vector_extract:
1814 unreachable("should have been lowered by vec_index_to_cond_assign");
1815
1816 case ir_triop_fma:
1817 op[0] = fix_3src_operand(op[0]);
1818 op[1] = fix_3src_operand(op[1]);
1819 op[2] = fix_3src_operand(op[2]);
1820 /* Note that the instruction's argument order is reversed from GLSL
1821 * and the IR.
1822 */
1823 emit(MAD(result_dst, op[2], op[1], op[0]));
1824 break;
1825
1826 case ir_triop_lrp:
1827 emit_lrp(result_dst, op[0], op[1], op[2]);
1828 break;
1829
1830 case ir_triop_csel:
1831 unreachable("already handled above");
1832 break;
1833
1834 case ir_triop_bfi:
1835 op[0] = fix_3src_operand(op[0]);
1836 op[1] = fix_3src_operand(op[1]);
1837 op[2] = fix_3src_operand(op[2]);
1838 emit(BFI2(result_dst, op[0], op[1], op[2]));
1839 break;
1840
1841 case ir_triop_bitfield_extract:
1842 op[0] = fix_3src_operand(op[0]);
1843 op[1] = fix_3src_operand(op[1]);
1844 op[2] = fix_3src_operand(op[2]);
1845 /* Note that the instruction's argument order is reversed from GLSL
1846 * and the IR.
1847 */
1848 emit(BFE(result_dst, op[2], op[1], op[0]));
1849 break;
1850
1851 case ir_triop_vector_insert:
1852 unreachable("should have been lowered by lower_vector_insert");
1853
1854 case ir_quadop_bitfield_insert:
1855 unreachable("not reached: should be handled by "
1856 "bitfield_insert_to_bfm_bfi\n");
1857
1858 case ir_quadop_vector:
1859 unreachable("not reached: should be handled by lower_quadop_vector");
1860
1861 case ir_unop_pack_half_2x16:
1862 emit_pack_half_2x16(result_dst, op[0]);
1863 break;
1864 case ir_unop_unpack_half_2x16:
1865 emit_unpack_half_2x16(result_dst, op[0]);
1866 break;
1867 case ir_unop_unpack_unorm_4x8:
1868 emit_unpack_unorm_4x8(result_dst, op[0]);
1869 break;
1870 case ir_unop_unpack_snorm_4x8:
1871 emit_unpack_snorm_4x8(result_dst, op[0]);
1872 break;
1873 case ir_unop_pack_unorm_4x8:
1874 emit_pack_unorm_4x8(result_dst, op[0]);
1875 break;
1876 case ir_unop_pack_snorm_4x8:
1877 emit_pack_snorm_4x8(result_dst, op[0]);
1878 break;
1879 case ir_unop_pack_snorm_2x16:
1880 case ir_unop_pack_unorm_2x16:
1881 case ir_unop_unpack_snorm_2x16:
1882 case ir_unop_unpack_unorm_2x16:
1883 unreachable("not reached: should be handled by lower_packing_builtins");
1884 case ir_unop_unpack_half_2x16_split_x:
1885 case ir_unop_unpack_half_2x16_split_y:
1886 case ir_binop_pack_half_2x16_split:
1887 case ir_unop_interpolate_at_centroid:
1888 case ir_binop_interpolate_at_sample:
1889 case ir_binop_interpolate_at_offset:
1890 unreachable("not reached: should not occur in vertex shader");
1891 case ir_binop_ldexp:
1892 unreachable("not reached: should be handled by ldexp_to_arith()");
1893 case ir_unop_d2f:
1894 case ir_unop_f2d:
1895 case ir_unop_d2i:
1896 case ir_unop_i2d:
1897 case ir_unop_d2u:
1898 case ir_unop_u2d:
1899 case ir_unop_d2b:
1900 case ir_unop_pack_double_2x32:
1901 case ir_unop_unpack_double_2x32:
1902 case ir_unop_frexp_sig:
1903 case ir_unop_frexp_exp:
1904 unreachable("fp64 todo");
1905 }
1906 }
1907
1908
1909 void
1910 vec4_visitor::visit(ir_swizzle *ir)
1911 {
1912 src_reg src;
1913 int i = 0;
1914 int swizzle[4];
1915
1916 /* Note that this is only swizzles in expressions, not those on the left
1917 * hand side of an assignment, which do write masking. See ir_assignment
1918 * for that.
1919 */
1920
1921 ir->val->accept(this);
1922 src = this->result;
1923 assert(src.file != BAD_FILE);
1924
1925 for (i = 0; i < ir->type->vector_elements; i++) {
1926 switch (i) {
1927 case 0:
1928 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1929 break;
1930 case 1:
1931 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1932 break;
1933 case 2:
1934 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1935 break;
1936 case 3:
1937 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1938 break;
1939 }
1940 }
1941 for (; i < 4; i++) {
1942 /* Replicate the last channel out. */
1943 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1944 }
1945
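   /* For illustration: applying ".xy" to a source whose swizzle is already
    * WZYX composes to (W, Z), and the replication above fills the remaining
    * channels with Z, giving BRW_SWIZZLE4(W, Z, Z, Z).
    */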
1946 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1947
1948 this->result = src;
1949 }
1950
1951 void
1952 vec4_visitor::visit(ir_dereference_variable *ir)
1953 {
1954 const struct glsl_type *type = ir->type;
1955 dst_reg *reg = variable_storage(ir->var);
1956
1957 if (!reg) {
1958 fail("Failed to find variable storage for %s\n", ir->var->name);
1959 this->result = src_reg(brw_null_reg());
1960 return;
1961 }
1962
1963 this->result = src_reg(*reg);
1964
1965 /* System values get their swizzle from the dst_reg writemask */
1966 if (ir->var->data.mode == ir_var_system_value)
1967 return;
1968
1969 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1970 this->result.swizzle = swizzle_for_size(type->vector_elements);
1971 }
1972
1973
1974 int
1975 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1976 {
1977 /* Under normal circumstances array elements are stored consecutively, so
1978 * the stride is equal to the size of the array element.
1979 */
1980 return type_size(ir->type);
1981 }
1982
1983
1984 void
1985 vec4_visitor::visit(ir_dereference_array *ir)
1986 {
1987 ir_constant *constant_index;
1988 src_reg src;
1989 int array_stride = compute_array_stride(ir);
1990
1991 constant_index = ir->array_index->constant_expression_value();
1992
1993 ir->array->accept(this);
1994 src = this->result;
1995
1996 if (constant_index) {
1997 src.reg_offset += constant_index->value.i[0] * array_stride;
1998 } else {
1999 /* Variable index array dereference. It eats the "vec4" of the
2000 * base of the array and an index that offsets the Mesa register
2001 * index.
2002 */
2003 ir->array_index->accept(this);
2004
2005 src_reg index_reg;
2006
2007 if (array_stride == 1) {
2008 index_reg = this->result;
2009 } else {
2010 index_reg = src_reg(this, glsl_type::int_type);
2011
2012 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
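         /* Illustrative example: for an array of mat4 the per-element stride
          * is 4 registers, so an index of 2 scales to a register offset of 8.
          */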
2013 }
2014
2015 if (src.reladdr) {
2016 src_reg temp = src_reg(this, glsl_type::int_type);
2017
2018 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2019
2020 index_reg = temp;
2021 }
2022
2023 src.reladdr = ralloc(mem_ctx, src_reg);
2024 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2025 }
2026
2027 /* If the type is smaller than a vec4, replicate the last channel out. */
2028 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2029 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2030 else
2031 src.swizzle = BRW_SWIZZLE_NOOP;
2032 src.type = brw_type_for_base_type(ir->type);
2033
2034 this->result = src;
2035 }
2036
2037 void
2038 vec4_visitor::visit(ir_dereference_record *ir)
2039 {
2040 unsigned int i;
2041 const glsl_type *struct_type = ir->record->type;
2042 int offset = 0;
2043
2044 ir->record->accept(this);
2045
2046 for (i = 0; i < struct_type->length; i++) {
2047 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2048 break;
2049 offset += type_size(struct_type->fields.structure[i].type);
2050 }
2051
2052 /* If the type is smaller than a vec4, replicate the last channel out. */
2053 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2054 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2055 else
2056 this->result.swizzle = BRW_SWIZZLE_NOOP;
2057 this->result.type = brw_type_for_base_type(ir->type);
2058
2059 this->result.reg_offset += offset;
2060 }
2061
2062 /**
2063 * We want to be careful in assignment setup to hit the actual storage
2064 * instead of potentially using a temporary like we might with the
2065 * ir_dereference handler.
2066 */
2067 static dst_reg
2068 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2069 {
2070 /* The LHS must be a dereference. If the LHS is a variable indexed array
2071     * access of a vector, it must be separated into a series of conditional
2072     * moves before reaching this point (see ir_vec_index_to_cond_assign).
2073 */
2074 assert(ir->as_dereference());
2075 ir_dereference_array *deref_array = ir->as_dereference_array();
2076 if (deref_array) {
2077 assert(!deref_array->array->type->is_vector());
2078 }
2079
2080 /* Use the rvalue deref handler for the most part. We'll ignore
2081 * swizzles in it and write swizzles using writemask, though.
2082 */
2083 ir->accept(v);
2084 return dst_reg(v->result);
2085 }
2086
2087 void
2088 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2089 const struct glsl_type *type,
2090 enum brw_predicate predicate)
2091 {
2092 if (type->base_type == GLSL_TYPE_STRUCT) {
2093 for (unsigned int i = 0; i < type->length; i++) {
2094 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2095 }
2096 return;
2097 }
2098
2099 if (type->is_array()) {
2100 for (unsigned int i = 0; i < type->length; i++) {
2101 emit_block_move(dst, src, type->fields.array, predicate);
2102 }
2103 return;
2104 }
2105
2106 if (type->is_matrix()) {
2107 const struct glsl_type *vec_type;
2108
2109 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2110 type->vector_elements, 1);
2111
2112 for (int i = 0; i < type->matrix_columns; i++) {
2113 emit_block_move(dst, src, vec_type, predicate);
2114 }
2115 return;
2116 }
2117
2118 assert(type->is_scalar() || type->is_vector());
2119
2120 dst->type = brw_type_for_base_type(type);
2121 src->type = dst->type;
2122
2123 dst->writemask = (1 << type->vector_elements) - 1;
2124
2125 src->swizzle = swizzle_for_size(type->vector_elements);
2126
2127 vec4_instruction *inst = emit(MOV(*dst, *src));
2128 inst->predicate = predicate;
2129
2130 dst->reg_offset++;
2131 src->reg_offset++;
2132 }
2133
2134
2135 /* If the RHS processing resulted in an instruction generating a
2136 * temporary value, and it would be easy to rewrite the instruction to
2137 * generate its result right into the LHS instead, do so. This ends
2138 * up reliably removing instructions where it can be tricky to do so
2139 * later without real UD chain information.
2140 */
2141 bool
2142 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2143 dst_reg dst,
2144 src_reg src,
2145 vec4_instruction *pre_rhs_inst,
2146 vec4_instruction *last_rhs_inst)
2147 {
2148 /* This could be supported, but it would take more smarts. */
2149 if (ir->condition)
2150 return false;
2151
2152 if (pre_rhs_inst == last_rhs_inst)
2153 return false; /* No instructions generated to work with. */
2154
2155 /* Make sure the last instruction generated our source reg. */
2156 if (src.file != GRF ||
2157 src.file != last_rhs_inst->dst.file ||
2158 src.reg != last_rhs_inst->dst.reg ||
2159 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2160 src.reladdr ||
2161 src.abs ||
2162 src.negate ||
2163 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2164 return false;
2165
2166     /* Check that the last instruction fully initialized the channels
2167 * we want to use, in the order we want to use them. We could
2168 * potentially reswizzle the operands of many instructions so that
2169 * we could handle out of order channels, but don't yet.
2170 */
2171
2172 for (unsigned i = 0; i < 4; i++) {
2173 if (dst.writemask & (1 << i)) {
2174 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2175 return false;
2176
2177 if (BRW_GET_SWZ(src.swizzle, i) != i)
2178 return false;
2179 }
2180 }
2181
2182 /* Success! Rewrite the instruction. */
2183 last_rhs_inst->dst.file = dst.file;
2184 last_rhs_inst->dst.reg = dst.reg;
2185 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2186 last_rhs_inst->dst.reladdr = dst.reladdr;
2187 last_rhs_inst->dst.writemask &= dst.writemask;
2188
2189 return true;
2190 }
2191
2192 void
2193 vec4_visitor::visit(ir_assignment *ir)
2194 {
2195 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2196 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2197
2198 if (!ir->lhs->type->is_scalar() &&
2199 !ir->lhs->type->is_vector()) {
2200 ir->rhs->accept(this);
2201 src_reg src = this->result;
2202
2203 if (ir->condition) {
2204 emit_bool_to_cond_code(ir->condition, &predicate);
2205 }
2206
2207 /* emit_block_move doesn't account for swizzles in the source register.
2208 * This should be ok, since the source register is a structure or an
2209 * array, and those can't be swizzled. But double-check to be sure.
2210 */
2211 assert(src.swizzle ==
2212 (ir->rhs->type->is_matrix()
2213 ? swizzle_for_size(ir->rhs->type->vector_elements)
2214 : BRW_SWIZZLE_NOOP));
2215
2216 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2217 return;
2218 }
2219
2220 /* Now we're down to just a scalar/vector with writemasks. */
2221 int i;
2222
2223 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2224 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2225
2226 ir->rhs->accept(this);
2227
2228 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2229
2230 src_reg src = this->result;
2231
2232 int swizzles[4];
2233 int first_enabled_chan = 0;
2234 int src_chan = 0;
2235
2236 assert(ir->lhs->type->is_vector() ||
2237 ir->lhs->type->is_scalar());
2238 dst.writemask = ir->write_mask;
2239
2240 for (int i = 0; i < 4; i++) {
2241 if (dst.writemask & (1 << i)) {
2242 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2243 break;
2244 }
2245 }
2246
2247 /* Swizzle a small RHS vector into the channels being written.
2248 *
2249     * GLSL IR treats write_mask as dictating how many channels are
2250     * present on the RHS, while in our instructions we need to make
2251 * those channels appear in the slots of the vec4 they're written to.
2252 */
2253 for (int i = 0; i < 4; i++) {
2254 if (dst.writemask & (1 << i))
2255 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2256 else
2257 swizzles[i] = first_enabled_chan;
2258 }
2259 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2260 swizzles[2], swizzles[3]);
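   /* For illustration: a vec2 RHS (swizzle XYYY) written with write_mask .yz
    * yields the swizzle (Y, X, Y, Y), so RHS .x lands in dst.y and RHS .y in
    * dst.z, while the disabled channels just repeat a defined component.
    */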
2261
2262 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2263 return;
2264 }
2265
2266 if (ir->condition) {
2267 emit_bool_to_cond_code(ir->condition, &predicate);
2268 }
2269
2270 for (i = 0; i < type_size(ir->lhs->type); i++) {
2271 vec4_instruction *inst = emit(MOV(dst, src));
2272 inst->predicate = predicate;
2273
2274 dst.reg_offset++;
2275 src.reg_offset++;
2276 }
2277 }
2278
2279 void
2280 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2281 {
2282 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2283 foreach_in_list(ir_constant, field_value, &ir->components) {
2284 emit_constant_values(dst, field_value);
2285 }
2286 return;
2287 }
2288
2289 if (ir->type->is_array()) {
2290 for (unsigned int i = 0; i < ir->type->length; i++) {
2291 emit_constant_values(dst, ir->array_elements[i]);
2292 }
2293 return;
2294 }
2295
2296 if (ir->type->is_matrix()) {
2297 for (int i = 0; i < ir->type->matrix_columns; i++) {
2298 float *vec = &ir->value.f[i * ir->type->vector_elements];
2299
2300 for (int j = 0; j < ir->type->vector_elements; j++) {
2301 dst->writemask = 1 << j;
2302 dst->type = BRW_REGISTER_TYPE_F;
2303
2304 emit(MOV(*dst, src_reg(vec[j])));
2305 }
2306 dst->reg_offset++;
2307 }
2308 return;
2309 }
2310
2311 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2312
2313 for (int i = 0; i < ir->type->vector_elements; i++) {
2314 if (!(remaining_writemask & (1 << i)))
2315 continue;
2316
2317 dst->writemask = 1 << i;
2318 dst->type = brw_type_for_base_type(ir->type);
2319
2320 /* Find other components that match the one we're about to
2321 * write. Emits fewer instructions for things like vec4(0.5,
2322 * 1.5, 1.5, 1.5).
2323 */
2324 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2325 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2326 if (ir->value.b[i] == ir->value.b[j])
2327 dst->writemask |= (1 << j);
2328 } else {
2329 /* u, i, and f storage all line up, so no need for a
2330 * switch case for comparing each type.
2331 */
2332 if (ir->value.u[i] == ir->value.u[j])
2333 dst->writemask |= (1 << j);
2334 }
2335 }
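      /* With the vec4(0.5, 1.5, 1.5, 1.5) example above, this coalescing
       * emits two MOVs (writemask .x and writemask .yzw) rather than four.
       */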
2336
2337 switch (ir->type->base_type) {
2338 case GLSL_TYPE_FLOAT:
2339 emit(MOV(*dst, src_reg(ir->value.f[i])));
2340 break;
2341 case GLSL_TYPE_INT:
2342 emit(MOV(*dst, src_reg(ir->value.i[i])));
2343 break;
2344 case GLSL_TYPE_UINT:
2345 emit(MOV(*dst, src_reg(ir->value.u[i])));
2346 break;
2347 case GLSL_TYPE_BOOL:
2348 emit(MOV(*dst,
2349 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2350 : 0)));
2351 break;
2352 default:
2353 unreachable("Non-float/uint/int/bool constant");
2354 }
2355
2356 remaining_writemask &= ~dst->writemask;
2357 }
2358 dst->reg_offset++;
2359 }
2360
2361 void
2362 vec4_visitor::visit(ir_constant *ir)
2363 {
2364 dst_reg dst = dst_reg(this, ir->type);
2365 this->result = src_reg(dst);
2366
2367 emit_constant_values(&dst, ir);
2368 }
2369
2370 void
2371 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2372 {
2373 ir_dereference *deref = static_cast<ir_dereference *>(
2374 ir->actual_parameters.get_head());
2375 ir_variable *location = deref->variable_referenced();
2376 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2377 location->data.binding);
2378
2379 /* Calculate the surface offset */
2380 src_reg offset(this, glsl_type::uint_type);
2381 ir_dereference_array *deref_array = deref->as_dereference_array();
2382 if (deref_array) {
2383 deref_array->array_index->accept(this);
2384
2385 src_reg tmp(this, glsl_type::uint_type);
2386 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2387 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2388 } else {
2389 offset = location->data.atomic.offset;
2390 }
2391
2392 /* Emit the appropriate machine instruction */
2393 const char *callee = ir->callee->function_name();
2394 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2395
2396 if (!strcmp("__intrinsic_atomic_read", callee)) {
2397 emit_untyped_surface_read(surf_index, dst, offset);
2398
2399 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2400 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2401 src_reg(), src_reg());
2402
2403 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2404 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2405 src_reg(), src_reg());
2406 }
2407 }
2408
2409 void
2410 vec4_visitor::visit(ir_call *ir)
2411 {
2412 const char *callee = ir->callee->function_name();
2413
2414 if (!strcmp("__intrinsic_atomic_read", callee) ||
2415 !strcmp("__intrinsic_atomic_increment", callee) ||
2416 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2417 visit_atomic_counter_intrinsic(ir);
2418 } else {
2419 unreachable("Unsupported intrinsic.");
2420 }
2421 }
2422
2423 src_reg
2424 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2425 {
2426 vec4_instruction *inst =
2427 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2428 dst_reg(this, glsl_type::uvec4_type));
2429 inst->base_mrf = 2;
2430 inst->mlen = 1;
2431 inst->src[1] = sampler;
2432
2433    /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2434 int param_base = inst->base_mrf;
2435 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2436 int zero_mask = 0xf & ~coord_mask;
2437
2438 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2439 coordinate));
2440
2441 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2442 src_reg(0)));
2443
2444 emit(inst);
2445 return src_reg(inst->dst);
2446 }
2447
2448 static bool
2449 is_high_sampler(struct brw_context *brw, src_reg sampler)
2450 {
2451 if (brw->gen < 8 && !brw->is_haswell)
2452 return false;
2453
2454 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_texture *ir)
2459 {
2460 uint32_t sampler =
2461 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2462
2463 ir_rvalue *nonconst_sampler_index =
2464 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2465
2466 /* Handle non-constant sampler array indexing */
2467 src_reg sampler_reg;
2468 if (nonconst_sampler_index) {
2469 /* The highest sampler which may be used by this operation is
2470 * the last element of the array. Mark it here, because the generator
2471 * doesn't have enough information to determine the bound.
2472 */
2473 uint32_t array_size = ir->sampler->as_dereference_array()
2474 ->array->type->array_size();
2475
2476 uint32_t max_used = sampler + array_size - 1;
2477 if (ir->op == ir_tg4 && brw->gen < 8) {
2478 max_used += prog_data->base.binding_table.gather_texture_start;
2479 } else {
2480 max_used += prog_data->base.binding_table.texture_start;
2481 }
2482
2483 brw_mark_surface_used(&prog_data->base, max_used);
2484
2485 /* Emit code to evaluate the actual indexing expression */
2486 nonconst_sampler_index->accept(this);
2487 dst_reg temp(this, glsl_type::uint_type);
2488 emit(ADD(temp, this->result, src_reg(sampler)))
2489 ->force_writemask_all = true;
2490 sampler_reg = src_reg(temp);
2491 } else {
2492 /* Single sampler, or constant array index; the indexing expression
2493 * is just an immediate.
2494 */
2495 sampler_reg = src_reg(sampler);
2496 }
2497
2498 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2499 * emitting anything other than setting up the constant result.
2500 */
2501 if (ir->op == ir_tg4) {
2502 ir_constant *chan = ir->lod_info.component->as_constant();
2503 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2504 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2505 dst_reg result(this, ir->type);
2506 this->result = src_reg(result);
2507 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2508 return;
2509 }
2510 }
2511
2512 /* Should be lowered by do_lower_texture_projection */
2513 assert(!ir->projector);
2514
2515 /* Should be lowered */
2516 assert(!ir->offset || !ir->offset->type->is_array());
2517
2518 /* Generate code to compute all the subexpression trees. This has to be
2519 * done before loading any values into MRFs for the sampler message since
2520 * generating these values may involve SEND messages that need the MRFs.
2521 */
2522 src_reg coordinate;
2523 if (ir->coordinate) {
2524 ir->coordinate->accept(this);
2525 coordinate = this->result;
2526 }
2527
2528 src_reg shadow_comparitor;
2529 if (ir->shadow_comparitor) {
2530 ir->shadow_comparitor->accept(this);
2531 shadow_comparitor = this->result;
2532 }
2533
2534 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2535 src_reg offset_value;
2536 if (has_nonconstant_offset) {
2537 ir->offset->accept(this);
2538 offset_value = src_reg(this->result);
2539 }
2540
2541 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2542 src_reg lod, dPdx, dPdy, sample_index, mcs;
2543 switch (ir->op) {
2544 case ir_tex:
2545 lod = src_reg(0.0f);
2546 lod_type = glsl_type::float_type;
2547 break;
2548 case ir_txf:
2549 case ir_txl:
2550 case ir_txs:
2551 ir->lod_info.lod->accept(this);
2552 lod = this->result;
2553 lod_type = ir->lod_info.lod->type;
2554 break;
2555 case ir_query_levels:
2556 lod = src_reg(0);
2557 lod_type = glsl_type::int_type;
2558 break;
2559 case ir_txf_ms:
2560 ir->lod_info.sample_index->accept(this);
2561 sample_index = this->result;
2562 sample_index_type = ir->lod_info.sample_index->type;
2563
2564 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2565 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2566 else
2567 mcs = src_reg(0u);
2568 break;
2569 case ir_txd:
2570 ir->lod_info.grad.dPdx->accept(this);
2571 dPdx = this->result;
2572
2573 ir->lod_info.grad.dPdy->accept(this);
2574 dPdy = this->result;
2575
2576 lod_type = ir->lod_info.grad.dPdx->type;
2577 break;
2578 case ir_txb:
2579 case ir_lod:
2580 case ir_tg4:
2581 break;
2582 }
2583
2584 enum opcode opcode;
2585 switch (ir->op) {
2586 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2587 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2588 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2589 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2590 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2591 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2592 case ir_tg4: opcode = has_nonconstant_offset
2593 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2594 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2595 case ir_txb:
2596 unreachable("TXB is not valid for vertex shaders.");
2597 case ir_lod:
2598 unreachable("LOD is not valid for vertex shaders.");
2599 default:
2600 unreachable("Unrecognized tex op");
2601 }
2602
2603 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2604 opcode, dst_reg(this, ir->type));
2605
2606 if (ir->offset != NULL && !has_nonconstant_offset) {
2607 inst->offset =
2608 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2609 ir->offset->type->vector_elements);
2610 }
2611
2612 /* Stuff the channel select bits in the top of the texture offset */
2613 if (ir->op == ir_tg4)
2614 inst->offset |= gather_channel(ir, sampler) << 16;
2615
2616 /* The message header is necessary for:
2617 * - Gen4 (always)
2618 * - Gen9+ for selecting SIMD4x2
2619 * - Texel offsets
2620 * - Gather channel selection
2621 * - Sampler indices too large to fit in a 4-bit value.
2622 */
2623 inst->header_present =
2624 brw->gen < 5 || brw->gen >= 9 ||
2625 inst->offset != 0 || ir->op == ir_tg4 ||
2626 is_high_sampler(brw, sampler_reg);
2627 inst->base_mrf = 2;
2628 inst->mlen = inst->header_present + 1; /* always at least one */
2629 inst->dst.writemask = WRITEMASK_XYZW;
2630 inst->shadow_compare = ir->shadow_comparitor != NULL;
2631
2632 inst->src[1] = sampler_reg;
2633
2634 /* MRF for the first parameter */
2635 int param_base = inst->base_mrf + inst->header_present;
2636
2637 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2638 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2639 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2640 } else {
2641 /* Load the coordinate */
2642 /* FINISHME: gl_clamp_mask and saturate */
2643 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2644 int zero_mask = 0xf & ~coord_mask;
2645
2646 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2647 coordinate));
2648
2649 if (zero_mask != 0) {
2650 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2651 src_reg(0)));
2652 }
2653 /* Load the shadow comparitor */
2654 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2655 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2656 WRITEMASK_X),
2657 shadow_comparitor));
2658 inst->mlen++;
2659 }
2660
2661 /* Load the LOD info */
2662 if (ir->op == ir_tex || ir->op == ir_txl) {
2663 int mrf, writemask;
2664 if (brw->gen >= 5) {
2665 mrf = param_base + 1;
2666 if (ir->shadow_comparitor) {
2667 writemask = WRITEMASK_Y;
2668 /* mlen already incremented */
2669 } else {
2670 writemask = WRITEMASK_X;
2671 inst->mlen++;
2672 }
2673 } else /* brw->gen == 4 */ {
2674 mrf = param_base;
2675 writemask = WRITEMASK_W;
2676 }
2677 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2678 } else if (ir->op == ir_txf) {
2679 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2680 } else if (ir->op == ir_txf_ms) {
2681 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2682 sample_index));
2683 if (brw->gen >= 7) {
2684 /* MCS data is in the first channel of `mcs`, but we need to get it into
2685 * the .y channel of the second vec4 of params, so replicate .x across
2686 * the whole vec4 and then mask off everything except .y
2687 */
2688 mcs.swizzle = BRW_SWIZZLE_XXXX;
2689 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2690 mcs));
2691 }
2692 inst->mlen++;
2693 } else if (ir->op == ir_txd) {
2694 const glsl_type *type = lod_type;
2695
2696 if (brw->gen >= 5) {
2697 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2698 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2699 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2700 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2701 inst->mlen++;
2702
2703 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2704 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2705 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2706 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2707 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2708 inst->mlen++;
2709
2710 if (ir->shadow_comparitor) {
2711 emit(MOV(dst_reg(MRF, param_base + 2,
2712 ir->shadow_comparitor->type, WRITEMASK_Z),
2713 shadow_comparitor));
2714 }
2715 }
2716 } else /* brw->gen == 4 */ {
2717 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2718 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2719 inst->mlen += 2;
2720 }
2721 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2722 if (ir->shadow_comparitor) {
2723 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2724 shadow_comparitor));
2725 }
2726
2727 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2728 offset_value));
2729 inst->mlen++;
2730 }
2731 }
2732
2733 emit(inst);
2734
2735    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2736     * faces * layers, but the spec requires just the layer count.
2737 */
2738 if (ir->op == ir_txs) {
2739 glsl_type const *type = ir->sampler->type;
2740 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2741 type->sampler_array) {
2742 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2743 writemask(inst->dst, WRITEMASK_Z),
2744 src_reg(inst->dst), src_reg(6));
2745 }
2746 }
2747
2748 if (brw->gen == 6 && ir->op == ir_tg4) {
2749 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2750 }
2751
2752 swizzle_result(ir, src_reg(inst->dst), sampler);
2753 }
2754
2755 /**
2756 * Apply workarounds for Gen6 gather with UINT/SINT
2757 */
2758 void
2759 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2760 {
2761 if (!wa)
2762 return;
2763
2764 int width = (wa & WA_8BIT) ? 8 : 16;
2765 dst_reg dst_f = dst;
2766 dst_f.type = BRW_REGISTER_TYPE_F;
2767
2768 /* Convert from UNORM to UINT */
2769 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2770 emit(MOV(dst, src_reg(dst_f)));
2771
2772 if (wa & WA_SIGN) {
2773 /* Reinterpret the UINT value as a signed INT value by
2774 * shifting the sign bit into place, then shifting back
2775 * preserving sign.
2776 */
2777 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2778 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2779 }
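   /* Illustrative walk-through: an 8-bit SINT texel of 0xff is returned as
    * UNORM 1.0; the MUL/MOV above turn that into the integer 255, and the
    * SHL/ASR by 24 sign-extend it back to -1.
    */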
2780 }
2781
2782 /**
2783 * Set up the gather channel based on the swizzle, for gather4.
2784 */
2785 uint32_t
2786 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2787 {
2788 ir_constant *chan = ir->lod_info.component->as_constant();
2789 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2790 switch (swiz) {
2791 case SWIZZLE_X: return 0;
2792 case SWIZZLE_Y:
2793 /* gather4 sampler is broken for green channel on RG32F --
2794 * we must ask for blue instead.
2795 */
2796 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2797 return 2;
2798 return 1;
2799 case SWIZZLE_Z: return 2;
2800 case SWIZZLE_W: return 3;
2801 default:
2802 unreachable("Not reached"); /* zero, one swizzles handled already */
2803 }
2804 }
2805
2806 void
2807 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2808 {
2809 int s = key->tex.swizzles[sampler];
2810
2811 this->result = src_reg(this, ir->type);
2812 dst_reg swizzled_result(this->result);
2813
2814 if (ir->op == ir_query_levels) {
2815 /* # levels is in .w */
2816 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2817 emit(MOV(swizzled_result, orig_val));
2818 return;
2819 }
2820
2821 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2822 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2823 emit(MOV(swizzled_result, orig_val));
2824 return;
2825 }
2826
2827
2828 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2829 int swizzle[4] = {0};
2830
2831 for (int i = 0; i < 4; i++) {
2832 switch (GET_SWZ(s, i)) {
2833 case SWIZZLE_ZERO:
2834 zero_mask |= (1 << i);
2835 break;
2836 case SWIZZLE_ONE:
2837 one_mask |= (1 << i);
2838 break;
2839 default:
2840 copy_mask |= (1 << i);
2841 swizzle[i] = GET_SWZ(s, i);
2842 break;
2843 }
2844 }
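   /* For example, a format swizzle of (R, R, R, ONE) produces copy_mask .xyz
    * with swizzle XXXX plus one_mask .w, so the code below emits one swizzled
    * MOV and one MOV of 1.0f.
    */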
2845
2846 if (copy_mask) {
2847 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2848 swizzled_result.writemask = copy_mask;
2849 emit(MOV(swizzled_result, orig_val));
2850 }
2851
2852 if (zero_mask) {
2853 swizzled_result.writemask = zero_mask;
2854 emit(MOV(swizzled_result, src_reg(0.0f)));
2855 }
2856
2857 if (one_mask) {
2858 swizzled_result.writemask = one_mask;
2859 emit(MOV(swizzled_result, src_reg(1.0f)));
2860 }
2861 }
2862
2863 void
2864 vec4_visitor::visit(ir_return *)
2865 {
2866 unreachable("not reached");
2867 }
2868
2869 void
2870 vec4_visitor::visit(ir_discard *)
2871 {
2872 unreachable("not reached");
2873 }
2874
2875 void
2876 vec4_visitor::visit(ir_if *ir)
2877 {
2878 /* Don't point the annotation at the if statement, because then it plus
2879 * the then and else blocks get printed.
2880 */
2881 this->base_ir = ir->condition;
2882
2883 if (brw->gen == 6) {
2884 emit_if_gen6(ir);
2885 } else {
2886 enum brw_predicate predicate;
2887 emit_bool_to_cond_code(ir->condition, &predicate);
2888 emit(IF(predicate));
2889 }
2890
2891 visit_instructions(&ir->then_instructions);
2892
2893 if (!ir->else_instructions.is_empty()) {
2894 this->base_ir = ir->condition;
2895 emit(BRW_OPCODE_ELSE);
2896
2897 visit_instructions(&ir->else_instructions);
2898 }
2899
2900 this->base_ir = ir->condition;
2901 emit(BRW_OPCODE_ENDIF);
2902 }
2903
2904 void
2905 vec4_visitor::visit(ir_emit_vertex *)
2906 {
2907 unreachable("not reached");
2908 }
2909
2910 void
2911 vec4_visitor::visit(ir_end_primitive *)
2912 {
2913 unreachable("not reached");
2914 }
2915
2916 void
2917 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2918 dst_reg dst, src_reg offset,
2919 src_reg src0, src_reg src1)
2920 {
2921 unsigned mlen = 0;
2922
2923 /* Set the atomic operation offset. */
2924 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2925 mlen++;
2926
2927 /* Set the atomic operation arguments. */
2928 if (src0.file != BAD_FILE) {
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2930 mlen++;
2931 }
2932
2933 if (src1.file != BAD_FILE) {
2934 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2935 mlen++;
2936 }
2937
2938 /* Emit the instruction. Note that this maps to the normal SIMD8
2939 * untyped atomic message on Ivy Bridge, but that's OK because
2940 * unused channels will be masked out.
2941 */
2942 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2943 src_reg(atomic_op), src_reg(surf_index));
2944 inst->base_mrf = 0;
2945 inst->mlen = mlen;
2946 }
2947
2948 void
2949 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2950 src_reg offset)
2951 {
2952 /* Set the surface read offset. */
2953 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2954
2955 /* Emit the instruction. Note that this maps to the normal SIMD8
2956 * untyped surface read message, but that's OK because unused
2957 * channels will be masked out.
2958 */
2959 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2960 dst, src_reg(surf_index));
2961 inst->base_mrf = 0;
2962 inst->mlen = 1;
2963 }
2964
2965 void
2966 vec4_visitor::emit_ndc_computation()
2967 {
2968 /* Get the position */
2969 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2970
2971 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2972 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2973 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2974
2975 current_annotation = "NDC";
2976 dst_reg ndc_w = ndc;
2977 ndc_w.writemask = WRITEMASK_W;
2978 src_reg pos_w = pos;
2979 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2980 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2981
2982 dst_reg ndc_xyz = ndc;
2983 ndc_xyz.writemask = WRITEMASK_XYZ;
2984
2985 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2986 }
2987
2988 void
2989 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2990 {
2991 if (brw->gen < 6 &&
2992 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2993 key->userclip_active || brw->has_negative_rhw_bug)) {
2994 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2995 dst_reg header1_w = header1;
2996 header1_w.writemask = WRITEMASK_W;
2997
2998 emit(MOV(header1, 0u));
2999
3000 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3001 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3002
3003 current_annotation = "Point size";
3004 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3005 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3006 }
3007
3008 if (key->userclip_active) {
3009 current_annotation = "Clipping flags";
3010 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3011 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3012
3013 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3014 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3015 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3016
3017 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3018 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3019 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3020 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3021 }
3022
3023 /* i965 clipping workaround:
3024 * 1) Test for -ve rhw
3025 * 2) If set,
3026 * set ndc = (0,0,0,0)
3027 * set ucp[6] = 1
3028 *
3029 * Later, clipping will detect ucp[6] and ensure the primitive is
3030 * clipped against all fixed planes.
3031 */
3032 if (brw->has_negative_rhw_bug) {
3033 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3034 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3035 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3036 vec4_instruction *inst;
3037 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3038 inst->predicate = BRW_PREDICATE_NORMAL;
3039 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3040 inst->predicate = BRW_PREDICATE_NORMAL;
3041 }
3042
3043 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3044 } else if (brw->gen < 6) {
3045 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3046 } else {
3047 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3048 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3049 dst_reg reg_w = reg;
3050 reg_w.writemask = WRITEMASK_W;
3051 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3052 }
3053 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3054 dst_reg reg_y = reg;
3055 reg_y.writemask = WRITEMASK_Y;
3056 reg_y.type = BRW_REGISTER_TYPE_D;
3057 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3058 }
3059 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3060 dst_reg reg_z = reg;
3061 reg_z.writemask = WRITEMASK_Z;
3062 reg_z.type = BRW_REGISTER_TYPE_D;
3063 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3064 }
3065 }
3066 }
3067
3068 void
3069 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3070 {
3071 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3072 *
3073 * "If a linked set of shaders forming the vertex stage contains no
3074 * static write to gl_ClipVertex or gl_ClipDistance, but the
3075 * application has requested clipping against user clip planes through
3076 * the API, then the coordinate written to gl_Position is used for
3077 * comparison against the user clip planes."
3078 *
3079 * This function is only called if the shader didn't write to
3080 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3081 * if the user wrote to it; otherwise we use gl_Position.
3082 */
3083 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3084 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3085 clip_vertex = VARYING_SLOT_POS;
3086 }
3087
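   /* Each iteration below emits a DP4 computing dot(clip_vertex, plane); for
    * example, with offset == 4 this fills .x through .w of the second clip
    * distance slot for planes 4-7, assuming that many planes are enabled.
    */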
3088 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3089 ++i) {
3090 reg.writemask = 1 << i;
3091 emit(DP4(reg,
3092 src_reg(output_reg[clip_vertex]),
3093 src_reg(this->userplane[i + offset])));
3094 }
3095 }
3096
3097 vec4_instruction *
3098 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3099 {
3100 assert (varying < VARYING_SLOT_MAX);
3101 reg.type = output_reg[varying].type;
3102 current_annotation = output_reg_annotation[varying];
3103 /* Copy the register, saturating if necessary */
3104 return emit(MOV(reg, src_reg(output_reg[varying])));
3105 }
3106
3107 void
3108 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3109 {
3110 reg.type = BRW_REGISTER_TYPE_F;
3111
3112 switch (varying) {
3113 case VARYING_SLOT_PSIZ:
3114 {
3115 /* PSIZ is always in slot 0, and is coupled with other flags. */
3116 current_annotation = "indices, point width, clip flags";
3117 emit_psiz_and_flags(reg);
3118 break;
3119 }
3120 case BRW_VARYING_SLOT_NDC:
3121 current_annotation = "NDC";
3122 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3123 break;
3124 case VARYING_SLOT_POS:
3125 current_annotation = "gl_Position";
3126 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3127 break;
3128 case VARYING_SLOT_EDGE:
3129 /* This is present when doing unfilled polygons. We're supposed to copy
3130 * the edge flag from the user-provided vertex array
3131 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3132 * of that attribute (starts as 1.0f). This is then used in clipping to
3133 * determine which edges should be drawn as wireframe.
3134 */
3135 current_annotation = "edge flag";
3136 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3137 glsl_type::float_type, WRITEMASK_XYZW))));
3138 break;
3139 case BRW_VARYING_SLOT_PAD:
3140 /* No need to write to this slot */
3141 break;
3142 case VARYING_SLOT_COL0:
3143 case VARYING_SLOT_COL1:
3144 case VARYING_SLOT_BFC0:
3145 case VARYING_SLOT_BFC1: {
3146 /* These built-in varyings are only supported in compatibility mode,
3147 * and we only support GS in core profile. So, this must be a vertex
3148 * shader.
3149 */
3150 assert(stage == MESA_SHADER_VERTEX);
3151 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3152 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3153 inst->saturate = true;
3154 break;
3155 }
3156
3157 default:
3158 emit_generic_urb_slot(reg, varying);
3159 break;
3160 }
3161 }
3162
3163 static int
3164 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3165 {
3166 if (brw->gen >= 6) {
3167 /* URB data written (does not include the message header reg) must
3168 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3169 * section 5.4.3.2.2: URB_INTERLEAVED.
3170 *
3171 * URB entries are allocated on a multiple of 1024 bits, so an
3172 * extra 128 bits written here to make the end align to 256 is
3173 * no problem.
3174 */
3175 if ((mlen % 2) != 1)
3176 mlen++;
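      /* For example, a header plus five data registers gives mlen == 6;
       * bumping it to 7 pads the data written to six registers, a multiple
       * of two.
       */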
3177 }
3178
3179 return mlen;
3180 }
3181
3182
3183 /**
3184 * Generates the VUE payload plus the necessary URB write instructions to
3185 * output it.
3186 *
3187 * The VUE layout is documented in Volume 2a.
3188 */
3189 void
3190 vec4_visitor::emit_vertex()
3191 {
3192 /* MRF 0 is reserved for the debugger, so start with message header
3193 * in MRF 1.
3194 */
3195 int base_mrf = 1;
3196 int mrf = base_mrf;
3197 /* In the process of generating our URB write message contents, we
3198 * may need to unspill a register or load from an array. Those
3199 * reads would use MRFs 14-15.
3200 */
3201 int max_usable_mrf = 13;
3202
3203 /* The following assertion verifies that max_usable_mrf causes an
3204 * even-numbered amount of URB write data, which will meet gen6's
3205 * requirements for length alignment.
3206 */
3207 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3208
3209 /* First mrf is the g0-based message header containing URB handles and
3210 * such.
3211 */
3212 emit_urb_write_header(mrf++);
3213
3214 if (brw->gen < 6) {
3215 emit_ndc_computation();
3216 }
3217
3218 /* Lower legacy ff and ClipVertex clipping to clip distances */
3219 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3220 current_annotation = "user clip distances";
3221
3222 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3223 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3224
3225 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3226 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3227 }
3228
3229 /* We may need to split this up into several URB writes, so do them in a
3230 * loop.
3231 */
3232 int slot = 0;
3233 bool complete = false;
3234 do {
3235 /* URB offset is in URB row increments, and each of our MRFs is half of
3236 * one of those, since we're doing interleaved writes.
3237 */
3238 int offset = slot / 2;
3239
3240 mrf = base_mrf + 1;
3241 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3242 emit_urb_slot(dst_reg(MRF, mrf++),
3243 prog_data->vue_map.slot_to_varying[slot]);
3244
3245 /* If this was max_usable_mrf, we can't fit anything more into this
3246 * URB WRITE.
3247 */
3248 if (mrf > max_usable_mrf) {
3249 slot++;
3250 break;
3251 }
3252 }
3253
3254 complete = slot >= prog_data->vue_map.num_slots;
3255 current_annotation = "URB write";
3256 vec4_instruction *inst = emit_urb_write_opcode(complete);
3257 inst->base_mrf = base_mrf;
3258 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3259 inst->offset += offset;
3260    } while (!complete);
3261 }
3262
3263
3264 src_reg
3265 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3266 src_reg *reladdr, int reg_offset)
3267 {
3268 /* Because we store the values to scratch interleaved like our
3269 * vertex data, we need to scale the vec4 index by 2.
3270 */
3271 int message_header_scale = 2;
3272
3273 /* Pre-gen6, the message header uses byte offsets instead of vec4
3274 * (16-byte) offset units.
3275 */
3276 if (brw->gen < 6)
3277 message_header_scale *= 16;
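   /* Illustrative: a reg_offset of 3 therefore becomes row offset 6 on Gen6+
    * (interleaved vec4 rows), or byte offset 96 on Gen4-5.
    */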
3278
3279 if (reladdr) {
3280 src_reg index = src_reg(this, glsl_type::int_type);
3281
3282 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3283 src_reg(reg_offset)));
3284 emit_before(block, inst, MUL(dst_reg(index), index,
3285 src_reg(message_header_scale)));
3286
3287 return index;
3288 } else {
3289 return src_reg(reg_offset * message_header_scale);
3290 }
3291 }
3292
3293 src_reg
3294 vec4_visitor::get_pull_constant_offset(bblock_t *block, vec4_instruction *inst,
3295 src_reg *reladdr, int reg_offset)
3296 {
3297 if (reladdr) {
3298 src_reg index = src_reg(this, glsl_type::int_type);
3299
3300 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3301 src_reg(reg_offset)));
3302
3303 /* Pre-gen6, the message header uses byte offsets instead of vec4
3304 * (16-byte) offset units.
3305 */
3306 if (brw->gen < 6) {
3307 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3308 }
3309
3310 return index;
3311 } else if (brw->gen >= 8) {
3312 /* Store the offset in a GRF so we can send-from-GRF. */
3313 src_reg offset = src_reg(this, glsl_type::int_type);
3314 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3315 return offset;
3316 } else {
3317 int message_header_scale = brw->gen < 6 ? 16 : 1;
3318 return src_reg(reg_offset * message_header_scale);
3319 }
3320 }
3321
3322 /**
3323 * Emits an instruction before @inst to load the value named by @orig_src
3324 * from scratch space at @base_offset to @temp.
3325 *
3326 * @base_offset is measured in 32-byte units (the size of a register).
3327 */
3328 void
3329 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3330 dst_reg temp, src_reg orig_src,
3331 int base_offset)
3332 {
3333 int reg_offset = base_offset + orig_src.reg_offset;
3334 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3335 reg_offset);
3336
3337 emit_before(block, inst, SCRATCH_READ(temp, index));
3338 }
3339
3340 /**
3341 * Emits an instruction after @inst to store the value to be written
3342 * to @orig_dst to scratch space at @base_offset, from @temp.
3343 *
3344 * @base_offset is measured in 32-byte units (the size of a register).
3345 */
3346 void
3347 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3348 int base_offset)
3349 {
3350 int reg_offset = base_offset + inst->dst.reg_offset;
3351 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3352 reg_offset);
3353
3354 /* Create a temporary register to store *inst's result in.
3355 *
3356 * We have to be careful in MOVing from our temporary result register in
3357 * the scratch write. If we swizzle from channels of the temporary that
3358 * weren't initialized, it will confuse live interval analysis, which will
3359 * make spilling fail to make progress.
3360 */
3361 src_reg temp = src_reg(this, glsl_type::vec4_type);
3362 temp.type = inst->dst.type;
3363 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3364 int swizzles[4];
3365 for (int i = 0; i < 4; i++)
3366 if (inst->dst.writemask & (1 << i))
3367 swizzles[i] = i;
3368 else
3369 swizzles[i] = first_writemask_chan;
3370 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3371 swizzles[2], swizzles[3]);
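   /* For illustration: if the instruction only writes .yz, the temporary is
    * read back with swizzle (Y, Y, Z, Y), so the uninitialized .x and .w
    * channels are never sourced.
    */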
3372
3373 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3374 inst->dst.writemask));
3375 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3376 write->predicate = inst->predicate;
3377 write->ir = inst->ir;
3378 write->annotation = inst->annotation;
3379 inst->insert_after(block, write);
3380
3381 inst->dst.file = temp.file;
3382 inst->dst.reg = temp.reg;
3383 inst->dst.reg_offset = temp.reg_offset;
3384 inst->dst.reladdr = NULL;
3385 }
3386
3387 /**
3388 * We can't generally support array access in GRF space, because a
3389 * single instruction's destination can only span 2 contiguous
3390 * registers. So, we send all GRF arrays that get variable index
3391 * access to scratch space.
3392 */
3393 void
3394 vec4_visitor::move_grf_array_access_to_scratch()
3395 {
3396 int scratch_loc[this->alloc.count];
3397 memset(scratch_loc, -1, sizeof(scratch_loc));
3398
3399 /* First, calculate the set of virtual GRFs that need to be punted
3400 * to scratch due to having any array access on them, and where in
3401 * scratch.
3402 */
3403 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3404 if (inst->dst.file == GRF && inst->dst.reladdr &&
3405 scratch_loc[inst->dst.reg] == -1) {
3406 scratch_loc[inst->dst.reg] = c->last_scratch;
3407 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3408 }
3409
3410 for (int i = 0 ; i < 3; i++) {
3411 src_reg *src = &inst->src[i];
3412
3413 if (src->file == GRF && src->reladdr &&
3414 scratch_loc[src->reg] == -1) {
3415 scratch_loc[src->reg] = c->last_scratch;
3416 c->last_scratch += this->alloc.sizes[src->reg];
3417 }
3418 }
3419 }
3420
3421 /* Now, for anything that will be accessed through scratch, rewrite
3422 * it to load/store. Note that this is a _safe list walk, because
3423 * we may generate a new scratch_write instruction after the one
3424 * we're processing.
3425 */
3426 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3427 /* Set up the annotation tracking for new generated instructions. */
3428 base_ir = inst->ir;
3429 current_annotation = inst->annotation;
3430
3431 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3432 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3433 }
3434
3435 for (int i = 0 ; i < 3; i++) {
3436 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3437 continue;
3438
3439 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3440
3441 emit_scratch_read(block, inst, temp, inst->src[i],
3442 scratch_loc[inst->src[i].reg]);
3443
3444 inst->src[i].file = temp.file;
3445 inst->src[i].reg = temp.reg;
3446 inst->src[i].reg_offset = temp.reg_offset;
3447 inst->src[i].reladdr = NULL;
3448 }
3449 }
3450 }
3451
3452 /**
3453 * Emits an instruction before @inst to load the value named by @orig_src
3454 * from the pull constant buffer (surface) at @base_offset to @temp.
3455 */
3456 void
3457 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3458 dst_reg temp, src_reg orig_src,
3459 int base_offset)
3460 {
3461 int reg_offset = base_offset + orig_src.reg_offset;
3462 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3463 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3464 reg_offset);
3465 vec4_instruction *load;
3466
3467 if (brw->gen >= 7) {
3468 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3469 grf_offset.type = offset.type;
3470 emit_before(block, inst, MOV(grf_offset, offset));
3471
3472 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3473 temp, index, src_reg(grf_offset));
3474 load->mlen = 1;
3475 } else {
3476 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3477 temp, index, offset);
3478 load->base_mrf = 14;
3479 load->mlen = 1;
3480 }
3481 emit_before(block, inst, load);
3482 }
3483
3484 /**
3485 * Implements array access of uniforms by inserting a
3486 * PULL_CONSTANT_LOAD instruction.
3487 *
3488 * Unlike temporary GRF array access (where we don't support it due to
3489 * the difficulty of doing relative addressing on instruction
3490 * destinations), we could potentially do array access of uniforms
3491 * that were loaded in GRF space as push constants. In real-world
3492 * usage we've seen, though, the arrays being used are always larger
3493 * than we could load as push constants, so just always move all
3494 * uniform array access out to a pull constant buffer.
3495 */
3496 void
3497 vec4_visitor::move_uniform_array_access_to_pull_constants()
3498 {
3499 int pull_constant_loc[this->uniforms];
3500 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3501 bool nested_reladdr;
3502
3503 /* Walk through and find array access of uniforms. Put a copy of that
3504 * uniform in the pull constant buffer.
3505 *
3506 * Note that we don't move constant-indexed accesses to arrays. No
3507 * testing has been done of the performance impact of this choice.
3508 */
3509 do {
3510 nested_reladdr = false;
3511
3512 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3513 for (int i = 0 ; i < 3; i++) {
3514 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3515 continue;
3516
3517 int uniform = inst->src[i].reg;
3518
3519 if (inst->src[i].reladdr->reladdr)
3520 nested_reladdr = true; /* will need another pass */
3521
3522 /* If this array isn't already present in the pull constant buffer,
3523 * add it.
3524 */
3525 if (pull_constant_loc[uniform] == -1) {
3526 const gl_constant_value **values =
3527 &stage_prog_data->param[uniform * 4];
3528
3529 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3530
3531 assert(uniform < uniform_array_size);
3532 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3533 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3534 = values[j];
3535 }
3536 }
3537
3538 /* Set up the annotation tracking for new generated instructions. */
3539 base_ir = inst->ir;
3540 current_annotation = inst->annotation;
3541
3542 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3543
3544 emit_pull_constant_load(block, inst, temp, inst->src[i],
3545 pull_constant_loc[uniform]);
3546
3547 inst->src[i].file = temp.file;
3548 inst->src[i].reg = temp.reg;
3549 inst->src[i].reg_offset = temp.reg_offset;
3550 inst->src[i].reladdr = NULL;
3551 }
3552 }
3553 } while (nested_reladdr);
3554
3555 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3556 * no need to track them as larger-than-vec4 objects. This will be
3557 * relied on in cutting out unused uniform vectors from push
3558 * constants.
3559 */
3560 split_uniform_registers();
3561 }
3562
3563 void
3564 vec4_visitor::resolve_ud_negate(src_reg *reg)
3565 {
3566 if (reg->type != BRW_REGISTER_TYPE_UD ||
3567 !reg->negate)
3568 return;
3569
3570 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3571 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3572 *reg = temp;
3573 }
3574
3575 /**
3576 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3577 *
3578 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3579 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3580 */
3581 void
3582 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3583 {
3584 assert(brw->gen <= 5);
3585
3586 if (!rvalue->type->is_boolean())
3587 return;
3588
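   /* Illustrative: a Gen4-5 CMP result of 0x00000001 is ANDed down to 1 and
    * then negated to -1 (0xffffffff), i.e. the canonical ~0 "true".
    */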
3589 src_reg and_result = src_reg(this, rvalue->type);
3590 src_reg neg_result = src_reg(this, rvalue->type);
3591 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3592 emit(MOV(dst_reg(neg_result), negate(and_result)));
3593 *reg = neg_result;
3594 }
3595
3596 vec4_visitor::vec4_visitor(struct brw_context *brw,
3597 struct brw_vec4_compile *c,
3598 struct gl_program *prog,
3599 const struct brw_vue_prog_key *key,
3600 struct brw_vue_prog_data *prog_data,
3601 struct gl_shader_program *shader_prog,
3602 gl_shader_stage stage,
3603 void *mem_ctx,
3604 bool no_spills,
3605 shader_time_shader_type st_base,
3606 shader_time_shader_type st_written,
3607 shader_time_shader_type st_reset)
3608 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3609 c(c),
3610 key(key),
3611 prog_data(prog_data),
3612 sanity_param_count(0),
3613 fail_msg(NULL),
3614 first_non_payload_grf(0),
3615 need_all_constants_in_pull_buffer(false),
3616 no_spills(no_spills),
3617 st_base(st_base),
3618 st_written(st_written),
3619 st_reset(st_reset)
3620 {
3621 this->mem_ctx = mem_ctx;
3622 this->failed = false;
3623
3624 this->base_ir = NULL;
3625 this->current_annotation = NULL;
3626 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3627
3628 this->variable_ht = hash_table_ctor(0,
3629 hash_table_pointer_hash,
3630 hash_table_pointer_compare);
3631
3632 this->virtual_grf_start = NULL;
3633 this->virtual_grf_end = NULL;
3634 this->live_intervals = NULL;
3635
3636 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3637
3638 this->uniforms = 0;
3639
3640 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3641 * at least one. See setup_uniforms() in brw_vec4.cpp.
3642 */
3643 this->uniform_array_size = 1;
3644 if (prog_data) {
3645 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3646 }
3647
3648 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3649 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3650 }
3651
3652 vec4_visitor::~vec4_visitor()
3653 {
3654 hash_table_dtor(this->variable_ht);
3655 }
3656
3657
3658 void
3659 vec4_visitor::fail(const char *format, ...)
3660 {
3661 va_list va;
3662 char *msg;
3663
3664 if (failed)
3665 return;
3666
3667 failed = true;
3668
3669 va_start(va, format);
3670 msg = ralloc_vasprintf(mem_ctx, format, va);
3671 va_end(va);
3672 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3673
3674 this->fail_msg = msg;
3675
3676 if (debug_enabled) {
3677 fprintf(stderr, "%s", msg);
3678 }
3679 }
3680
3681 } /* namespace brw */