i965/vec4: Emit MADs from (x + -(y * z)).
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
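/* Convenience constructors for ALU instructions: these macros build a
 * vec4_instruction with the given opcode and operands but do not add it to
 * the instruction stream; callers pass the result to emit().  ALU2_ACC
 * additionally marks the instruction as writing the accumulator.
 */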
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
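/* Build (but do not emit) Gen4-style scratch-space access messages.  The
 * base MRF and message length of the payload are fixed here; the index
 * source supplies the scratch offset.
 */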
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
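/* Emit a dot product of the requested width: 2, 3, or 4 components map to
 * DP2, DP3, and DP4 respectively.
 */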
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
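/* Emit a math instruction, working around per-generation restrictions:
 * sources may need to be copied to temporary GRFs (see fix_math_operand),
 * Gen6 math cannot use a destination writemask, and pre-Gen6 math is a send
 * that needs an MRF payload.
 */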
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
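/* 0x00, 0x60, 0x70, and 0x78 are the 8-bit vector-float encodings of 0.0,
 * 8.0, 16.0, and 24.0; the type-converting MOV into a uvec4 yields the
 * integer shift counts <0, 8, 16, 24>.
 */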
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
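/* packUnorm4x8: clamp each component to [0, 1], scale by 255, round to the
 * nearest even integer, convert to unsigned int, and pack the low byte of
 * each channel into the destination dword.
 */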
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
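/* packSnorm4x8: clamp each component to [-1, 1], scale by 127, round to the
 * nearest even integer, convert to signed int, and pack the low byte of each
 * channel into the destination dword.
 */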
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
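/* Return the number of vec4 registers needed to hold a value of the given
 * GLSL type.  Every scalar or vector occupies a full vec4 slot, matrices take
 * one slot per column, and samplers and atomics take no space at all.
 */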
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
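/* Set up one vec4 uniform per enabled user clip plane, pointing the
 * corresponding param[] entries at the GL clip-plane state.
 */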
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been set up by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
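/* Evaluate a boolean rvalue and load the result into the flag register,
 * returning in *predicate the predicate the caller should use (normal, or
 * ALL4H/ANY4H for vector comparisons).  Comparisons are folded directly into
 * a CMP; other expressions are reduced to a zero/non-zero test.
 */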
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
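/* Try to fuse an add of a multiply into a single MAD.  Both operand orders
 * are matched, and a negated multiply, x + -(y * z), is handled by folding
 * the negation into one of the multiplicand sources.
 */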
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 bool mul_negate = false;
1165 if (mul && mul->operation == ir_unop_neg) {
1166 mul = mul->operands[0]->as_expression();
1167 mul_negate = true;
1168 }
1169
1170 if (!mul || mul->operation != ir_binop_mul) {
1171 nonmul = ir->operands[0];
1172 mul = ir->operands[1]->as_expression();
1173
1174 if (mul && mul->operation == ir_unop_neg) {
1175 mul = mul->operands[0]->as_expression();
1176 mul_negate = true;
1177 }
1178
1179 if (!mul || mul->operation != ir_binop_mul)
1180 return false;
1181 }
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189
1190 mul->operands[1]->accept(this);
1191 src_reg src2 = fix_3src_operand(this->result);
1192
1193 this->result = src_reg(this, ir->type);
1194 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1195
1196 return true;
1197 }
1198
1199 bool
1200 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1201 {
1202 /* This optimization relies on CMP setting the destination to 0 when
1203 * false. Early hardware only sets the least significant bit, and
1204 * leaves the other bits undefined. So we can't use it.
1205 */
1206 if (brw->gen < 6)
1207 return false;
1208
1209 ir_expression *const cmp = ir->operands[0]->as_expression();
1210
1211 if (cmp == NULL)
1212 return false;
1213
1214 switch (cmp->operation) {
1215 case ir_binop_less:
1216 case ir_binop_greater:
1217 case ir_binop_lequal:
1218 case ir_binop_gequal:
1219 case ir_binop_equal:
1220 case ir_binop_nequal:
1221 break;
1222
1223 default:
1224 return false;
1225 }
1226
1227 cmp->operands[0]->accept(this);
1228 const src_reg cmp_src0 = this->result;
1229
1230 cmp->operands[1]->accept(this);
1231 const src_reg cmp_src1 = this->result;
1232
1233 this->result = src_reg(this, ir->type);
1234
1235 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1236 brw_conditional_for_comparison(cmp->operation)));
1237
1238 /* If the comparison is false, this->result will just happen to be zero.
1239 */
1240 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1241 this->result, src_reg(1.0f));
1242 inst->predicate = BRW_PREDICATE_NORMAL;
1243 inst->predicate_inverse = true;
1244
1245 return true;
1246 }
1247
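/* Emit a min or max (selected by the conditional mod).  Gen6+ can fold the
 * comparison into a single SEL; earlier generations need a CMP followed by a
 * predicated SEL.
 */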
1248 void
1249 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1250 src_reg src0, src_reg src1)
1251 {
1252 vec4_instruction *inst;
1253
1254 if (brw->gen >= 6) {
1255 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1256 inst->conditional_mod = conditionalmod;
1257 } else {
1258 emit(CMP(dst, src0, src1, conditionalmod));
1259
1260 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1261 inst->predicate = BRW_PREDICATE_NORMAL;
1262 }
1263 }
1264
1265 void
1266 vec4_visitor::emit_lrp(const dst_reg &dst,
1267 const src_reg &x, const src_reg &y, const src_reg &a)
1268 {
1269 if (brw->gen >= 6) {
1270 /* Note that the instruction's argument order is reversed from GLSL
1271 * and the IR.
1272 */
1273 emit(LRP(dst,
1274 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1275 } else {
1276 /* Earlier generations don't support three source operations, so we
1277 * need to emit x*(1-a) + y*a.
1278 */
1279 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1280 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1281 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1282 y_times_a.writemask = dst.writemask;
1283 one_minus_a.writemask = dst.writemask;
1284 x_times_one_minus_a.writemask = dst.writemask;
1285
1286 emit(MUL(y_times_a, y, a));
1287 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1288 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1289 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1290 }
1291 }
1292
1293 void
1294 vec4_visitor::visit(ir_expression *ir)
1295 {
1296 unsigned int operand;
1297 src_reg op[Elements(ir->operands)];
1298 vec4_instruction *inst;
1299
1300 if (ir->operation == ir_binop_add) {
1301 if (try_emit_mad(ir))
1302 return;
1303 }
1304
1305 if (ir->operation == ir_unop_b2f) {
1306 if (try_emit_b2f_of_compare(ir))
1307 return;
1308 }
1309
1310 /* Storage for our result. Ideally for an assignment we'd be using
1311 * the actual storage for the result here, instead.
1312 */
1313 dst_reg result_dst(this, ir->type);
1314 src_reg result_src(result_dst);
1315
1316 if (ir->operation == ir_triop_csel) {
1317 ir->operands[1]->accept(this);
1318 op[1] = this->result;
1319 ir->operands[2]->accept(this);
1320 op[2] = this->result;
1321
1322 enum brw_predicate predicate;
1323 emit_bool_to_cond_code(ir->operands[0], &predicate);
1324 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1325 inst->predicate = predicate;
1326 this->result = result_src;
1327 return;
1328 }
1329
1330 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1331 this->result.file = BAD_FILE;
1332 ir->operands[operand]->accept(this);
1333 if (this->result.file == BAD_FILE) {
1334 fprintf(stderr, "Failed to get tree for expression operand:\n");
1335 ir->operands[operand]->fprint(stderr);
1336 exit(1);
1337 }
1338 op[operand] = this->result;
1339
1340 /* Matrix expression operands should have been broken down to vector
1341 * operations already.
1342 */
1343 assert(!ir->operands[operand]->type->is_matrix());
1344 }
1345
1346 /* If nothing special happens, this is the result. */
1347 this->result = result_src;
1348
1349 switch (ir->operation) {
1350 case ir_unop_logic_not:
1351 emit(NOT(result_dst, op[0]));
1352 break;
1353 case ir_unop_neg:
1354 op[0].negate = !op[0].negate;
1355 emit(MOV(result_dst, op[0]));
1356 break;
1357 case ir_unop_abs:
1358 op[0].abs = true;
1359 op[0].negate = false;
1360 emit(MOV(result_dst, op[0]));
1361 break;
1362
1363 case ir_unop_sign:
1364 if (ir->type->is_float()) {
1365 /* AND(val, 0x80000000) gives the sign bit.
1366 *
1367 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1368 * zero.
1369 */
1370 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1371
1372 op[0].type = BRW_REGISTER_TYPE_UD;
1373 result_dst.type = BRW_REGISTER_TYPE_UD;
1374 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1375
1376 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1377 inst->predicate = BRW_PREDICATE_NORMAL;
1378
1379 this->result.type = BRW_REGISTER_TYPE_F;
1380 } else {
1381 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1382 * -> non-negative val generates 0x00000000.
1383 * Predicated OR sets 1 if val is positive.
1384 */
1385 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1386
1387 emit(ASR(result_dst, op[0], src_reg(31)));
1388
1389 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1390 inst->predicate = BRW_PREDICATE_NORMAL;
1391 }
1392 break;
1393
1394 case ir_unop_rcp:
1395 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1396 break;
1397
1398 case ir_unop_exp2:
1399 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1400 break;
1401 case ir_unop_log2:
1402 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1403 break;
1404 case ir_unop_exp:
1405 case ir_unop_log:
1406 unreachable("not reached: should be handled by ir_explog_to_explog2");
1407 case ir_unop_sin:
1408 case ir_unop_sin_reduced:
1409 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1410 break;
1411 case ir_unop_cos:
1412 case ir_unop_cos_reduced:
1413 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1414 break;
1415
1416 case ir_unop_dFdx:
1417 case ir_unop_dFdx_coarse:
1418 case ir_unop_dFdx_fine:
1419 case ir_unop_dFdy:
1420 case ir_unop_dFdy_coarse:
1421 case ir_unop_dFdy_fine:
1422 unreachable("derivatives not valid in vertex shader");
1423
1424 case ir_unop_bitfield_reverse:
1425 emit(BFREV(result_dst, op[0]));
1426 break;
1427 case ir_unop_bit_count:
1428 emit(CBIT(result_dst, op[0]));
1429 break;
1430 case ir_unop_find_msb: {
1431 src_reg temp = src_reg(this, glsl_type::uint_type);
1432
1433 inst = emit(FBH(dst_reg(temp), op[0]));
1434 inst->dst.writemask = WRITEMASK_XYZW;
1435
1436 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1437 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1438 * subtract the result from 31 to convert the MSB count into an LSB count.
1439 */
1440
1441 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1442 temp.swizzle = BRW_SWIZZLE_NOOP;
1443 emit(MOV(result_dst, temp));
1444
1445 src_reg src_tmp = src_reg(result_dst);
1446 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1447
1448 src_tmp.negate = true;
1449 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1450 inst->predicate = BRW_PREDICATE_NORMAL;
1451 break;
1452 }
1453 case ir_unop_find_lsb:
1454 emit(FBL(result_dst, op[0]));
1455 break;
1456 case ir_unop_saturate:
1457 inst = emit(MOV(result_dst, op[0]));
1458 inst->saturate = true;
1459 break;
1460
1461 case ir_unop_noise:
1462 unreachable("not reached: should be handled by lower_noise");
1463
1464 case ir_binop_add:
1465 emit(ADD(result_dst, op[0], op[1]));
1466 break;
1467 case ir_binop_sub:
1468 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1469
1470 case ir_binop_mul:
1471 if (brw->gen < 8 && ir->type->is_integer()) {
1472 /* For integer multiplication, the MUL uses the low 16 bits of one of
1473 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1474 * accumulates in the contribution of the upper 16 bits of that
1475 * operand. If we can determine that one of the args is in the low
1476 * 16 bits, though, we can just emit a single MUL.
1477 */
1478 if (ir->operands[0]->is_uint16_constant()) {
1479 if (brw->gen < 7)
1480 emit(MUL(result_dst, op[0], op[1]));
1481 else
1482 emit(MUL(result_dst, op[1], op[0]));
1483 } else if (ir->operands[1]->is_uint16_constant()) {
1484 if (brw->gen < 7)
1485 emit(MUL(result_dst, op[1], op[0]));
1486 else
1487 emit(MUL(result_dst, op[0], op[1]));
1488 } else {
1489 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1490
1491 emit(MUL(acc, op[0], op[1]));
1492 emit(MACH(dst_null_d(), op[0], op[1]));
1493 emit(MOV(result_dst, src_reg(acc)));
1494 }
1495 } else {
1496 emit(MUL(result_dst, op[0], op[1]));
1497 }
1498 break;
1499 case ir_binop_imul_high: {
1500 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1501
1502 emit(MUL(acc, op[0], op[1]));
1503 emit(MACH(result_dst, op[0], op[1]));
1504 break;
1505 }
1506 case ir_binop_div:
1507 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1508 assert(ir->type->is_integer());
1509 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1510 break;
1511 case ir_binop_carry: {
1512 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1513
1514 emit(ADDC(dst_null_ud(), op[0], op[1]));
1515 emit(MOV(result_dst, src_reg(acc)));
1516 break;
1517 }
1518 case ir_binop_borrow: {
1519 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1520
1521 emit(SUBB(dst_null_ud(), op[0], op[1]));
1522 emit(MOV(result_dst, src_reg(acc)));
1523 break;
1524 }
1525 case ir_binop_mod:
1526 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1527 assert(ir->type->is_integer());
1528 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1529 break;
1530
1531 case ir_binop_less:
1532 case ir_binop_greater:
1533 case ir_binop_lequal:
1534 case ir_binop_gequal:
1535 case ir_binop_equal:
1536 case ir_binop_nequal: {
1537 if (brw->gen <= 5) {
1538 resolve_bool_comparison(ir->operands[0], &op[0]);
1539 resolve_bool_comparison(ir->operands[1], &op[1]);
1540 }
1541 emit(CMP(result_dst, op[0], op[1],
1542 brw_conditional_for_comparison(ir->operation)));
1543 break;
1544 }
1545
1546 case ir_binop_all_equal:
1547 /* "==" operator producing a scalar boolean. */
1548 if (ir->operands[0]->type->is_vector() ||
1549 ir->operands[1]->type->is_vector()) {
1550 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1551 emit(MOV(result_dst, src_reg(0)));
1552 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1553 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1554 } else {
1555 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1556 }
1557 break;
1558 case ir_binop_any_nequal:
1559 /* "!=" operator producing a scalar boolean. */
1560 if (ir->operands[0]->type->is_vector() ||
1561 ir->operands[1]->type->is_vector()) {
1562 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1563
1564 emit(MOV(result_dst, src_reg(0)));
1565 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1566 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1567 } else {
1568 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1569 }
1570 break;
1571
1572 case ir_unop_any:
1573 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1574 emit(MOV(result_dst, src_reg(0)));
1575
1576 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1577 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1578 break;
1579
1580 case ir_binop_logic_xor:
1581 emit(XOR(result_dst, op[0], op[1]));
1582 break;
1583
1584 case ir_binop_logic_or:
1585 emit(OR(result_dst, op[0], op[1]));
1586 break;
1587
1588 case ir_binop_logic_and:
1589 emit(AND(result_dst, op[0], op[1]));
1590 break;
1591
1592 case ir_binop_dot:
1593 assert(ir->operands[0]->type->is_vector());
1594 assert(ir->operands[0]->type == ir->operands[1]->type);
1595 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1596 break;
1597
1598 case ir_unop_sqrt:
1599 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1600 break;
1601 case ir_unop_rsq:
1602 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1603 break;
1604
1605 case ir_unop_bitcast_i2f:
1606 case ir_unop_bitcast_u2f:
1607 this->result = op[0];
1608 this->result.type = BRW_REGISTER_TYPE_F;
1609 break;
1610
1611 case ir_unop_bitcast_f2i:
1612 this->result = op[0];
1613 this->result.type = BRW_REGISTER_TYPE_D;
1614 break;
1615
1616 case ir_unop_bitcast_f2u:
1617 this->result = op[0];
1618 this->result.type = BRW_REGISTER_TYPE_UD;
1619 break;
1620
1621 case ir_unop_i2f:
1622 case ir_unop_i2u:
1623 case ir_unop_u2i:
1624 case ir_unop_u2f:
1625 case ir_unop_f2i:
1626 case ir_unop_f2u:
1627 emit(MOV(result_dst, op[0]));
1628 break;
1629 case ir_unop_b2i:
1630 emit(AND(result_dst, op[0], src_reg(1)));
1631 break;
1632 case ir_unop_b2f:
1633 if (brw->gen <= 5) {
1634 resolve_bool_comparison(ir->operands[0], &op[0]);
1635 }
1636 op[0].type = BRW_REGISTER_TYPE_D;
1637 result_dst.type = BRW_REGISTER_TYPE_D;
1638 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1639 result_dst.type = BRW_REGISTER_TYPE_F;
1640 break;
1641 case ir_unop_f2b:
1642 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1643 break;
1644 case ir_unop_i2b:
1645 emit(AND(result_dst, op[0], src_reg(1)));
1646 break;
1647
1648 case ir_unop_trunc:
1649 emit(RNDZ(result_dst, op[0]));
1650 break;
1651 case ir_unop_ceil: {
1652 src_reg tmp = src_reg(this, ir->type);
1653 op[0].negate = !op[0].negate;
1654 emit(RNDD(dst_reg(tmp), op[0]));
1655 tmp.negate = true;
1656 emit(MOV(result_dst, tmp));
1657 }
1658 break;
1659 case ir_unop_floor:
1660 inst = emit(RNDD(result_dst, op[0]));
1661 break;
1662 case ir_unop_fract:
1663 inst = emit(FRC(result_dst, op[0]));
1664 break;
1665 case ir_unop_round_even:
1666 emit(RNDE(result_dst, op[0]));
1667 break;
1668
1669 case ir_binop_min:
1670 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1671 break;
1672 case ir_binop_max:
1673 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1674 break;
1675
1676 case ir_binop_pow:
1677 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1678 break;
1679
1680 case ir_unop_bit_not:
1681 inst = emit(NOT(result_dst, op[0]));
1682 break;
1683 case ir_binop_bit_and:
1684 inst = emit(AND(result_dst, op[0], op[1]));
1685 break;
1686 case ir_binop_bit_xor:
1687 inst = emit(XOR(result_dst, op[0], op[1]));
1688 break;
1689 case ir_binop_bit_or:
1690 inst = emit(OR(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_lshift:
1694 inst = emit(SHL(result_dst, op[0], op[1]));
1695 break;
1696
1697 case ir_binop_rshift:
1698 if (ir->type->base_type == GLSL_TYPE_INT)
1699 inst = emit(ASR(result_dst, op[0], op[1]));
1700 else
1701 inst = emit(SHR(result_dst, op[0], op[1]));
1702 break;
1703
1704 case ir_binop_bfm:
1705 emit(BFI1(result_dst, op[0], op[1]));
1706 break;
1707
1708 case ir_binop_ubo_load: {
1709 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1710 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1711 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1712 src_reg offset;
1713
1714 /* Now, load the vector from that offset. */
1715 assert(ir->type->is_vector() || ir->type->is_scalar());
1716
1717 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1718 packed_consts.type = result.type;
1719 src_reg surf_index;
1720
1721 if (const_uniform_block) {
1722 /* The block index is a constant, so just emit the binding table entry
1723 * as an immediate.
1724 */
1725 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1726 const_uniform_block->value.u[0]);
1727 } else {
1728 /* The block index is not a constant. Evaluate the index expression
1729 * per-channel and add the base UBO index; the generator will select
1730 * a value from any live channel.
1731 */
1732 surf_index = src_reg(this, glsl_type::uint_type);
1733 emit(ADD(dst_reg(surf_index), op[0],
1734 src_reg(prog_data->base.binding_table.ubo_start)));
1735
1736 /* Assume this may touch any UBO. It would be nice to provide
1737 * a tighter bound, but the array information is already lowered away.
1738 */
1739 brw_mark_surface_used(&prog_data->base,
1740 prog_data->base.binding_table.ubo_start +
1741 shader_prog->NumUniformBlocks - 1);
1742 }
1743
1744 if (const_offset_ir) {
1745 if (brw->gen >= 8) {
1746 /* Store the offset in a GRF so we can send-from-GRF. */
1747 offset = src_reg(this, glsl_type::int_type);
1748 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1749 } else {
1750 /* Immediates are fine on older generations since they'll be moved
1751 * to a (potentially fake) MRF at the generator level.
1752 */
1753 offset = src_reg(const_offset / 16);
1754 }
1755 } else {
1756 offset = src_reg(this, glsl_type::uint_type);
1757 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1758 }
1759
1760 if (brw->gen >= 7) {
1761 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1762 grf_offset.type = offset.type;
1763
1764 emit(MOV(grf_offset, offset));
1765
1766 vec4_instruction *pull =
1767 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1768 dst_reg(packed_consts),
1769 surf_index,
1770 src_reg(grf_offset)));
1771 pull->mlen = 1;
1772 } else {
1773 vec4_instruction *pull =
1774 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1775 dst_reg(packed_consts),
1776 surf_index,
1777 offset));
1778 pull->base_mrf = 14;
1779 pull->mlen = 1;
1780 }
1781
1782 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1783 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1784 const_offset % 16 / 4,
1785 const_offset % 16 / 4,
1786 const_offset % 16 / 4);
1787
1788 /* UBO bools are any nonzero int. We need to convert them to use the
1789 * value of true stored in ctx->Const.UniformBooleanTrue.
1790 */
1791 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1792 emit(CMP(result_dst, packed_consts, src_reg(0u),
1793 BRW_CONDITIONAL_NZ));
1794 } else {
1795 emit(MOV(result_dst, packed_consts));
1796 }
1797 break;
1798 }
1799
1800 case ir_binop_vector_extract:
1801 unreachable("should have been lowered by vec_index_to_cond_assign");
1802
1803 case ir_triop_fma:
1804 op[0] = fix_3src_operand(op[0]);
1805 op[1] = fix_3src_operand(op[1]);
1806 op[2] = fix_3src_operand(op[2]);
1807 /* Note that the instruction's argument order is reversed from GLSL
1808 * and the IR.
1809 */
1810 emit(MAD(result_dst, op[2], op[1], op[0]));
1811 break;
1812
1813 case ir_triop_lrp:
1814 emit_lrp(result_dst, op[0], op[1], op[2]);
1815 break;
1816
1817 case ir_triop_csel:
1818 unreachable("already handled above");
1819 break;
1820
1821 case ir_triop_bfi:
1822 op[0] = fix_3src_operand(op[0]);
1823 op[1] = fix_3src_operand(op[1]);
1824 op[2] = fix_3src_operand(op[2]);
1825 emit(BFI2(result_dst, op[0], op[1], op[2]));
1826 break;
1827
1828 case ir_triop_bitfield_extract:
1829 op[0] = fix_3src_operand(op[0]);
1830 op[1] = fix_3src_operand(op[1]);
1831 op[2] = fix_3src_operand(op[2]);
1832 /* Note that the instruction's argument order is reversed from GLSL
1833 * and the IR.
1834 */
1835 emit(BFE(result_dst, op[2], op[1], op[0]));
1836 break;
1837
1838 case ir_triop_vector_insert:
1839 unreachable("should have been lowered by lower_vector_insert");
1840
1841 case ir_quadop_bitfield_insert:
1842 unreachable("not reached: should be handled by "
1843 "bitfield_insert_to_bfm_bfi\n");
1844
1845 case ir_quadop_vector:
1846 unreachable("not reached: should be handled by lower_quadop_vector");
1847
1848 case ir_unop_pack_half_2x16:
1849 emit_pack_half_2x16(result_dst, op[0]);
1850 break;
1851 case ir_unop_unpack_half_2x16:
1852 emit_unpack_half_2x16(result_dst, op[0]);
1853 break;
1854 case ir_unop_unpack_unorm_4x8:
1855 emit_unpack_unorm_4x8(result_dst, op[0]);
1856 break;
1857 case ir_unop_unpack_snorm_4x8:
1858 emit_unpack_snorm_4x8(result_dst, op[0]);
1859 break;
1860 case ir_unop_pack_unorm_4x8:
1861 emit_pack_unorm_4x8(result_dst, op[0]);
1862 break;
1863 case ir_unop_pack_snorm_4x8:
1864 emit_pack_snorm_4x8(result_dst, op[0]);
1865 break;
1866 case ir_unop_pack_snorm_2x16:
1867 case ir_unop_pack_unorm_2x16:
1868 case ir_unop_unpack_snorm_2x16:
1869 case ir_unop_unpack_unorm_2x16:
1870 unreachable("not reached: should be handled by lower_packing_builtins");
1871 case ir_unop_unpack_half_2x16_split_x:
1872 case ir_unop_unpack_half_2x16_split_y:
1873 case ir_binop_pack_half_2x16_split:
1874 case ir_unop_interpolate_at_centroid:
1875 case ir_binop_interpolate_at_sample:
1876 case ir_binop_interpolate_at_offset:
1877 unreachable("not reached: should not occur in vertex shader");
1878 case ir_binop_ldexp:
1879 unreachable("not reached: should be handled by ldexp_to_arith()");
1880 }
1881 }
1882
1883
1884 void
1885 vec4_visitor::visit(ir_swizzle *ir)
1886 {
1887 src_reg src;
1888 int i = 0;
1889 int swizzle[4];
1890
1891 /* Note that this is only swizzles in expressions, not those on the left
1892 * hand side of an assignment, which do write masking. See ir_assignment
1893 * for that.
1894 */
1895
1896 ir->val->accept(this);
1897 src = this->result;
1898 assert(src.file != BAD_FILE);
1899
1900 for (i = 0; i < ir->type->vector_elements; i++) {
1901 switch (i) {
1902 case 0:
1903 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1904 break;
1905 case 1:
1906 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1907 break;
1908 case 2:
1909 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1910 break;
1911 case 3:
1912 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1913 break;
1914 }
1915 }
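/* For types narrower than vec4, e.g. a vec2 .yx swizzle, the loop below pads the result out to .yxxx. */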
1916 for (; i < 4; i++) {
1917 /* Replicate the last channel out. */
1918 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1919 }
1920
1921 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1922
1923 this->result = src;
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_dereference_variable *ir)
1928 {
1929 const struct glsl_type *type = ir->type;
1930 dst_reg *reg = variable_storage(ir->var);
1931
1932 if (!reg) {
1933 fail("Failed to find variable storage for %s\n", ir->var->name);
1934 this->result = src_reg(brw_null_reg());
1935 return;
1936 }
1937
1938 this->result = src_reg(*reg);
1939
1940 /* System values get their swizzle from the dst_reg writemask */
1941 if (ir->var->data.mode == ir_var_system_value)
1942 return;
1943
1944 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1945 this->result.swizzle = swizzle_for_size(type->vector_elements);
1946 }
1947
1948
1949 int
1950 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1951 {
1952 /* Under normal circumstances array elements are stored consecutively, so
1953 * the stride is equal to the size of the array element.
1954 */
1955 return type_size(ir->type);
1956 }
1957
1958
1959 void
1960 vec4_visitor::visit(ir_dereference_array *ir)
1961 {
1962 ir_constant *constant_index;
1963 src_reg src;
1964 int array_stride = compute_array_stride(ir);
1965
1966 constant_index = ir->array_index->constant_expression_value();
1967
1968 ir->array->accept(this);
1969 src = this->result;
1970
1971 if (constant_index) {
1972 src.reg_offset += constant_index->value.i[0] * array_stride;
1973 } else {
1974 /* Variable index array dereference. It eats the "vec4" of the
1975 * base of the array and an index that offsets the Mesa register
1976 * index.
1977 */
1978 ir->array_index->accept(this);
1979
1980 src_reg index_reg;
1981
1982 if (array_stride == 1) {
1983 index_reg = this->result;
1984 } else {
1985 index_reg = src_reg(this, glsl_type::int_type);
1986
1987 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1988 }
1989
1990 if (src.reladdr) {
1991 src_reg temp = src_reg(this, glsl_type::int_type);
1992
1993 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1994
1995 index_reg = temp;
1996 }
1997
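/* Attach the computed index as the register's relative address. */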
1998 src.reladdr = ralloc(mem_ctx, src_reg);
1999 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2000 }
2001
2002 /* If the type is smaller than a vec4, replicate the last channel out. */
2003 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2004 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2005 else
2006 src.swizzle = BRW_SWIZZLE_NOOP;
2007 src.type = brw_type_for_base_type(ir->type);
2008
2009 this->result = src;
2010 }
2011
2012 void
2013 vec4_visitor::visit(ir_dereference_record *ir)
2014 {
2015 unsigned int i;
2016 const glsl_type *struct_type = ir->record->type;
2017 int offset = 0;
2018
2019 ir->record->accept(this);
2020
2021 for (i = 0; i < struct_type->length; i++) {
2022 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2023 break;
2024 offset += type_size(struct_type->fields.structure[i].type);
2025 }
2026
2027 /* If the type is smaller than a vec4, replicate the last channel out. */
2028 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2029 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2030 else
2031 this->result.swizzle = BRW_SWIZZLE_NOOP;
2032 this->result.type = brw_type_for_base_type(ir->type);
2033
2034 this->result.reg_offset += offset;
2035 }
2036
2037 /**
2038 * We want to be careful in assignment setup to hit the actual storage
2039 * instead of potentially using a temporary like we might with the
2040 * ir_dereference handler.
2041 */
2042 static dst_reg
2043 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2044 {
2045 /* The LHS must be a dereference. If the LHS is a variable indexed array
2046 * access of a vector, it must be separated into a series of conditional moves
2047 * before reaching this point (see ir_vec_index_to_cond_assign).
2048 */
2049 assert(ir->as_dereference());
2050 ir_dereference_array *deref_array = ir->as_dereference_array();
2051 if (deref_array) {
2052 assert(!deref_array->array->type->is_vector());
2053 }
2054
2055 /* Use the rvalue deref handler for the most part. We'll ignore
2056 * swizzles in it and write swizzles using writemask, though.
2057 */
2058 ir->accept(v);
2059 return dst_reg(v->result);
2060 }
2061
2062 void
2063 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2064 const struct glsl_type *type,
2065 enum brw_predicate predicate)
2066 {
2067 if (type->base_type == GLSL_TYPE_STRUCT) {
2068 for (unsigned int i = 0; i < type->length; i++) {
2069 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2070 }
2071 return;
2072 }
2073
2074 if (type->is_array()) {
2075 for (unsigned int i = 0; i < type->length; i++) {
2076 emit_block_move(dst, src, type->fields.array, predicate);
2077 }
2078 return;
2079 }
2080
2081 if (type->is_matrix()) {
2082 const struct glsl_type *vec_type;
2083
2084 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2085 type->vector_elements, 1);
2086
2087 for (int i = 0; i < type->matrix_columns; i++) {
2088 emit_block_move(dst, src, vec_type, predicate);
2089 }
2090 return;
2091 }
2092
2093 assert(type->is_scalar() || type->is_vector());
2094
2095 dst->type = brw_type_for_base_type(type);
2096 src->type = dst->type;
2097
2098 dst->writemask = (1 << type->vector_elements) - 1;
2099
2100 src->swizzle = swizzle_for_size(type->vector_elements);
2101
2102 vec4_instruction *inst = emit(MOV(*dst, *src));
2103 inst->predicate = predicate;
2104
2105 dst->reg_offset++;
2106 src->reg_offset++;
2107 }
2108
2109
2110 /* If the RHS processing resulted in an instruction generating a
2111 * temporary value, and it would be easy to rewrite the instruction to
2112 * generate its result right into the LHS instead, do so. This ends
2113 * up reliably removing instructions where it can be tricky to do so
2114 * later without real UD chain information.
2115 */
2116 bool
2117 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2118 dst_reg dst,
2119 src_reg src,
2120 vec4_instruction *pre_rhs_inst,
2121 vec4_instruction *last_rhs_inst)
2122 {
2123 /* This could be supported, but it would take more smarts. */
2124 if (ir->condition)
2125 return false;
2126
2127 if (pre_rhs_inst == last_rhs_inst)
2128 return false; /* No instructions generated to work with. */
2129
2130 /* Make sure the last instruction generated our source reg. */
2131 if (src.file != GRF ||
2132 src.file != last_rhs_inst->dst.file ||
2133 src.reg != last_rhs_inst->dst.reg ||
2134 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2135 src.reladdr ||
2136 src.abs ||
2137 src.negate ||
2138 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2139 return false;
2140
2141 /* Check that the last instruction fully initialized the channels
2142 * we want to use, in the order we want to use them. We could
2143 * potentially reswizzle the operands of many instructions so that
2144 * we could handle out of order channels, but don't yet.
2145 */
2146
2147 for (unsigned i = 0; i < 4; i++) {
2148 if (dst.writemask & (1 << i)) {
2149 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2150 return false;
2151
2152 if (BRW_GET_SWZ(src.swizzle, i) != i)
2153 return false;
2154 }
2155 }
2156
2157 /* Success! Rewrite the instruction. */
2158 last_rhs_inst->dst.file = dst.file;
2159 last_rhs_inst->dst.reg = dst.reg;
2160 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2161 last_rhs_inst->dst.reladdr = dst.reladdr;
2162 last_rhs_inst->dst.writemask &= dst.writemask;
2163
2164 return true;
2165 }
2166
2167 void
2168 vec4_visitor::visit(ir_assignment *ir)
2169 {
2170 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2171 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2172
2173 if (!ir->lhs->type->is_scalar() &&
2174 !ir->lhs->type->is_vector()) {
2175 ir->rhs->accept(this);
2176 src_reg src = this->result;
2177
2178 if (ir->condition) {
2179 emit_bool_to_cond_code(ir->condition, &predicate);
2180 }
2181
2182 /* emit_block_move doesn't account for swizzles in the source register.
2183 * This should be ok, since the source register is a structure or an
2184 * array, and those can't be swizzled. But double-check to be sure.
2185 */
2186 assert(src.swizzle ==
2187 (ir->rhs->type->is_matrix()
2188 ? swizzle_for_size(ir->rhs->type->vector_elements)
2189 : BRW_SWIZZLE_NOOP));
2190
2191 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2192 return;
2193 }
2194
2195 /* Now we're down to just a scalar/vector with writemasks. */
2196 int i;
2197
2198 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2199 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2200
2201 ir->rhs->accept(this);
2202
2203 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2204
2205 src_reg src = this->result;
2206
2207 int swizzles[4];
2208 int first_enabled_chan = 0;
2209 int src_chan = 0;
2210
2211 assert(ir->lhs->type->is_vector() ||
2212 ir->lhs->type->is_scalar());
2213 dst.writemask = ir->write_mask;
2214
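/* Note which source channel the swizzle selects for the first written component; unwritten components reuse it below so the swizzle never points at undefined data. */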
2215 for (int i = 0; i < 4; i++) {
2216 if (dst.writemask & (1 << i)) {
2217 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2218 break;
2219 }
2220 }
2221
2222 /* Swizzle a small RHS vector into the channels being written.
2223 *
2224 * glsl ir treats write_mask as dictating how many channels are
2225 * present on the RHS while in our instructions we need to make
2226 * those channels appear in the slots of the vec4 they're written to.
2227 */
2228 for (int i = 0; i < 4; i++) {
2229 if (dst.writemask & (1 << i))
2230 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2231 else
2232 swizzles[i] = first_enabled_chan;
2233 }
2234 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2235 swizzles[2], swizzles[3]);
2236
2237 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2238 return;
2239 }
2240
2241 if (ir->condition) {
2242 emit_bool_to_cond_code(ir->condition, &predicate);
2243 }
2244
2245 for (i = 0; i < type_size(ir->lhs->type); i++) {
2246 vec4_instruction *inst = emit(MOV(dst, src));
2247 inst->predicate = predicate;
2248
2249 dst.reg_offset++;
2250 src.reg_offset++;
2251 }
2252 }
2253
2254 void
2255 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2256 {
2257 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2258 foreach_in_list(ir_constant, field_value, &ir->components) {
2259 emit_constant_values(dst, field_value);
2260 }
2261 return;
2262 }
2263
2264 if (ir->type->is_array()) {
2265 for (unsigned int i = 0; i < ir->type->length; i++) {
2266 emit_constant_values(dst, ir->array_elements[i]);
2267 }
2268 return;
2269 }
2270
2271 if (ir->type->is_matrix()) {
2272 for (int i = 0; i < ir->type->matrix_columns; i++) {
2273 float *vec = &ir->value.f[i * ir->type->vector_elements];
2274
2275 for (int j = 0; j < ir->type->vector_elements; j++) {
2276 dst->writemask = 1 << j;
2277 dst->type = BRW_REGISTER_TYPE_F;
2278
2279 emit(MOV(*dst, src_reg(vec[j])));
2280 }
2281 dst->reg_offset++;
2282 }
2283 return;
2284 }
2285
2286 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2287
2288 for (int i = 0; i < ir->type->vector_elements; i++) {
2289 if (!(remaining_writemask & (1 << i)))
2290 continue;
2291
2292 dst->writemask = 1 << i;
2293 dst->type = brw_type_for_base_type(ir->type);
2294
2295 /* Find other components that match the one we're about to
2296 * write. Emits fewer instructions for things like vec4(0.5,
2297 * 1.5, 1.5, 1.5).
2298 */
2299 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2300 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2301 if (ir->value.b[i] == ir->value.b[j])
2302 dst->writemask |= (1 << j);
2303 } else {
2304 /* u, i, and f storage all line up, so no need for a
2305 * switch case for comparing each type.
2306 */
2307 if (ir->value.u[i] == ir->value.u[j])
2308 dst->writemask |= (1 << j);
2309 }
2310 }
2311
2312 switch (ir->type->base_type) {
2313 case GLSL_TYPE_FLOAT:
2314 emit(MOV(*dst, src_reg(ir->value.f[i])));
2315 break;
2316 case GLSL_TYPE_INT:
2317 emit(MOV(*dst, src_reg(ir->value.i[i])));
2318 break;
2319 case GLSL_TYPE_UINT:
2320 emit(MOV(*dst, src_reg(ir->value.u[i])));
2321 break;
2322 case GLSL_TYPE_BOOL:
2323 emit(MOV(*dst,
2324 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2325 : 0)));
2326 break;
2327 default:
2328 unreachable("Non-float/uint/int/bool constant");
2329 }
2330
2331 remaining_writemask &= ~dst->writemask;
2332 }
2333 dst->reg_offset++;
2334 }
2335
2336 void
2337 vec4_visitor::visit(ir_constant *ir)
2338 {
2339 dst_reg dst = dst_reg(this, ir->type);
2340 this->result = src_reg(dst);
2341
2342 emit_constant_values(&dst, ir);
2343 }
2344
2345 void
2346 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2347 {
2348 ir_dereference *deref = static_cast<ir_dereference *>(
2349 ir->actual_parameters.get_head());
2350 ir_variable *location = deref->variable_referenced();
2351 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2352 location->data.binding);
2353
2354 /* Calculate the surface offset */
2355 src_reg offset(this, glsl_type::uint_type);
2356 ir_dereference_array *deref_array = deref->as_dereference_array();
2357 if (deref_array) {
2358 deref_array->array_index->accept(this);
2359
2360 src_reg tmp(this, glsl_type::uint_type);
2361 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2362 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2363 } else {
2364 offset = location->data.atomic.offset;
2365 }
2366
2367 /* Emit the appropriate machine instruction */
2368 const char *callee = ir->callee->function_name();
2369 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2370
2371 if (!strcmp("__intrinsic_atomic_read", callee)) {
2372 emit_untyped_surface_read(surf_index, dst, offset);
2373
2374 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2375 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2376 src_reg(), src_reg());
2377
2378 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2379 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2380 src_reg(), src_reg());
2381 }
2382 }
2383
2384 void
2385 vec4_visitor::visit(ir_call *ir)
2386 {
2387 const char *callee = ir->callee->function_name();
2388
2389 if (!strcmp("__intrinsic_atomic_read", callee) ||
2390 !strcmp("__intrinsic_atomic_increment", callee) ||
2391 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2392 visit_atomic_counter_intrinsic(ir);
2393 } else {
2394 unreachable("Unsupported intrinsic.");
2395 }
2396 }
2397
2398 src_reg
2399 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2400 {
2401 vec4_instruction *inst =
2402 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2403 dst_reg(this, glsl_type::uvec4_type));
2404 inst->base_mrf = 2;
2405 inst->mlen = 1;
2406 inst->src[1] = sampler;
2407
2408 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2409 int param_base = inst->base_mrf;
2410 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2411 int zero_mask = 0xf & ~coord_mask;
2412
2413 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2414 coordinate));
2415
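/* Zero any parameter channels the coordinate doesn't cover; the LOD in particular must be 0. */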
2416 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2417 src_reg(0)));
2418
2419 emit(inst);
2420 return src_reg(inst->dst);
2421 }
2422
2423 static bool
2424 is_high_sampler(struct brw_context *brw, src_reg sampler)
2425 {
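/* A sampler index of 16 or above doesn't fit in the 4-bit sampler field of the message descriptor and needs the message header; a non-immediate index has to be assumed high. Only Haswell and Gen8+ support this at all. */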
2426 if (brw->gen < 8 && !brw->is_haswell)
2427 return false;
2428
2429 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2430 }
2431
2432 void
2433 vec4_visitor::visit(ir_texture *ir)
2434 {
2435 uint32_t sampler =
2436 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2437
2438 ir_rvalue *nonconst_sampler_index =
2439 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2440
2441 /* Handle non-constant sampler array indexing */
2442 src_reg sampler_reg;
2443 if (nonconst_sampler_index) {
2444 /* The highest sampler which may be used by this operation is
2445 * the last element of the array. Mark it here, because the generator
2446 * doesn't have enough information to determine the bound.
2447 */
2448 uint32_t array_size = ir->sampler->as_dereference_array()
2449 ->array->type->array_size();
2450
2451 uint32_t max_used = sampler + array_size - 1;
2452 if (ir->op == ir_tg4 && brw->gen < 8) {
2453 max_used += prog_data->base.binding_table.gather_texture_start;
2454 } else {
2455 max_used += prog_data->base.binding_table.texture_start;
2456 }
2457
2458 brw_mark_surface_used(&prog_data->base, max_used);
2459
2460 /* Emit code to evaluate the actual indexing expression */
2461 nonconst_sampler_index->accept(this);
2462 dst_reg temp(this, glsl_type::uint_type);
2463 emit(ADD(temp, this->result, src_reg(sampler)))
2464 ->force_writemask_all = true;
2465 sampler_reg = src_reg(temp);
2466 } else {
2467 /* Single sampler, or constant array index; the indexing expression
2468 * is just an immediate.
2469 */
2470 sampler_reg = src_reg(sampler);
2471 }
2472
2473 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2474 * emitting anything other than setting up the constant result.
2475 */
2476 if (ir->op == ir_tg4) {
2477 ir_constant *chan = ir->lod_info.component->as_constant();
2478 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2479 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2480 dst_reg result(this, ir->type);
2481 this->result = src_reg(result);
2482 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2483 return;
2484 }
2485 }
2486
2487 /* Should be lowered by do_lower_texture_projection */
2488 assert(!ir->projector);
2489
2490 /* Should be lowered */
2491 assert(!ir->offset || !ir->offset->type->is_array());
2492
2493 /* Generate code to compute all the subexpression trees. This has to be
2494 * done before loading any values into MRFs for the sampler message since
2495 * generating these values may involve SEND messages that need the MRFs.
2496 */
2497 src_reg coordinate;
2498 if (ir->coordinate) {
2499 ir->coordinate->accept(this);
2500 coordinate = this->result;
2501 }
2502
2503 src_reg shadow_comparitor;
2504 if (ir->shadow_comparitor) {
2505 ir->shadow_comparitor->accept(this);
2506 shadow_comparitor = this->result;
2507 }
2508
2509 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2510 src_reg offset_value;
2511 if (has_nonconstant_offset) {
2512 ir->offset->accept(this);
2513 offset_value = src_reg(this->result);
2514 }
2515
2516 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2517 src_reg lod, dPdx, dPdy, sample_index, mcs;
2518 switch (ir->op) {
2519 case ir_tex:
2520 lod = src_reg(0.0f);
2521 lod_type = glsl_type::float_type;
2522 break;
2523 case ir_txf:
2524 case ir_txl:
2525 case ir_txs:
2526 ir->lod_info.lod->accept(this);
2527 lod = this->result;
2528 lod_type = ir->lod_info.lod->type;
2529 break;
2530 case ir_query_levels:
2531 lod = src_reg(0);
2532 lod_type = glsl_type::int_type;
2533 break;
2534 case ir_txf_ms:
2535 ir->lod_info.sample_index->accept(this);
2536 sample_index = this->result;
2537 sample_index_type = ir->lod_info.sample_index->type;
2538
2539 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2540 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2541 else
2542 mcs = src_reg(0u);
2543 break;
2544 case ir_txd:
2545 ir->lod_info.grad.dPdx->accept(this);
2546 dPdx = this->result;
2547
2548 ir->lod_info.grad.dPdy->accept(this);
2549 dPdy = this->result;
2550
2551 lod_type = ir->lod_info.grad.dPdx->type;
2552 break;
2553 case ir_txb:
2554 case ir_lod:
2555 case ir_tg4:
2556 break;
2557 }
2558
2559 enum opcode opcode;
2560 switch (ir->op) {
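/* The VS has no implicit derivatives, so plain texturing (ir_tex) is implemented as TXL with the 0.0 LOD set up above. */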
2561 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2562 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2563 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2564 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2565 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2566 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2567 case ir_tg4: opcode = has_nonconstant_offset
2568 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2569 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2570 case ir_txb:
2571 unreachable("TXB is not valid for vertex shaders.");
2572 case ir_lod:
2573 unreachable("LOD is not valid for vertex shaders.");
2574 default:
2575 unreachable("Unrecognized tex op");
2576 }
2577
2578 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2579 opcode, dst_reg(this, ir->type));
2580
2581 if (ir->offset != NULL && !has_nonconstant_offset) {
2582 inst->offset =
2583 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2584 ir->offset->type->vector_elements);
2585 }
2586
2587 /* Stuff the channel select bits in the top of the texture offset */
2588 if (ir->op == ir_tg4)
2589 inst->offset |= gather_channel(ir, sampler) << 16;
2590
2591 /* The message header is necessary for:
2592 * - Gen4 (always)
2593 * - Gen9+ for selecting SIMD4x2
2594 * - Texel offsets
2595 * - Gather channel selection
2596 * - Sampler indices too large to fit in a 4-bit value.
2597 */
2598 inst->header_present =
2599 brw->gen < 5 || brw->gen >= 9 ||
2600 inst->offset != 0 || ir->op == ir_tg4 ||
2601 is_high_sampler(brw, sampler_reg);
2602 inst->base_mrf = 2;
2603 inst->mlen = inst->header_present + 1; /* always at least one */
2604 inst->shadow_compare = ir->shadow_comparitor != NULL;
2605
2606 inst->src[1] = sampler_reg;
2607
2608 /* MRF for the first parameter */
2609 int param_base = inst->base_mrf + inst->header_present;
2610
2611 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2612 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2613 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2614 } else {
2615 /* Load the coordinate */
2616 /* FINISHME: gl_clamp_mask and saturate */
2617 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2618 int zero_mask = 0xf & ~coord_mask;
2619
2620 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2621 coordinate));
2622
2623 if (zero_mask != 0) {
2624 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2625 src_reg(0)));
2626 }
2627 /* Load the shadow comparitor */
2628 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2629 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2630 WRITEMASK_X),
2631 shadow_comparitor));
2632 inst->mlen++;
2633 }
2634
2635 /* Load the LOD info */
2636 if (ir->op == ir_tex || ir->op == ir_txl) {
2637 int mrf, writemask;
2638 if (brw->gen >= 5) {
2639 mrf = param_base + 1;
2640 if (ir->shadow_comparitor) {
2641 writemask = WRITEMASK_Y;
2642 /* mlen already incremented */
2643 } else {
2644 writemask = WRITEMASK_X;
2645 inst->mlen++;
2646 }
2647 } else /* brw->gen == 4 */ {
2648 mrf = param_base;
2649 writemask = WRITEMASK_W;
2650 }
2651 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2652 } else if (ir->op == ir_txf) {
2653 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2654 } else if (ir->op == ir_txf_ms) {
2655 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2656 sample_index));
2657 if (brw->gen >= 7) {
2658 /* MCS data is in the first channel of `mcs`, but we need to get it into
2659 * the .y channel of the second vec4 of params, so replicate .x across
2660 * the whole vec4 and then mask off everything except .y
2661 */
2662 mcs.swizzle = BRW_SWIZZLE_XXXX;
2663 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2664 mcs));
2665 }
2666 inst->mlen++;
2667 } else if (ir->op == ir_txd) {
2668 const glsl_type *type = lod_type;
2669
2670 if (brw->gen >= 5) {
2671 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2672 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2673 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2674 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2675 inst->mlen++;
2676
2677 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2678 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2679 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2680 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2681 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2682 inst->mlen++;
2683
2684 if (ir->shadow_comparitor) {
2685 emit(MOV(dst_reg(MRF, param_base + 2,
2686 ir->shadow_comparitor->type, WRITEMASK_Z),
2687 shadow_comparitor));
2688 }
2689 }
2690 } else /* brw->gen == 4 */ {
2691 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2692 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2693 inst->mlen += 2;
2694 }
2695 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2696 if (ir->shadow_comparitor) {
2697 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2698 shadow_comparitor));
2699 }
2700
2701 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2702 offset_value));
2703 inst->mlen++;
2704 }
2705 }
2706
2707 emit(inst);
2708
2709 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2710 * spec requires layers.
2711 */
2712 if (ir->op == ir_txs) {
2713 glsl_type const *type = ir->sampler->type;
2714 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2715 type->sampler_array) {
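/* Divide the Z (depth) result by the 6 cube faces to get the layer count. */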
2716 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2717 writemask(inst->dst, WRITEMASK_Z),
2718 src_reg(inst->dst), src_reg(6));
2719 }
2720 }
2721
2722 if (brw->gen == 6 && ir->op == ir_tg4) {
2723 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2724 }
2725
2726 swizzle_result(ir, src_reg(inst->dst), sampler);
2727 }
2728
2729 /**
2730 * Apply workarounds for Gen6 gather with UINT/SINT
2731 */
2732 void
2733 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2734 {
2735 if (!wa)
2736 return;
2737
2738 int width = (wa & WA_8BIT) ? 8 : 16;
2739 dst_reg dst_f = dst;
2740 dst_f.type = BRW_REGISTER_TYPE_F;
2741
2742 /* Convert from UNORM to UINT */
2743 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2744 emit(MOV(dst, src_reg(dst_f)));
2745
2746 if (wa & WA_SIGN) {
2747 /* Reinterpret the UINT value as a signed INT value by
2748 * shifting the sign bit into place, then shifting back
2749 * preserving sign.
2750 */
2751 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2752 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2753 }
2754 }
2755
2756 /**
2757 * Set up the gather channel based on the swizzle, for gather4.
2758 */
2759 uint32_t
2760 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2761 {
2762 ir_constant *chan = ir->lod_info.component->as_constant();
2763 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2764 switch (swiz) {
2765 case SWIZZLE_X: return 0;
2766 case SWIZZLE_Y:
2767 /* gather4 sampler is broken for green channel on RG32F --
2768 * we must ask for blue instead.
2769 */
2770 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2771 return 2;
2772 return 1;
2773 case SWIZZLE_Z: return 2;
2774 case SWIZZLE_W: return 3;
2775 default:
2776 unreachable("Not reached"); /* zero, one swizzles handled already */
2777 }
2778 }
2779
2780 void
2781 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2782 {
2783 int s = key->tex.swizzles[sampler];
2784
2785 this->result = src_reg(this, ir->type);
2786 dst_reg swizzled_result(this->result);
2787
2788 if (ir->op == ir_query_levels) {
2789 /* # levels is in .w */
2790 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2791 emit(MOV(swizzled_result, orig_val));
2792 return;
2793 }
2794
2795 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2796 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2797 emit(MOV(swizzled_result, orig_val));
2798 return;
2799 }
2800
2801
2802 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2803 int swizzle[4] = {0};
2804
2805 for (int i = 0; i < 4; i++) {
2806 switch (GET_SWZ(s, i)) {
2807 case SWIZZLE_ZERO:
2808 zero_mask |= (1 << i);
2809 break;
2810 case SWIZZLE_ONE:
2811 one_mask |= (1 << i);
2812 break;
2813 default:
2814 copy_mask |= (1 << i);
2815 swizzle[i] = GET_SWZ(s, i);
2816 break;
2817 }
2818 }
2819
2820 if (copy_mask) {
2821 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2822 swizzled_result.writemask = copy_mask;
2823 emit(MOV(swizzled_result, orig_val));
2824 }
2825
2826 if (zero_mask) {
2827 swizzled_result.writemask = zero_mask;
2828 emit(MOV(swizzled_result, src_reg(0.0f)));
2829 }
2830
2831 if (one_mask) {
2832 swizzled_result.writemask = one_mask;
2833 emit(MOV(swizzled_result, src_reg(1.0f)));
2834 }
2835 }
2836
2837 void
2838 vec4_visitor::visit(ir_return *)
2839 {
2840 unreachable("not reached");
2841 }
2842
2843 void
2844 vec4_visitor::visit(ir_discard *)
2845 {
2846 unreachable("not reached");
2847 }
2848
2849 void
2850 vec4_visitor::visit(ir_if *ir)
2851 {
2852 /* Don't point the annotation at the if statement, because then it plus
2853 * the then and else blocks get printed.
2854 */
2855 this->base_ir = ir->condition;
2856
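/* Gen6 has an IF instruction that embeds the comparison, so no separate predicate needs to be computed. */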
2857 if (brw->gen == 6) {
2858 emit_if_gen6(ir);
2859 } else {
2860 enum brw_predicate predicate;
2861 emit_bool_to_cond_code(ir->condition, &predicate);
2862 emit(IF(predicate));
2863 }
2864
2865 visit_instructions(&ir->then_instructions);
2866
2867 if (!ir->else_instructions.is_empty()) {
2868 this->base_ir = ir->condition;
2869 emit(BRW_OPCODE_ELSE);
2870
2871 visit_instructions(&ir->else_instructions);
2872 }
2873
2874 this->base_ir = ir->condition;
2875 emit(BRW_OPCODE_ENDIF);
2876 }
2877
2878 void
2879 vec4_visitor::visit(ir_emit_vertex *)
2880 {
2881 unreachable("not reached");
2882 }
2883
2884 void
2885 vec4_visitor::visit(ir_end_primitive *)
2886 {
2887 unreachable("not reached");
2888 }
2889
2890 void
2891 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2892 dst_reg dst, src_reg offset,
2893 src_reg src0, src_reg src1)
2894 {
2895 unsigned mlen = 0;
2896
2897 /* Set the atomic operation offset. */
2898 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2899 mlen++;
2900
2901 /* Set the atomic operation arguments. */
2902 if (src0.file != BAD_FILE) {
2903 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2904 mlen++;
2905 }
2906
2907 if (src1.file != BAD_FILE) {
2908 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2909 mlen++;
2910 }
2911
2912 /* Emit the instruction. Note that this maps to the normal SIMD8
2913 * untyped atomic message on Ivy Bridge, but that's OK because
2914 * unused channels will be masked out.
2915 */
2916 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2917 src_reg(atomic_op), src_reg(surf_index));
2918 inst->base_mrf = 0;
2919 inst->mlen = mlen;
2920 }
2921
2922 void
2923 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2924 src_reg offset)
2925 {
2926 /* Set the surface read offset. */
2927 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2928
2929 /* Emit the instruction. Note that this maps to the normal SIMD8
2930 * untyped surface read message, but that's OK because unused
2931 * channels will be masked out.
2932 */
2933 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2934 dst, src_reg(surf_index));
2935 inst->base_mrf = 0;
2936 inst->mlen = 1;
2937 }
2938
2939 void
2940 vec4_visitor::emit_ndc_computation()
2941 {
2942 /* Get the position */
2943 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2944
2945 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2946 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2947 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2948
2949 current_annotation = "NDC";
2950 dst_reg ndc_w = ndc;
2951 ndc_w.writemask = WRITEMASK_W;
2952 src_reg pos_w = pos;
2953 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2954 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2955
2956 dst_reg ndc_xyz = ndc;
2957 ndc_xyz.writemask = WRITEMASK_XYZ;
2958
2959 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2960 }
2961
2962 void
2963 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2964 {
2965 if (brw->gen < 6 &&
2966 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2967 key->userclip_active || brw->has_negative_rhw_bug)) {
2968 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2969 dst_reg header1_w = header1;
2970 header1_w.writemask = WRITEMASK_W;
2971
2972 emit(MOV(header1, 0u));
2973
2974 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2975 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2976
2977 current_annotation = "Point size";
2978 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2979 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2980 }
2981
2982 if (key->userclip_active) {
2983 current_annotation = "Clipping flags";
2984 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2985 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2986
2987 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2988 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2989 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2990
2991 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2992 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2993 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2994 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2995 }
2996
2997 /* i965 clipping workaround:
2998 * 1) Test for negative RHW
2999 * 2) If set,
3000 * set ndc = (0,0,0,0)
3001 * set ucp[6] = 1
3002 *
3003 * Later, clipping will detect ucp[6] and ensure the primitive is
3004 * clipped against all fixed planes.
3005 */
3006 if (brw->has_negative_rhw_bug) {
3007 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3008 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3009 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3010 vec4_instruction *inst;
3011 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3012 inst->predicate = BRW_PREDICATE_NORMAL;
3013 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3014 inst->predicate = BRW_PREDICATE_NORMAL;
3015 }
3016
3017 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3018 } else if (brw->gen < 6) {
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3020 } else {
3021 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3022 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3023 dst_reg reg_w = reg;
3024 reg_w.writemask = WRITEMASK_W;
3025 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3026 }
3027 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3028 dst_reg reg_y = reg;
3029 reg_y.writemask = WRITEMASK_Y;
3030 reg_y.type = BRW_REGISTER_TYPE_D;
3031 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3032 }
3033 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3034 dst_reg reg_z = reg;
3035 reg_z.writemask = WRITEMASK_Z;
3036 reg_z.type = BRW_REGISTER_TYPE_D;
3037 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3038 }
3039 }
3040 }
3041
3042 void
3043 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3044 {
3045 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3046 *
3047 * "If a linked set of shaders forming the vertex stage contains no
3048 * static write to gl_ClipVertex or gl_ClipDistance, but the
3049 * application has requested clipping against user clip planes through
3050 * the API, then the coordinate written to gl_Position is used for
3051 * comparison against the user clip planes."
3052 *
3053 * This function is only called if the shader didn't write to
3054 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3055 * if the user wrote to it; otherwise we use gl_Position.
3056 */
3057 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3058 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3059 clip_vertex = VARYING_SLOT_POS;
3060 }
3061
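/* Each DP4 writes one channel: the signed distance of the clip vertex from the corresponding user clip plane. */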
3062 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3063 ++i) {
3064 reg.writemask = 1 << i;
3065 emit(DP4(reg,
3066 src_reg(output_reg[clip_vertex]),
3067 src_reg(this->userplane[i + offset])));
3068 }
3069 }
3070
3071 vec4_instruction *
3072 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3073 {
3074 assert (varying < VARYING_SLOT_MAX);
3075 reg.type = output_reg[varying].type;
3076 current_annotation = output_reg_annotation[varying];
3077 /* Copy the register, saturating if necessary */
3078 return emit(MOV(reg, src_reg(output_reg[varying])));
3079 }
3080
3081 void
3082 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3083 {
3084 reg.type = BRW_REGISTER_TYPE_F;
3085
3086 switch (varying) {
3087 case VARYING_SLOT_PSIZ:
3088 {
3089 /* PSIZ is always in slot 0, and is coupled with other flags. */
3090 current_annotation = "indices, point width, clip flags";
3091 emit_psiz_and_flags(reg);
3092 break;
3093 }
3094 case BRW_VARYING_SLOT_NDC:
3095 current_annotation = "NDC";
3096 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3097 break;
3098 case VARYING_SLOT_POS:
3099 current_annotation = "gl_Position";
3100 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3101 break;
3102 case VARYING_SLOT_EDGE:
3103 /* This is present when doing unfilled polygons. We're supposed to copy
3104 * the edge flag from the user-provided vertex array
3105 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3106 * of that attribute (starts as 1.0f). This is then used in clipping to
3107 * determine which edges should be drawn as wireframe.
3108 */
3109 current_annotation = "edge flag";
3110 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3111 glsl_type::float_type, WRITEMASK_XYZW))));
3112 break;
3113 case BRW_VARYING_SLOT_PAD:
3114 /* No need to write to this slot */
3115 break;
3116 case VARYING_SLOT_COL0:
3117 case VARYING_SLOT_COL1:
3118 case VARYING_SLOT_BFC0:
3119 case VARYING_SLOT_BFC1: {
3120 /* These built-in varyings are only supported in compatibility mode,
3121 * and we only support GS in core profile. So, this must be a vertex
3122 * shader.
3123 */
3124 assert(stage == MESA_SHADER_VERTEX);
3125 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3126 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3127 inst->saturate = true;
3128 break;
3129 }
3130
3131 default:
3132 emit_generic_urb_slot(reg, varying);
3133 break;
3134 }
3135 }
3136
3137 static int
3138 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3139 {
3140 if (brw->gen >= 6) {
3141 /* URB data written (does not include the message header reg) must
3142 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3143 * section 5.4.3.2.2: URB_INTERLEAVED.
3144 *
3145 * URB entries are allocated on a multiple of 1024 bits, so an
3146 * extra 128 bits written here to make the end align to 256 is
3147 * no problem.
3148 */
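/* mlen includes the message header, so the data length (mlen - 1) is even exactly when mlen is odd. */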
3149 if ((mlen % 2) != 1)
3150 mlen++;
3151 }
3152
3153 return mlen;
3154 }
3155
3156
3157 /**
3158 * Generates the VUE payload plus the necessary URB write instructions to
3159 * output it.
3160 *
3161 * The VUE layout is documented in Volume 2a.
3162 */
3163 void
3164 vec4_visitor::emit_vertex()
3165 {
3166 /* MRF 0 is reserved for the debugger, so start with message header
3167 * in MRF 1.
3168 */
3169 int base_mrf = 1;
3170 int mrf = base_mrf;
3171 /* In the process of generating our URB write message contents, we
3172 * may need to unspill a register or load from an array. Those
3173 * reads would use MRFs 14-15.
3174 */
3175 int max_usable_mrf = 13;
3176
3177 /* The following assertion verifies that max_usable_mrf causes an
3178 * even number of URB write data registers, which will meet gen6's
3179 * requirements for length alignment.
3180 */
3181 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3182
3183 /* First mrf is the g0-based message header containing URB handles and
3184 * such.
3185 */
3186 emit_urb_write_header(mrf++);
3187
3188 if (brw->gen < 6) {
3189 emit_ndc_computation();
3190 }
3191
3192 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3193 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3194 current_annotation = "user clip distances";
3195
3196 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3197 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3198
3199 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3200 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3201 }
3202
3203 /* We may need to split this up into several URB writes, so do them in a
3204 * loop.
3205 */
3206 int slot = 0;
3207 bool complete = false;
3208 do {
3209 /* URB offset is in URB row increments, and each of our MRFs is half of
3210 * one of those, since we're doing interleaved writes.
3211 */
3212 int offset = slot / 2;
3213
3214 mrf = base_mrf + 1;
3215 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3216 emit_urb_slot(dst_reg(MRF, mrf++),
3217 prog_data->vue_map.slot_to_varying[slot]);
3218
3219 /* If this was max_usable_mrf, we can't fit anything more into this
3220 * URB WRITE.
3221 */
3222 if (mrf > max_usable_mrf) {
3223 slot++;
3224 break;
3225 }
3226 }
3227
3228 complete = slot >= prog_data->vue_map.num_slots;
3229 current_annotation = "URB write";
3230 vec4_instruction *inst = emit_urb_write_opcode(complete);
3231 inst->base_mrf = base_mrf;
3232 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3233 inst->offset += offset;
3234 } while(!complete);
3235 }
3236
3237
3238 src_reg
3239 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3240 src_reg *reladdr, int reg_offset)
3241 {
3242 /* Because we store the values to scratch interleaved like our
3243 * vertex data, we need to scale the vec4 index by 2.
3244 */
3245 int message_header_scale = 2;
3246
3247 /* Pre-gen6, the message header uses byte offsets instead of vec4
3248 * (16-byte) offset units.
3249 */
3250 if (brw->gen < 6)
3251 message_header_scale *= 16;
3252
3253 if (reladdr) {
3254 src_reg index = src_reg(this, glsl_type::int_type);
3255
3256 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3257 src_reg(reg_offset)));
3258 emit_before(block, inst, MUL(dst_reg(index), index,
3259 src_reg(message_header_scale)));
3260
3261 return index;
3262 } else {
3263 return src_reg(reg_offset * message_header_scale);
3264 }
3265 }
3266
3267 src_reg
3268 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3269 src_reg *reladdr, int reg_offset)
3270 {
3271 if (reladdr) {
3272 src_reg index = src_reg(this, glsl_type::int_type);
3273
3274 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3275 src_reg(reg_offset)));
3276
3277 /* Pre-gen6, the message header uses byte offsets instead of vec4
3278 * (16-byte) offset units.
3279 */
3280 if (brw->gen < 6) {
3281 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3282 }
3283
3284 return index;
3285 } else if (brw->gen >= 8) {
3286 /* Store the offset in a GRF so we can send-from-GRF. */
3287 src_reg offset = src_reg(this, glsl_type::int_type);
3288 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3289 return offset;
3290 } else {
3291 int message_header_scale = brw->gen < 6 ? 16 : 1;
3292 return src_reg(reg_offset * message_header_scale);
3293 }
3294 }
3295
3296 /**
3297 * Emits an instruction before @inst to load the value named by @orig_src
3298 * from scratch space at @base_offset to @temp.
3299 *
3300 * @base_offset is measured in 32-byte units (the size of a register).
3301 */
3302 void
3303 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3304 dst_reg temp, src_reg orig_src,
3305 int base_offset)
3306 {
3307 int reg_offset = base_offset + orig_src.reg_offset;
3308 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3309 reg_offset);
3310
3311 emit_before(block, inst, SCRATCH_READ(temp, index));
3312 }
3313
3314 /**
3315 * Emits an instruction after @inst to store the value to be written
3316 * to @orig_dst to scratch space at @base_offset, from @temp.
3317 *
3318 * @base_offset is measured in 32-byte units (the size of a register).
3319 */
3320 void
3321 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3322 int base_offset)
3323 {
3324 int reg_offset = base_offset + inst->dst.reg_offset;
3325 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3326 reg_offset);
3327
3328 /* Create a temporary register to store *inst's result in.
3329 *
3330 * We have to be careful in MOVing from our temporary result register in
3331 * the scratch write. If we swizzle from channels of the temporary that
3332 * weren't initialized, it will confuse live interval analysis, which will
3333 * make spilling fail to make progress.
3334 */
3335 src_reg temp = src_reg(this, glsl_type::vec4_type);
3336 temp.type = inst->dst.type;
3337 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3338 int swizzles[4];
3339 for (int i = 0; i < 4; i++)
3340 if (inst->dst.writemask & (1 << i))
3341 swizzles[i] = i;
3342 else
3343 swizzles[i] = first_writemask_chan;
3344 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3345 swizzles[2], swizzles[3]);
3346
3347 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3348 inst->dst.writemask));
3349 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3350 write->predicate = inst->predicate;
3351 write->ir = inst->ir;
3352 write->annotation = inst->annotation;
3353 inst->insert_after(block, write);
3354
3355 inst->dst.file = temp.file;
3356 inst->dst.reg = temp.reg;
3357 inst->dst.reg_offset = temp.reg_offset;
3358 inst->dst.reladdr = NULL;
3359 }
3360
3361 /**
3362 * We can't generally support array access in GRF space, because a
3363 * single instruction's destination can only span 2 contiguous
3364 * registers. So, we send all GRF arrays that get variable index
3365 * access to scratch space.
3366 */
3367 void
3368 vec4_visitor::move_grf_array_access_to_scratch()
3369 {
3370 int scratch_loc[this->alloc.count];
3371 memset(scratch_loc, -1, sizeof(scratch_loc));
3372
3373 /* First, calculate the set of virtual GRFs that need to be punted
3374 * to scratch due to having any array access on them, and where in
3375 * scratch.
3376 */
3377 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3378 if (inst->dst.file == GRF && inst->dst.reladdr &&
3379 scratch_loc[inst->dst.reg] == -1) {
3380 scratch_loc[inst->dst.reg] = c->last_scratch;
3381 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3382 }
3383
3384 for (int i = 0 ; i < 3; i++) {
3385 src_reg *src = &inst->src[i];
3386
3387 if (src->file == GRF && src->reladdr &&
3388 scratch_loc[src->reg] == -1) {
3389 scratch_loc[src->reg] = c->last_scratch;
3390 c->last_scratch += this->alloc.sizes[src->reg];
3391 }
3392 }
3393 }
3394
3395 /* Now, for anything that will be accessed through scratch, rewrite
3396 * it to load/store. Note that this is a _safe list walk, because
3397 * we may generate a new scratch_write instruction after the one
3398 * we're processing.
3399 */
3400 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3401 /* Set up the annotation tracking for new generated instructions. */
3402 base_ir = inst->ir;
3403 current_annotation = inst->annotation;
3404
3405 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3406 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3407 }
3408
3409 for (int i = 0 ; i < 3; i++) {
3410 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3411 continue;
3412
3413 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3414
3415 emit_scratch_read(block, inst, temp, inst->src[i],
3416 scratch_loc[inst->src[i].reg]);
3417
3418 inst->src[i].file = temp.file;
3419 inst->src[i].reg = temp.reg;
3420 inst->src[i].reg_offset = temp.reg_offset;
3421 inst->src[i].reladdr = NULL;
3422 }
3423 }
3424 }
3425
3426 /**
3427 * Emits an instruction before @inst to load the value named by @orig_src
3428 * from the pull constant buffer (surface) at @base_offset to @temp.
3429 */
3430 void
3431 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3432 dst_reg temp, src_reg orig_src,
3433 int base_offset)
3434 {
3435 int reg_offset = base_offset + orig_src.reg_offset;
3436 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3437 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3438 reg_offset);
3439 vec4_instruction *load;
3440
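/* On Gen7+ the pull constant load is a send-from-GRF message, so the offset has to be staged in a GRF; earlier gens take it through an MRF-based message. */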
3441 if (brw->gen >= 7) {
3442 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3443 grf_offset.type = offset.type;
3444 emit_before(block, inst, MOV(grf_offset, offset));
3445
3446 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3447 temp, index, src_reg(grf_offset));
3448 load->mlen = 1;
3449 } else {
3450 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3451 temp, index, offset);
3452 load->base_mrf = 14;
3453 load->mlen = 1;
3454 }
3455 emit_before(block, inst, load);
3456 }
3457
3458 /**
3459 * Implements array access of uniforms by inserting a
3460 * PULL_CONSTANT_LOAD instruction.
3461 *
3462 * Unlike temporary GRF array access (which we don't support, due to
3463 * the difficulty of doing relative addressing on instruction
3464 * destinations), we could potentially do array access of uniforms
3465 * that were loaded in GRF space as push constants. In real-world
3466 * usage we've seen, though, the arrays being used are always larger
3467 * than we could load as push constants, so just always move all
3468 * uniform array access out to a pull constant buffer.
3469 */
3470 void
3471 vec4_visitor::move_uniform_array_access_to_pull_constants()
3472 {
3473 int pull_constant_loc[this->uniforms];
3474 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3475 bool nested_reladdr;
3476
3477 /* Walk through and find array access of uniforms. Put a copy of that
3478 * uniform in the pull constant buffer.
3479 *
3480 * Note that we don't move constant-indexed accesses to arrays. No
3481 * testing has been done of the performance impact of this choice.
3482 */
3483 do {
3484 nested_reladdr = false;
3485
3486 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3487 for (int i = 0 ; i < 3; i++) {
3488 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3489 continue;
3490
3491 int uniform = inst->src[i].reg;
3492
3493 if (inst->src[i].reladdr->reladdr)
3494 nested_reladdr = true; /* will need another pass */
3495
3496 /* If this array isn't already present in the pull constant buffer,
3497 * add it.
3498 */
3499 if (pull_constant_loc[uniform] == -1) {
3500 const gl_constant_value **values =
3501 &stage_prog_data->param[uniform * 4];
3502
3503 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3504
3505 assert(uniform < uniform_array_size);
3506 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3507 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3508 = values[j];
3509 }
3510 }
3511
3512 /* Set up the annotation tracking for new generated instructions. */
3513 base_ir = inst->ir;
3514 current_annotation = inst->annotation;
3515
3516 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3517
3518 emit_pull_constant_load(block, inst, temp, inst->src[i],
3519 pull_constant_loc[uniform]);
3520
3521 inst->src[i].file = temp.file;
3522 inst->src[i].reg = temp.reg;
3523 inst->src[i].reg_offset = temp.reg_offset;
3524 inst->src[i].reladdr = NULL;
3525 }
3526 }
3527 } while (nested_reladdr);
3528
3529 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3530 * no need to track them as larger-than-vec4 objects. This will be
3531 * relied on in cutting out unused uniform vectors from push
3532 * constants.
3533 */
3534 split_uniform_registers();
3535 }
3536
3537 void
3538 vec4_visitor::resolve_ud_negate(src_reg *reg)
3539 {
3540 if (reg->type != BRW_REGISTER_TYPE_UD ||
3541 !reg->negate)
3542 return;
3543
3544 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3545 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3546 *reg = temp;
3547 }
3548
3549 /**
3550 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3551 *
3552 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3553 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3554 */
3555 void
3556 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3557 {
3558 assert(brw->gen <= 5);
3559
3560 if (!rvalue->type->is_boolean())
3561 return;
3562
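/* Mask off the undefined upper bits, then negate so 1 becomes ~0 (and 0 stays 0). */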
3563 src_reg and_result = src_reg(this, rvalue->type);
3564 src_reg neg_result = src_reg(this, rvalue->type);
3565 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3566 emit(MOV(dst_reg(neg_result), negate(and_result)));
3567 *reg = neg_result;
3568 }
3569
3570 vec4_visitor::vec4_visitor(struct brw_context *brw,
3571 struct brw_vec4_compile *c,
3572 struct gl_program *prog,
3573 const struct brw_vue_prog_key *key,
3574 struct brw_vue_prog_data *prog_data,
3575 struct gl_shader_program *shader_prog,
3576 gl_shader_stage stage,
3577 void *mem_ctx,
3578 bool debug_flag,
3579 bool no_spills,
3580 shader_time_shader_type st_base,
3581 shader_time_shader_type st_written,
3582 shader_time_shader_type st_reset)
3583 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3584 c(c),
3585 key(key),
3586 prog_data(prog_data),
3587 sanity_param_count(0),
3588 fail_msg(NULL),
3589 first_non_payload_grf(0),
3590 need_all_constants_in_pull_buffer(false),
3591 debug_flag(debug_flag),
3592 no_spills(no_spills),
3593 st_base(st_base),
3594 st_written(st_written),
3595 st_reset(st_reset)
3596 {
3597 this->mem_ctx = mem_ctx;
3598 this->failed = false;
3599
3600 this->base_ir = NULL;
3601 this->current_annotation = NULL;
3602 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3603
3604 this->variable_ht = hash_table_ctor(0,
3605 hash_table_pointer_hash,
3606 hash_table_pointer_compare);
3607
3608 this->virtual_grf_start = NULL;
3609 this->virtual_grf_end = NULL;
3610 this->live_intervals = NULL;
3611
3612 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3613
3614 this->uniforms = 0;
3615
3616 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3617 * at least one. See setup_uniforms() in brw_vec4.cpp.
3618 */
3619 this->uniform_array_size = 1;
3620 if (prog_data) {
3621 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3622 }
3623
3624 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3625 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3626 }
3627
3628 vec4_visitor::~vec4_visitor()
3629 {
3630 hash_table_dtor(this->variable_ht);
3631 }
3632
3633
3634 void
3635 vec4_visitor::fail(const char *format, ...)
3636 {
3637 va_list va;
3638 char *msg;
3639
3640 if (failed)
3641 return;
3642
3643 failed = true;
3644
3645 va_start(va, format);
3646 msg = ralloc_vasprintf(mem_ctx, format, va);
3647 va_end(va);
3648 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3649
3650 this->fail_msg = msg;
3651
3652 if (debug_flag) {
3653 fprintf(stderr, "%s", msg);
3654 }
3655 }
3656
3657 } /* namespace brw */