i965/vec4: Optimize packSnorm4x8().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
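/* Illustrative sketch (not emitted verbatim): with a vec4 uniform source,
 *
 *    mad dst, u0.xyzw, r1, r2
 *
 * becomes
 *
 *    mov tmp, u0.xyzw
 *    mad dst, tmp, r1, r2
 *
 * whereas a single-value swizzle such as u0.xxxx is consumed directly.
 */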
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
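/* Roughly: on gen6 every math operand is copied to a temporary GRF; on gen7
 * only immediates are copied (e.g. the 2.0f in pow(x, 2.0) gets a MOV to a
 * GRF first); on gen8+ and pre-gen6 the operand is passed through unchanged.
 */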
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
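/* Worked example (assuming the standard IEEE half-float encodings):
 * packHalf2x16(vec2(1.0, -2.0)) produces low word 0x3C00 (1.0) and high word
 * 0xC000 (-2.0), so the SHL/OR sequence above yields 0xC0003C00.
 */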
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
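/* Worked example: unpackHalf2x16(0xC0003C00u) -> the AND/SHR split the dword
 * into 0x3C00 (x) and 0xC000 (y), and F16TO32 yields vec2(1.0, -2.0).
 */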
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
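/* The immediate above is a packed vector-float (VF): each byte is a
 * restricted 8-bit float (1 sign, 3 exponent bits biased by 3, 4 mantissa
 * bits), so 0x00, 0x60, 0x70, 0x78 decode to 0.0, 8.0, 16.0 and 24.0. The
 * type-converting MOV turns those into the UD shift counts <0, 8, 16, 24>.
 */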
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
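/* Worked example: packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)) saturates to
 * (0, 0.5, 1, 1), scales to (0, 127.5, 255, 255), RNDE rounds 127.5 to the
 * even value 128, and PACK_BYTES yields 0xFFFF8000 (x in the low byte).
 */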
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
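/* Worked example: packSnorm4x8(vec4(1.0, -1.0, 0.5, -0.25)) clamps to
 * [-1, 1], scales to (127, -127, 63.5, -31.75), RNDE gives (127, -127, 64,
 * -32), and the low bytes pack to 0xE040817F (x in the low byte).
 */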
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
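/* For example, type_size() returns 1 for float or vec4, 3 for mat3 (one vec4
 * slot per column), 4 for float[4], 2 for struct { vec3; float; }, and 0 for
 * a sampler.
 */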
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 inst = emit(XOR(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_or:
827 inst = emit(OR(dst_null_d(), op[0], op[1]));
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_unop_f2b:
837 if (brw->gen >= 6) {
838 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
839 } else {
840 inst = emit(MOV(dst_null_f(), op[0]));
841 inst->conditional_mod = BRW_CONDITIONAL_NZ;
842 }
843 break;
844
845 case ir_unop_i2b:
846 if (brw->gen >= 6) {
847 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 } else {
849 inst = emit(MOV(dst_null_d(), op[0]));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 break;
853
854 case ir_binop_all_equal:
855 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
856 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
857 break;
858
859 case ir_binop_any_nequal:
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
861 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
862 break;
863
864 case ir_unop_any:
865 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
867 break;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(CMP(dst_null_d(), op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 break;
878
879 case ir_triop_csel: {
880 /* Expand the boolean condition into the flag register. */
881 inst = emit(MOV(dst_null_d(), op[0]));
882 inst->conditional_mod = BRW_CONDITIONAL_NZ;
883
884 /* Select which boolean to return. */
885 dst_reg temp(this, expr->operands[1]->type);
886 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
887 inst->predicate = BRW_PREDICATE_NORMAL;
888
889 /* Expand the result to a condition code. */
890 inst = emit(MOV(dst_null_d(), src_reg(temp)));
891 inst->conditional_mod = BRW_CONDITIONAL_NZ;
892 break;
893 }
894
895 default:
896 unreachable("not reached");
897 }
898 return;
899 }
900
901 ir->accept(this);
902
903 resolve_ud_negate(&this->result);
904
905 if (brw->gen >= 6) {
906 vec4_instruction *inst = emit(AND(dst_null_d(),
907 this->result, src_reg(1)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 } else {
910 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 }
913 }
914
915 /**
916 * Emit a gen6 IF statement with the comparison folded into the IF
917 * instruction.
918 */
919 void
920 vec4_visitor::emit_if_gen6(ir_if *ir)
921 {
922 ir_expression *expr = ir->condition->as_expression();
923
924 if (expr && expr->operation != ir_binop_ubo_load) {
925 src_reg op[3];
926 dst_reg temp;
927
928 assert(expr->get_num_operands() <= 3);
929 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
930 expr->operands[i]->accept(this);
931 op[i] = this->result;
932 }
933
934 switch (expr->operation) {
935 case ir_unop_logic_not:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
937 return;
938
939 case ir_binop_logic_xor:
940 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
941 return;
942
943 case ir_binop_logic_or:
944 temp = dst_reg(this, glsl_type::bool_type);
945 emit(OR(temp, op[0], op[1]));
946 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_binop_logic_and:
950 temp = dst_reg(this, glsl_type::bool_type);
951 emit(AND(temp, op[0], op[1]));
952 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_unop_f2b:
956 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_unop_i2b:
960 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_greater:
964 case ir_binop_gequal:
965 case ir_binop_less:
966 case ir_binop_lequal:
967 case ir_binop_equal:
968 case ir_binop_nequal:
969 emit(IF(op[0], op[1],
970 brw_conditional_for_comparison(expr->operation)));
971 return;
972
973 case ir_binop_all_equal:
974 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
975 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
976 return;
977
978 case ir_binop_any_nequal:
979 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
980 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
981 return;
982
983 case ir_unop_any:
984 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
985 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
986 return;
987
988 case ir_triop_csel: {
989 /* Expand the boolean condition into the flag register. */
990 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
991 inst->conditional_mod = BRW_CONDITIONAL_NZ;
992
993 /* Select which boolean to return. */
994 dst_reg temp(this, expr->operands[1]->type);
995 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
996 inst->predicate = BRW_PREDICATE_NORMAL;
997
998 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
999 return;
1000 }
1001
1002 default:
1003 unreachable("not reached");
1004 }
1005 return;
1006 }
1007
1008 ir->condition->accept(this);
1009
1010 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1011 }
1012
1013 void
1014 vec4_visitor::visit(ir_variable *ir)
1015 {
1016 dst_reg *reg = NULL;
1017
1018 if (variable_storage(ir))
1019 return;
1020
1021 switch (ir->data.mode) {
1022 case ir_var_shader_in:
1023 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1024 break;
1025
1026 case ir_var_shader_out:
1027 reg = new(mem_ctx) dst_reg(this, ir->type);
1028
1029 for (int i = 0; i < type_size(ir->type); i++) {
1030 output_reg[ir->data.location + i] = *reg;
1031 output_reg[ir->data.location + i].reg_offset = i;
1032 output_reg[ir->data.location + i].type =
1033 brw_type_for_base_type(ir->type->get_scalar_type());
1034 output_reg_annotation[ir->data.location + i] = ir->name;
1035 }
1036 break;
1037
1038 case ir_var_auto:
1039 case ir_var_temporary:
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041 break;
1042
1043 case ir_var_uniform:
1044 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1045
1046 /* Thanks to the lower_ubo_reference pass, we will see only
1047 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1048 * variables, so no need for them to be in variable_ht.
1049 *
1050 * Some uniforms, such as samplers and atomic counters, have no actual
1051 * storage, so we should ignore them.
1052 */
1053 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1054 return;
1055
1056 /* Track how big the whole uniform variable is, in case we need to put a
1057 * copy of its data into pull constants for array access.
1058 */
1059 assert(this->uniforms < uniform_array_size);
1060 this->uniform_size[this->uniforms] = type_size(ir->type);
1061
1062 if (!strncmp(ir->name, "gl_", 3)) {
1063 setup_builtin_uniform_values(ir);
1064 } else {
1065 setup_uniform_values(ir);
1066 }
1067 break;
1068
1069 case ir_var_system_value:
1070 reg = make_reg_for_system_value(ir);
1071 break;
1072
1073 default:
1074 unreachable("not reached");
1075 }
1076
1077 reg->type = brw_type_for_base_type(ir->type);
1078 hash_table_insert(this->variable_ht, reg, ir);
1079 }
1080
1081 void
1082 vec4_visitor::visit(ir_loop *ir)
1083 {
1084 /* We don't want debugging output to print the whole body of the
1085 * loop as the annotation.
1086 */
1087 this->base_ir = NULL;
1088
1089 emit(BRW_OPCODE_DO);
1090
1091 visit_instructions(&ir->body_instructions);
1092
1093 emit(BRW_OPCODE_WHILE);
1094 }
1095
1096 void
1097 vec4_visitor::visit(ir_loop_jump *ir)
1098 {
1099 switch (ir->mode) {
1100 case ir_loop_jump::jump_break:
1101 emit(BRW_OPCODE_BREAK);
1102 break;
1103 case ir_loop_jump::jump_continue:
1104 emit(BRW_OPCODE_CONTINUE);
1105 break;
1106 }
1107 }
1108
1109
1110 void
1111 vec4_visitor::visit(ir_function_signature *)
1112 {
1113 unreachable("not reached");
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_function *ir)
1118 {
1119 /* Ignore function bodies other than main() -- we shouldn't see calls to
1120 * them since they should all be inlined.
1121 */
1122 if (strcmp(ir->name, "main") == 0) {
1123 const ir_function_signature *sig;
1124 exec_list empty;
1125
1126 sig = ir->matching_signature(NULL, &empty, false);
1127
1128 assert(sig);
1129
1130 visit_instructions(&sig->body);
1131 }
1132 }
1133
1134 bool
1135 vec4_visitor::try_emit_mad(ir_expression *ir)
1136 {
1137 /* 3-src instructions were introduced in gen6. */
1138 if (brw->gen < 6)
1139 return false;
1140
1141 /* MAD can only handle floating-point data. */
1142 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1143 return false;
1144
1145 ir_rvalue *nonmul = ir->operands[1];
1146 ir_expression *mul = ir->operands[0]->as_expression();
1147
1148 if (!mul || mul->operation != ir_binop_mul) {
1149 nonmul = ir->operands[0];
1150 mul = ir->operands[1]->as_expression();
1151
1152 if (!mul || mul->operation != ir_binop_mul)
1153 return false;
1154 }
1155
1156 nonmul->accept(this);
1157 src_reg src0 = fix_3src_operand(this->result);
1158
1159 mul->operands[0]->accept(this);
1160 src_reg src1 = fix_3src_operand(this->result);
1161
1162 mul->operands[1]->accept(this);
1163 src_reg src2 = fix_3src_operand(this->result);
1164
1165 this->result = src_reg(this, ir->type);
1166 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1167
1168 return true;
1169 }
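/* Sketch of the transform: for an expression like a + b * c, the add's
 * non-multiply operand becomes src0 and the multiply's operands become
 * src1/src2, so we emit MAD dst, a, b, c (the hardware MAD computes
 * src1 * src2 + src0).
 */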
1170
1171 bool
1172 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1173 {
1174 /* This optimization relies on CMP setting the destination to 0 when
1175 * false. Early hardware only sets the least significant bit, and
1176 * leaves the other bits undefined. So we can't use it.
1177 */
1178 if (brw->gen < 6)
1179 return false;
1180
1181 ir_expression *const cmp = ir->operands[0]->as_expression();
1182
1183 if (cmp == NULL)
1184 return false;
1185
1186 switch (cmp->operation) {
1187 case ir_binop_less:
1188 case ir_binop_greater:
1189 case ir_binop_lequal:
1190 case ir_binop_gequal:
1191 case ir_binop_equal:
1192 case ir_binop_nequal:
1193 break;
1194
1195 default:
1196 return false;
1197 }
1198
1199 cmp->operands[0]->accept(this);
1200 const src_reg cmp_src0 = this->result;
1201
1202 cmp->operands[1]->accept(this);
1203 const src_reg cmp_src1 = this->result;
1204
1205 this->result = src_reg(this, ir->type);
1206
1207 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1208 brw_conditional_for_comparison(cmp->operation)));
1209
1210 /* If the comparison is false, this->result will just happen to be zero.
1211 */
1212 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1213 this->result, src_reg(1.0f));
1214 inst->predicate = BRW_PREDICATE_NORMAL;
1215 inst->predicate_inverse = true;
1216
1217 return true;
1218 }
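/* Sketch: for float(a < b) this emits roughly
 *
 *    cmp.l.f0 result, a, b
 *    (-f0) sel result, result, 1.0f
 *
 * so channels where the comparison failed keep the 0.0 written by CMP and
 * channels where it passed get 1.0f from the inverted-predicate SEL.
 */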
1219
1220 void
1221 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1222 src_reg src0, src_reg src1)
1223 {
1224 vec4_instruction *inst;
1225
1226 if (brw->gen >= 6) {
1227 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1228 inst->conditional_mod = conditionalmod;
1229 } else {
1230 emit(CMP(dst, src0, src1, conditionalmod));
1231
1232 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1233 inst->predicate = BRW_PREDICATE_NORMAL;
1234 }
1235 }
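/* E.g. min(a, b) becomes a conditional-mod "sel.l dst, a, b" on gen6+, while
 * gen4/5 emit CMP.L followed by a predicated SEL that picks a where the
 * comparison passed.
 */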
1236
1237 void
1238 vec4_visitor::emit_lrp(const dst_reg &dst,
1239 const src_reg &x, const src_reg &y, const src_reg &a)
1240 {
1241 if (brw->gen >= 6) {
1242 /* Note that the instruction's argument order is reversed from GLSL
1243 * and the IR.
1244 */
1245 emit(LRP(dst,
1246 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1247 } else {
1248 /* Earlier generations don't support three source operations, so we
1249 * need to emit x*(1-a) + y*a.
1250 */
1251 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1252 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1253 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1254 y_times_a.writemask = dst.writemask;
1255 one_minus_a.writemask = dst.writemask;
1256 x_times_one_minus_a.writemask = dst.writemask;
1257
1258 emit(MUL(y_times_a, y, a));
1259 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1260 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1261 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1262 }
1263 }
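/* On the pre-gen6 path, e.g. mix(2.0, 6.0, 0.25) expands to
 * 6.0 * 0.25 + 2.0 * (1 - 0.25) = 1.5 + 1.5 = 3.0, matching what the single
 * LRP produces on gen6+ (note LRP takes its arguments in the order a, y, x).
 */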
1264
1265 void
1266 vec4_visitor::visit(ir_expression *ir)
1267 {
1268 unsigned int operand;
1269 src_reg op[Elements(ir->operands)];
1270 vec4_instruction *inst;
1271
1272 if (ir->operation == ir_binop_add) {
1273 if (try_emit_mad(ir))
1274 return;
1275 }
1276
1277 if (ir->operation == ir_unop_b2f) {
1278 if (try_emit_b2f_of_compare(ir))
1279 return;
1280 }
1281
1282 /* Storage for our result. Ideally for an assignment we'd be using
1283 * the actual storage for the result here, instead.
1284 */
1285 dst_reg result_dst(this, ir->type);
1286 src_reg result_src(result_dst);
1287
1288 if (ir->operation == ir_triop_csel) {
1289 ir->operands[1]->accept(this);
1290 op[1] = this->result;
1291 ir->operands[2]->accept(this);
1292 op[2] = this->result;
1293
1294 enum brw_predicate predicate;
1295 emit_bool_to_cond_code(ir->operands[0], &predicate);
1296 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1297 inst->predicate = predicate;
1298 this->result = result_src;
1299 return;
1300 }
1301
1302 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1303 this->result.file = BAD_FILE;
1304 ir->operands[operand]->accept(this);
1305 if (this->result.file == BAD_FILE) {
1306 fprintf(stderr, "Failed to get tree for expression operand:\n");
1307 ir->operands[operand]->fprint(stderr);
1308 exit(1);
1309 }
1310 op[operand] = this->result;
1311
1312 /* Matrix expression operands should have been broken down to vector
1313 * operations already.
1314 */
1315 assert(!ir->operands[operand]->type->is_matrix());
1316 }
1317
1318 /* If nothing special happens, this is the result. */
1319 this->result = result_src;
1320
1321 switch (ir->operation) {
1322 case ir_unop_logic_not:
1323 if (ctx->Const.UniformBooleanTrue != 1) {
1324 emit(NOT(result_dst, op[0]));
1325 } else {
1326 emit(XOR(result_dst, op[0], src_reg(1u)));
1327 }
1328 break;
1329 case ir_unop_neg:
1330 op[0].negate = !op[0].negate;
1331 emit(MOV(result_dst, op[0]));
1332 break;
1333 case ir_unop_abs:
1334 op[0].abs = true;
1335 op[0].negate = false;
1336 emit(MOV(result_dst, op[0]));
1337 break;
1338
1339 case ir_unop_sign:
1340 if (ir->type->is_float()) {
1341 /* AND(val, 0x80000000) gives the sign bit.
1342 *
1343 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1344 * zero.
1345 */
1346 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1347
1348 op[0].type = BRW_REGISTER_TYPE_UD;
1349 result_dst.type = BRW_REGISTER_TYPE_UD;
1350 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1351
1352 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 this->result.type = BRW_REGISTER_TYPE_F;
1356 } else {
1357 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1358 * -> non-negative val generates 0x00000000.
1359 * Predicated OR sets 1 if val is positive.
1360 */
1361 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1362
1363 emit(ASR(result_dst, op[0], src_reg(31)));
1364
1365 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1366 inst->predicate = BRW_PREDICATE_NORMAL;
1367 }
1368 break;
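/* E.g. sign(-3.5f): the AND keeps the sign bit 0x80000000, and since the CMP
 * found a nonzero input the predicated OR merges in 0x3f800000, giving
 * 0xbf800000 == -1.0f; a 0.0 input leaves the result 0x00000000 == 0.0f.
 */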
1369
1370 case ir_unop_rcp:
1371 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1372 break;
1373
1374 case ir_unop_exp2:
1375 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1376 break;
1377 case ir_unop_log2:
1378 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1379 break;
1380 case ir_unop_exp:
1381 case ir_unop_log:
1382 unreachable("not reached: should be handled by ir_explog_to_explog2");
1383 case ir_unop_sin:
1384 case ir_unop_sin_reduced:
1385 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1386 break;
1387 case ir_unop_cos:
1388 case ir_unop_cos_reduced:
1389 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1390 break;
1391
1392 case ir_unop_dFdx:
1393 case ir_unop_dFdx_coarse:
1394 case ir_unop_dFdx_fine:
1395 case ir_unop_dFdy:
1396 case ir_unop_dFdy_coarse:
1397 case ir_unop_dFdy_fine:
1398 unreachable("derivatives not valid in vertex shader");
1399
1400 case ir_unop_bitfield_reverse:
1401 emit(BFREV(result_dst, op[0]));
1402 break;
1403 case ir_unop_bit_count:
1404 emit(CBIT(result_dst, op[0]));
1405 break;
1406 case ir_unop_find_msb: {
1407 src_reg temp = src_reg(this, glsl_type::uint_type);
1408
1409 inst = emit(FBH(dst_reg(temp), op[0]));
1410 inst->dst.writemask = WRITEMASK_XYZW;
1411
1412 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1413 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1414 * subtract the result from 31 to convert the MSB count into an LSB count.
1415 */
1416
1417 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1418 temp.swizzle = BRW_SWIZZLE_NOOP;
1419 emit(MOV(result_dst, temp));
1420
1421 src_reg src_tmp = src_reg(result_dst);
1422 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1423
1424 src_tmp.negate = true;
1425 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1426 inst->predicate = BRW_PREDICATE_NORMAL;
1427 break;
1428 }
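/* Worked example: findMSB(0x100) -> FBH returns 23 (counting from the MSB
 * side), and the predicated 31 - 23 gives the GLSL answer 8; an input of 0
 * leaves FBH's 0xFFFFFFFF, which the MOV converts to -1 and the skipped ADD
 * preserves.
 */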
1429 case ir_unop_find_lsb:
1430 emit(FBL(result_dst, op[0]));
1431 break;
1432 case ir_unop_saturate:
1433 inst = emit(MOV(result_dst, op[0]));
1434 inst->saturate = true;
1435 break;
1436
1437 case ir_unop_noise:
1438 unreachable("not reached: should be handled by lower_noise");
1439
1440 case ir_binop_add:
1441 emit(ADD(result_dst, op[0], op[1]));
1442 break;
1443 case ir_binop_sub:
1444 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1445
1446 case ir_binop_mul:
1447 if (brw->gen < 8 && ir->type->is_integer()) {
1448 /* For integer multiplication, the MUL uses the low 16 bits of one of
1449 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1450 * accumulates in the contribution of the upper 16 bits of that
1451 * operand. If we can determine that one of the args is in the low
1452 * 16 bits, though, we can just emit a single MUL.
1453 */
1454 if (ir->operands[0]->is_uint16_constant()) {
1455 if (brw->gen < 7)
1456 emit(MUL(result_dst, op[0], op[1]));
1457 else
1458 emit(MUL(result_dst, op[1], op[0]));
1459 } else if (ir->operands[1]->is_uint16_constant()) {
1460 if (brw->gen < 7)
1461 emit(MUL(result_dst, op[1], op[0]));
1462 else
1463 emit(MUL(result_dst, op[0], op[1]));
1464 } else {
1465 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1466
1467 emit(MUL(acc, op[0], op[1]));
1468 emit(MACH(dst_null_d(), op[0], op[1]));
1469 emit(MOV(result_dst, src_reg(acc)));
1470 }
1471 } else {
1472 emit(MUL(result_dst, op[0], op[1]));
1473 }
1474 break;
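/* E.g. a general 32-bit integer c = a * b above becomes MUL acc, a, b;
 * MACH null, a, b; MOV c, acc, while a multiply by a small constant such as
 * 3 collapses to a single MUL with the constant in the 16-bit operand slot.
 */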
1475 case ir_binop_imul_high: {
1476 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1477
1478 emit(MUL(acc, op[0], op[1]));
1479 emit(MACH(result_dst, op[0], op[1]));
1480 break;
1481 }
1482 case ir_binop_div:
1483 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1484 assert(ir->type->is_integer());
1485 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1486 break;
1487 case ir_binop_carry: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1489
1490 emit(ADDC(dst_null_ud(), op[0], op[1]));
1491 emit(MOV(result_dst, src_reg(acc)));
1492 break;
1493 }
1494 case ir_binop_borrow: {
1495 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1496
1497 emit(SUBB(dst_null_ud(), op[0], op[1]));
1498 emit(MOV(result_dst, src_reg(acc)));
1499 break;
1500 }
1501 case ir_binop_mod:
1502 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1503 assert(ir->type->is_integer());
1504 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1505 break;
1506
1507 case ir_binop_less:
1508 case ir_binop_greater:
1509 case ir_binop_lequal:
1510 case ir_binop_gequal:
1511 case ir_binop_equal:
1512 case ir_binop_nequal: {
1513 emit(CMP(result_dst, op[0], op[1],
1514 brw_conditional_for_comparison(ir->operation)));
1515 if (ctx->Const.UniformBooleanTrue == 1) {
1516 emit(AND(result_dst, result_src, src_reg(1u)));
1517 }
1518 break;
1519 }
1520
1521 case ir_binop_all_equal:
1522 /* "==" operator producing a scalar boolean. */
1523 if (ir->operands[0]->type->is_vector() ||
1524 ir->operands[1]->type->is_vector()) {
1525 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1526 emit(MOV(result_dst, src_reg(0)));
1527 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1528 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1529 } else {
1530 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1531 if (ctx->Const.UniformBooleanTrue == 1) {
1532 emit(AND(result_dst, result_src, src_reg(1u)));
1533 }
1534 }
1535 break;
1536 case ir_binop_any_nequal:
1537 /* "!=" operator producing a scalar boolean. */
1538 if (ir->operands[0]->type->is_vector() ||
1539 ir->operands[1]->type->is_vector()) {
1540 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1541
1542 emit(MOV(result_dst, src_reg(0)));
1543 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1544 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1545 } else {
1546 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1547 if (ctx->Const.UniformBooleanTrue == 1) {
1548 emit(AND(result_dst, result_src, src_reg(1u)));
1549 }
1550 }
1551 break;
1552
1553 case ir_unop_any:
1554 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1555 emit(MOV(result_dst, src_reg(0)));
1556
1557 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1558 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1559 break;
1560
1561 case ir_binop_logic_xor:
1562 emit(XOR(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_logic_or:
1566 emit(OR(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_logic_and:
1570 emit(AND(result_dst, op[0], op[1]));
1571 break;
1572
1573 case ir_binop_dot:
1574 assert(ir->operands[0]->type->is_vector());
1575 assert(ir->operands[0]->type == ir->operands[1]->type);
1576 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1577 break;
1578
1579 case ir_unop_sqrt:
1580 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1581 break;
1582 case ir_unop_rsq:
1583 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1584 break;
1585
1586 case ir_unop_bitcast_i2f:
1587 case ir_unop_bitcast_u2f:
1588 this->result = op[0];
1589 this->result.type = BRW_REGISTER_TYPE_F;
1590 break;
1591
1592 case ir_unop_bitcast_f2i:
1593 this->result = op[0];
1594 this->result.type = BRW_REGISTER_TYPE_D;
1595 break;
1596
1597 case ir_unop_bitcast_f2u:
1598 this->result = op[0];
1599 this->result.type = BRW_REGISTER_TYPE_UD;
1600 break;
1601
1602 case ir_unop_i2f:
1603 case ir_unop_i2u:
1604 case ir_unop_u2i:
1605 case ir_unop_u2f:
1606 case ir_unop_f2i:
1607 case ir_unop_f2u:
1608 emit(MOV(result_dst, op[0]));
1609 break;
1610 case ir_unop_b2i:
1611 if (ctx->Const.UniformBooleanTrue != 1) {
1612 emit(AND(result_dst, op[0], src_reg(1u)));
1613 } else {
1614 emit(MOV(result_dst, op[0]));
1615 }
1616 break;
1617 case ir_unop_b2f:
1618 if (ctx->Const.UniformBooleanTrue != 1) {
1619 op[0].type = BRW_REGISTER_TYPE_UD;
1620 result_dst.type = BRW_REGISTER_TYPE_UD;
1621 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1622 result_dst.type = BRW_REGISTER_TYPE_F;
1623 } else {
1624 emit(MOV(result_dst, op[0]));
1625 }
1626 break;
1627 case ir_unop_f2b:
1628 case ir_unop_i2b:
1629 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1630 if (ctx->Const.UniformBooleanTrue == 1) {
1631 emit(AND(result_dst, result_src, src_reg(1u)));
1632 }
1633 break;
1634
1635 case ir_unop_trunc:
1636 emit(RNDZ(result_dst, op[0]));
1637 break;
1638 case ir_unop_ceil:
1639 op[0].negate = !op[0].negate;
1640 inst = emit(RNDD(result_dst, op[0]));
1641 this->result.negate = true;
1642 break;
1643 case ir_unop_floor:
1644 inst = emit(RNDD(result_dst, op[0]));
1645 break;
1646 case ir_unop_fract:
1647 inst = emit(FRC(result_dst, op[0]));
1648 break;
1649 case ir_unop_round_even:
1650 emit(RNDE(result_dst, op[0]));
1651 break;
1652
1653 case ir_binop_min:
1654 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1655 break;
1656 case ir_binop_max:
1657 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1658 break;
1659
1660 case ir_binop_pow:
1661 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1662 break;
1663
1664 case ir_unop_bit_not:
1665 inst = emit(NOT(result_dst, op[0]));
1666 break;
1667 case ir_binop_bit_and:
1668 inst = emit(AND(result_dst, op[0], op[1]));
1669 break;
1670 case ir_binop_bit_xor:
1671 inst = emit(XOR(result_dst, op[0], op[1]));
1672 break;
1673 case ir_binop_bit_or:
1674 inst = emit(OR(result_dst, op[0], op[1]));
1675 break;
1676
1677 case ir_binop_lshift:
1678 inst = emit(SHL(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_rshift:
1682 if (ir->type->base_type == GLSL_TYPE_INT)
1683 inst = emit(ASR(result_dst, op[0], op[1]));
1684 else
1685 inst = emit(SHR(result_dst, op[0], op[1]));
1686 break;
1687
1688 case ir_binop_bfm:
1689 emit(BFI1(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_ubo_load: {
1693 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1694 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1695 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1696 src_reg offset;
1697
1698 /* Now, load the vector from that offset. */
1699 assert(ir->type->is_vector() || ir->type->is_scalar());
1700
1701 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1702 packed_consts.type = result.type;
1703 src_reg surf_index;
1704
1705 if (const_uniform_block) {
1706 /* The block index is a constant, so just emit the binding table entry
1707 * as an immediate.
1708 */
1709 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1710 const_uniform_block->value.u[0]);
1711 } else {
1712 /* The block index is not a constant. Evaluate the index expression
1713 * per-channel and add the base UBO index; the generator will select
1714 * a value from any live channel.
1715 */
1716 surf_index = src_reg(this, glsl_type::uint_type);
1717 emit(ADD(dst_reg(surf_index), op[0],
1718 src_reg(prog_data->base.binding_table.ubo_start)));
1719
1720 /* Assume this may touch any UBO. It would be nice to provide
1721 * a tighter bound, but the array information is already lowered away.
1722 */
1723 brw_mark_surface_used(&prog_data->base,
1724 prog_data->base.binding_table.ubo_start +
1725 shader_prog->NumUniformBlocks - 1);
1726 }
1727
1728 if (const_offset_ir) {
1729 if (brw->gen >= 8) {
1730 /* Store the offset in a GRF so we can send-from-GRF. */
1731 offset = src_reg(this, glsl_type::int_type);
1732 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1733 } else {
1734 /* Immediates are fine on older generations since they'll be moved
1735 * to a (potentially fake) MRF at the generator level.
1736 */
1737 offset = src_reg(const_offset / 16);
1738 }
1739 } else {
1740 offset = src_reg(this, glsl_type::uint_type);
1741 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1742 }
1743
1744 if (brw->gen >= 7) {
1745 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1746 grf_offset.type = offset.type;
1747
1748 emit(MOV(grf_offset, offset));
1749
1750 emit(new(mem_ctx) vec4_instruction(this,
1751 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 src_reg(grf_offset)));
1755 } else {
1756 vec4_instruction *pull =
1757 emit(new(mem_ctx) vec4_instruction(this,
1758 VS_OPCODE_PULL_CONSTANT_LOAD,
1759 dst_reg(packed_consts),
1760 surf_index,
1761 offset));
1762 pull->base_mrf = 14;
1763 pull->mlen = 1;
1764 }
1765
1766 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1767 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1768 const_offset % 16 / 4,
1769 const_offset % 16 / 4,
1770 const_offset % 16 / 4);
1771
1772 /* UBO bools are any nonzero int. We need to convert them to use the
1773 * value of true stored in ctx->Const.UniformBooleanTrue.
1774 */
1775 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1776 emit(CMP(result_dst, packed_consts, src_reg(0u),
1777 BRW_CONDITIONAL_NZ));
1778 if (ctx->Const.UniformBooleanTrue == 1) {
1779 emit(AND(result_dst, result, src_reg(1u)));
1780 }
1781 } else {
1782 emit(MOV(result_dst, packed_consts));
1783 }
1784 break;
1785 }
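/* As a sketch of the swizzle math above: a float UBO load at byte offset 20
 * fetches the 16-byte block starting at offset 16, and 20 % 16 / 4 == 1
 * selects component .y of packed_consts for every channel of the result.
 */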
1786
1787 case ir_binop_vector_extract:
1788 unreachable("should have been lowered by vec_index_to_cond_assign");
1789
1790 case ir_triop_fma:
1791 op[0] = fix_3src_operand(op[0]);
1792 op[1] = fix_3src_operand(op[1]);
1793 op[2] = fix_3src_operand(op[2]);
1794 /* Note that the instruction's argument order is reversed from GLSL
1795 * and the IR.
1796 */
1797 emit(MAD(result_dst, op[2], op[1], op[0]));
1798 break;
1799
1800 case ir_triop_lrp:
1801 emit_lrp(result_dst, op[0], op[1], op[2]);
1802 break;
1803
1804 case ir_triop_csel:
1805 unreachable("already handled above");
1806 break;
1807
1808 case ir_triop_bfi:
1809 op[0] = fix_3src_operand(op[0]);
1810 op[1] = fix_3src_operand(op[1]);
1811 op[2] = fix_3src_operand(op[2]);
1812 emit(BFI2(result_dst, op[0], op[1], op[2]));
1813 break;
1814
1815 case ir_triop_bitfield_extract:
1816 op[0] = fix_3src_operand(op[0]);
1817 op[1] = fix_3src_operand(op[1]);
1818 op[2] = fix_3src_operand(op[2]);
1819 /* Note that the instruction's argument order is reversed from GLSL
1820 * and the IR.
1821 */
1822 emit(BFE(result_dst, op[2], op[1], op[0]));
1823 break;
1824
1825 case ir_triop_vector_insert:
1826 unreachable("should have been lowered by lower_vector_insert");
1827
1828 case ir_quadop_bitfield_insert:
1829 unreachable("not reached: should be handled by "
1830 "bitfield_insert_to_bfm_bfi\n");
1831
1832 case ir_quadop_vector:
1833 unreachable("not reached: should be handled by lower_quadop_vector");
1834
1835 case ir_unop_pack_half_2x16:
1836 emit_pack_half_2x16(result_dst, op[0]);
1837 break;
1838 case ir_unop_unpack_half_2x16:
1839 emit_unpack_half_2x16(result_dst, op[0]);
1840 break;
1841 case ir_unop_unpack_unorm_4x8:
1842 emit_unpack_unorm_4x8(result_dst, op[0]);
1843 break;
1844 case ir_unop_unpack_snorm_4x8:
1845 emit_unpack_snorm_4x8(result_dst, op[0]);
1846 break;
1847 case ir_unop_pack_unorm_4x8:
1848 emit_pack_unorm_4x8(result_dst, op[0]);
1849 break;
1850 case ir_unop_pack_snorm_4x8:
1851 emit_pack_snorm_4x8(result_dst, op[0]);
1852 break;
1853 case ir_unop_pack_snorm_2x16:
1854 case ir_unop_pack_unorm_2x16:
1855 case ir_unop_unpack_snorm_2x16:
1856 case ir_unop_unpack_unorm_2x16:
1857 unreachable("not reached: should be handled by lower_packing_builtins");
1858 case ir_unop_unpack_half_2x16_split_x:
1859 case ir_unop_unpack_half_2x16_split_y:
1860 case ir_binop_pack_half_2x16_split:
1861 case ir_unop_interpolate_at_centroid:
1862 case ir_binop_interpolate_at_sample:
1863 case ir_binop_interpolate_at_offset:
1864 unreachable("not reached: should not occur in vertex shader");
1865 case ir_binop_ldexp:
1866 unreachable("not reached: should be handled by ldexp_to_arith()");
1867 }
1868 }
1869
1870
1871 void
1872 vec4_visitor::visit(ir_swizzle *ir)
1873 {
1874 src_reg src;
1875 int i = 0;
1876 int swizzle[4];
1877
1878 /* Note that this is only swizzles in expressions, not those on the left
1879 * hand side of an assignment, which do write masking. See ir_assignment
1880 * for that.
1881 */
1882
1883 ir->val->accept(this);
1884 src = this->result;
1885 assert(src.file != BAD_FILE);
1886
1887 for (i = 0; i < ir->type->vector_elements; i++) {
1888 switch (i) {
1889 case 0:
1890 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1891 break;
1892 case 1:
1893 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1894 break;
1895 case 2:
1896 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1897 break;
1898 case 3:
1899 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1900 break;
1901 }
1902 }
1903 for (; i < 4; i++) {
1904 /* Replicate the last channel out. */
1905 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1906 }
1907
1908 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1909
1910 this->result = src;
1911 }
1912
1913 void
1914 vec4_visitor::visit(ir_dereference_variable *ir)
1915 {
1916 const struct glsl_type *type = ir->type;
1917 dst_reg *reg = variable_storage(ir->var);
1918
1919 if (!reg) {
1920 fail("Failed to find variable storage for %s\n", ir->var->name);
1921 this->result = src_reg(brw_null_reg());
1922 return;
1923 }
1924
1925 this->result = src_reg(*reg);
1926
1927 /* System values get their swizzle from the dst_reg writemask */
1928 if (ir->var->data.mode == ir_var_system_value)
1929 return;
1930
1931 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1932 this->result.swizzle = swizzle_for_size(type->vector_elements);
1933 }
1934
1935
1936 int
1937 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1938 {
1939 /* Under normal circumstances array elements are stored consecutively, so
1940 * the stride is equal to the size of the array element.
1941 */
1942 return type_size(ir->type);
1943 }
1944
1945
1946 void
1947 vec4_visitor::visit(ir_dereference_array *ir)
1948 {
1949 ir_constant *constant_index;
1950 src_reg src;
1951 int array_stride = compute_array_stride(ir);
1952
1953 constant_index = ir->array_index->constant_expression_value();
1954
1955 ir->array->accept(this);
1956 src = this->result;
1957
1958 if (constant_index) {
1959 src.reg_offset += constant_index->value.i[0] * array_stride;
1960 } else {
1961 /* Variable index array dereference. It eats the "vec4" of the
1962 * base of the array and an index that offsets the Mesa register
1963 * index.
1964 */
1965 ir->array_index->accept(this);
1966
1967 src_reg index_reg;
1968
1969 if (array_stride == 1) {
1970 index_reg = this->result;
1971 } else {
1972 index_reg = src_reg(this, glsl_type::int_type);
1973
1974 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1975 }
1976
1977 if (src.reladdr) {
1978 src_reg temp = src_reg(this, glsl_type::int_type);
1979
1980 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1981
1982 index_reg = temp;
1983 }
1984
1985 src.reladdr = ralloc(mem_ctx, src_reg);
1986 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1987 }
1988
1989 /* If the type is smaller than a vec4, replicate the last channel out. */
1990 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1991 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1992 else
1993 src.swizzle = BRW_SWIZZLE_NOOP;
1994 src.type = brw_type_for_base_type(ir->type);
1995
1996 this->result = src;
1997 }
1998
1999 void
2000 vec4_visitor::visit(ir_dereference_record *ir)
2001 {
2002 unsigned int i;
2003 const glsl_type *struct_type = ir->record->type;
2004 int offset = 0;
2005
2006 ir->record->accept(this);
2007
2008 for (i = 0; i < struct_type->length; i++) {
2009 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2010 break;
2011 offset += type_size(struct_type->fields.structure[i].type);
2012 }
2013
2014 /* If the type is smaller than a vec4, replicate the last channel out. */
2015 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2016 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2017 else
2018 this->result.swizzle = BRW_SWIZZLE_NOOP;
2019 this->result.type = brw_type_for_base_type(ir->type);
2020
2021 this->result.reg_offset += offset;
2022 }
2023
2024 /**
2025 * We want to be careful in assignment setup to hit the actual storage
2026 * instead of potentially using a temporary like we might with the
2027 * ir_dereference handler.
2028 */
2029 static dst_reg
2030 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2031 {
2032 /* The LHS must be a dereference. If the LHS is a variable indexed array
2033 * access of a vector, it must be separated into a series of conditional moves
2034 * before reaching this point (see ir_vec_index_to_cond_assign).
2035 */
2036 assert(ir->as_dereference());
2037 ir_dereference_array *deref_array = ir->as_dereference_array();
2038 if (deref_array) {
2039 assert(!deref_array->array->type->is_vector());
2040 }
2041
2042 /* Use the rvalue deref handler for the most part. We'll ignore
2043 * swizzles in it and write swizzles using writemask, though.
2044 */
2045 ir->accept(v);
2046 return dst_reg(v->result);
2047 }
2048
2049 void
2050 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2051 const struct glsl_type *type,
2052 enum brw_predicate predicate)
2053 {
2054 if (type->base_type == GLSL_TYPE_STRUCT) {
2055 for (unsigned int i = 0; i < type->length; i++) {
2056 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2057 }
2058 return;
2059 }
2060
2061 if (type->is_array()) {
2062 for (unsigned int i = 0; i < type->length; i++) {
2063 emit_block_move(dst, src, type->fields.array, predicate);
2064 }
2065 return;
2066 }
2067
2068 if (type->is_matrix()) {
2069 const struct glsl_type *vec_type;
2070
2071 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2072 type->vector_elements, 1);
2073
2074 for (int i = 0; i < type->matrix_columns; i++) {
2075 emit_block_move(dst, src, vec_type, predicate);
2076 }
2077 return;
2078 }
2079
2080 assert(type->is_scalar() || type->is_vector());
2081
2082 dst->type = brw_type_for_base_type(type);
2083 src->type = dst->type;
2084
2085 dst->writemask = (1 << type->vector_elements) - 1;
2086
2087 src->swizzle = swizzle_for_size(type->vector_elements);
2088
2089 vec4_instruction *inst = emit(MOV(*dst, *src));
2090 inst->predicate = predicate;
2091
2092 dst->reg_offset++;
2093 src->reg_offset++;
2094 }
2095
2096
2097 /* If the RHS processing resulted in an instruction generating a
2098 * temporary value, and it would be easy to rewrite the instruction to
2099 * generate its result right into the LHS instead, do so. This ends
2100 * up reliably removing instructions where it can be tricky to do so
2101 * later without real UD chain information.
2102 */
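/* For example, if the RHS "a + b" generated "ADD tmp, a, b", we can
 * retarget that ADD to write the assignment's destination directly and
 * skip emitting the "MOV dst, tmp" copy.
 */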
2103 bool
2104 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2105 dst_reg dst,
2106 src_reg src,
2107 vec4_instruction *pre_rhs_inst,
2108 vec4_instruction *last_rhs_inst)
2109 {
2110 /* This could be supported, but it would take more smarts. */
2111 if (ir->condition)
2112 return false;
2113
2114 if (pre_rhs_inst == last_rhs_inst)
2115 return false; /* No instructions generated to work with. */
2116
2117 /* Make sure the last instruction generated our source reg. */
2118 if (src.file != GRF ||
2119 src.file != last_rhs_inst->dst.file ||
2120 src.reg != last_rhs_inst->dst.reg ||
2121 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2122 src.reladdr ||
2123 src.abs ||
2124 src.negate ||
2125 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2126 return false;
2127
2128 /* Check that the last instruction fully initialized the channels
2129 * we want to use, in the order we want to use them. We could
2130 * potentially reswizzle the operands of many instructions so that
2131 * we could handle out-of-order channels, but don't yet.
2132 */
2133
2134 for (unsigned i = 0; i < 4; i++) {
2135 if (dst.writemask & (1 << i)) {
2136 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2137 return false;
2138
2139 if (BRW_GET_SWZ(src.swizzle, i) != i)
2140 return false;
2141 }
2142 }
2143
2144 /* Success! Rewrite the instruction. */
2145 last_rhs_inst->dst.file = dst.file;
2146 last_rhs_inst->dst.reg = dst.reg;
2147 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2148 last_rhs_inst->dst.reladdr = dst.reladdr;
2149 last_rhs_inst->dst.writemask &= dst.writemask;
2150
2151 return true;
2152 }
2153
2154 void
2155 vec4_visitor::visit(ir_assignment *ir)
2156 {
2157 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2158 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2159
2160 if (!ir->lhs->type->is_scalar() &&
2161 !ir->lhs->type->is_vector()) {
2162 ir->rhs->accept(this);
2163 src_reg src = this->result;
2164
2165 if (ir->condition) {
2166 emit_bool_to_cond_code(ir->condition, &predicate);
2167 }
2168
2169 /* emit_block_move doesn't account for swizzles in the source register.
2170 * This should be ok, since the source register is a structure or an
2171 * array, and those can't be swizzled. But double-check to be sure.
2172 */
2173 assert(src.swizzle ==
2174 (ir->rhs->type->is_matrix()
2175 ? swizzle_for_size(ir->rhs->type->vector_elements)
2176 : BRW_SWIZZLE_NOOP));
2177
2178 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2179 return;
2180 }
2181
2182 /* Now we're down to just a scalar/vector with writemasks. */
2183 int i;
2184
2185 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2186 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2187
2188 ir->rhs->accept(this);
2189
2190 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2191
2192 src_reg src = this->result;
2193
2194 int swizzles[4];
2195 int first_enabled_chan = 0;
2196 int src_chan = 0;
2197
2198 assert(ir->lhs->type->is_vector() ||
2199 ir->lhs->type->is_scalar());
2200 dst.writemask = ir->write_mask;
2201
2202 for (int i = 0; i < 4; i++) {
2203 if (dst.writemask & (1 << i)) {
2204 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2205 break;
2206 }
2207 }
2208
2209 /* Swizzle a small RHS vector into the channels being written.
2210 *
2211 * GLSL IR treats write_mask as dictating how many channels are
2212 * present on the RHS, while in our instructions we need to make
2213 * those channels appear in the slots of the vec4 they're written to.
2214 */
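   /* For example, for "v.xz = vec2(a, b)" the write_mask is .xz and the
    * RHS typically arrives swizzled .xyyy (see swizzle_for_size); the
    * loop below builds .xxyx so RHS channel x lands in slot x, channel y
    * lands in slot z, and disabled slots just replicate the first
    * enabled channel.
    */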
2215 for (int i = 0; i < 4; i++) {
2216 if (dst.writemask & (1 << i))
2217 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2218 else
2219 swizzles[i] = first_enabled_chan;
2220 }
2221 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2222 swizzles[2], swizzles[3]);
2223
2224 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2225 return;
2226 }
2227
2228 if (ir->condition) {
2229 emit_bool_to_cond_code(ir->condition, &predicate);
2230 }
2231
2232 for (i = 0; i < type_size(ir->lhs->type); i++) {
2233 vec4_instruction *inst = emit(MOV(dst, src));
2234 inst->predicate = predicate;
2235
2236 dst.reg_offset++;
2237 src.reg_offset++;
2238 }
2239 }
2240
2241 void
2242 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2243 {
2244 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2245 foreach_in_list(ir_constant, field_value, &ir->components) {
2246 emit_constant_values(dst, field_value);
2247 }
2248 return;
2249 }
2250
2251 if (ir->type->is_array()) {
2252 for (unsigned int i = 0; i < ir->type->length; i++) {
2253 emit_constant_values(dst, ir->array_elements[i]);
2254 }
2255 return;
2256 }
2257
2258 if (ir->type->is_matrix()) {
2259 for (int i = 0; i < ir->type->matrix_columns; i++) {
2260 float *vec = &ir->value.f[i * ir->type->vector_elements];
2261
2262 for (int j = 0; j < ir->type->vector_elements; j++) {
2263 dst->writemask = 1 << j;
2264 dst->type = BRW_REGISTER_TYPE_F;
2265
2266 emit(MOV(*dst, src_reg(vec[j])));
2267 }
2268 dst->reg_offset++;
2269 }
2270 return;
2271 }
2272
2273 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2274
2275 for (int i = 0; i < ir->type->vector_elements; i++) {
2276 if (!(remaining_writemask & (1 << i)))
2277 continue;
2278
2279 dst->writemask = 1 << i;
2280 dst->type = brw_type_for_base_type(ir->type);
2281
2282 /* Find other components that match the one we're about to
2283 * write. Emits fewer instructions for things like vec4(0.5,
2284 * 1.5, 1.5, 1.5).
2285 */
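      /* E.g. vec4(0.5, 1.5, 1.5, 1.5) becomes two MOVs (one writing .x
       * with 0.5, one writing .yzw with 1.5) instead of four.
       */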
2286 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2287 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2288 if (ir->value.b[i] == ir->value.b[j])
2289 dst->writemask |= (1 << j);
2290 } else {
2291 /* u, i, and f storage all line up, so no need for a
2292 * switch case for comparing each type.
2293 */
2294 if (ir->value.u[i] == ir->value.u[j])
2295 dst->writemask |= (1 << j);
2296 }
2297 }
2298
2299 switch (ir->type->base_type) {
2300 case GLSL_TYPE_FLOAT:
2301 emit(MOV(*dst, src_reg(ir->value.f[i])));
2302 break;
2303 case GLSL_TYPE_INT:
2304 emit(MOV(*dst, src_reg(ir->value.i[i])));
2305 break;
2306 case GLSL_TYPE_UINT:
2307 emit(MOV(*dst, src_reg(ir->value.u[i])));
2308 break;
2309 case GLSL_TYPE_BOOL:
2310 emit(MOV(*dst,
2311 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2312 : 0u)));
2313 break;
2314 default:
2315 unreachable("Non-float/uint/int/bool constant");
2316 }
2317
2318 remaining_writemask &= ~dst->writemask;
2319 }
2320 dst->reg_offset++;
2321 }
2322
2323 void
2324 vec4_visitor::visit(ir_constant *ir)
2325 {
2326 dst_reg dst = dst_reg(this, ir->type);
2327 this->result = src_reg(dst);
2328
2329 emit_constant_values(&dst, ir);
2330 }
2331
2332 void
2333 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2334 {
2335 ir_dereference *deref = static_cast<ir_dereference *>(
2336 ir->actual_parameters.get_head());
2337 ir_variable *location = deref->variable_referenced();
2338 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2339 location->data.binding);
2340
2341 /* Calculate the surface offset */
2342 src_reg offset(this, glsl_type::uint_type);
2343 ir_dereference_array *deref_array = deref->as_dereference_array();
2344 if (deref_array) {
2345 deref_array->array_index->accept(this);
2346
2347 src_reg tmp(this, glsl_type::uint_type);
2348 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2349 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2350 } else {
2351 offset = location->data.atomic.offset;
2352 }
2353
2354 /* Emit the appropriate machine instruction */
2355 const char *callee = ir->callee->function_name();
2356 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2357
2358 if (!strcmp("__intrinsic_atomic_read", callee)) {
2359 emit_untyped_surface_read(surf_index, dst, offset);
2360
2361 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2362 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2363 src_reg(), src_reg());
2364
2365 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2366 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2367 src_reg(), src_reg());
2368 }
2369 }
2370
2371 void
2372 vec4_visitor::visit(ir_call *ir)
2373 {
2374 const char *callee = ir->callee->function_name();
2375
2376 if (!strcmp("__intrinsic_atomic_read", callee) ||
2377 !strcmp("__intrinsic_atomic_increment", callee) ||
2378 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2379 visit_atomic_counter_intrinsic(ir);
2380 } else {
2381 unreachable("Unsupported intrinsic.");
2382 }
2383 }
2384
2385 src_reg
2386 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2387 {
2388 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2389 inst->base_mrf = 2;
2390 inst->mlen = 1;
2391 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2392 inst->dst.writemask = WRITEMASK_XYZW;
2393
2394 inst->src[1] = sampler;
2395
2396 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2397 int param_base = inst->base_mrf;
2398 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2399 int zero_mask = 0xf & ~coord_mask;
2400
2401 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2402 coordinate));
2403
2404 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2405 src_reg(0)));
2406
2407 emit(inst);
2408 return src_reg(inst->dst);
2409 }
2410
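/* Samplers that can't be encoded directly in the 4-bit sampler field of
 * the message descriptor (a non-immediate index, or an index of 16 or
 * more) have to be selected through the message header instead; see the
 * header_present computation in visit(ir_texture) below.
 */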
2411 static bool
2412 is_high_sampler(struct brw_context *brw, src_reg sampler)
2413 {
2414 if (brw->gen < 8 && !brw->is_haswell)
2415 return false;
2416
2417 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2418 }
2419
2420 void
2421 vec4_visitor::visit(ir_texture *ir)
2422 {
2423 uint32_t sampler =
2424 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2425
2426 ir_rvalue *nonconst_sampler_index =
2427 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2428
2429 /* Handle non-constant sampler array indexing */
2430 src_reg sampler_reg;
2431 if (nonconst_sampler_index) {
2432 /* The highest sampler which may be used by this operation is
2433 * the last element of the array. Mark it here, because the generator
2434 * doesn't have enough information to determine the bound.
2435 */
2436 uint32_t array_size = ir->sampler->as_dereference_array()
2437 ->array->type->array_size();
2438
2439 uint32_t max_used = sampler + array_size - 1;
2440 if (ir->op == ir_tg4 && brw->gen < 8) {
2441 max_used += prog_data->base.binding_table.gather_texture_start;
2442 } else {
2443 max_used += prog_data->base.binding_table.texture_start;
2444 }
2445
2446 brw_mark_surface_used(&prog_data->base, max_used);
2447
2448 /* Emit code to evaluate the actual indexing expression */
2449 nonconst_sampler_index->accept(this);
2450 dst_reg temp(this, glsl_type::uint_type);
2451 emit(ADD(temp, this->result, src_reg(sampler)))
2452 ->force_writemask_all = true;
2453 sampler_reg = src_reg(temp);
2454 } else {
2455 /* Single sampler, or constant array index; the indexing expression
2456 * is just an immediate.
2457 */
2458 sampler_reg = src_reg(sampler);
2459 }
2460
2461 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2462 * emitting anything other than setting up the constant result.
2463 */
2464 if (ir->op == ir_tg4) {
2465 ir_constant *chan = ir->lod_info.component->as_constant();
2466 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2467 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2468 dst_reg result(this, ir->type);
2469 this->result = src_reg(result);
2470 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2471 return;
2472 }
2473 }
2474
2475 /* Should be lowered by do_lower_texture_projection */
2476 assert(!ir->projector);
2477
2478 /* Should be lowered */
2479 assert(!ir->offset || !ir->offset->type->is_array());
2480
2481 /* Generate code to compute all the subexpression trees. This has to be
2482 * done before loading any values into MRFs for the sampler message since
2483 * generating these values may involve SEND messages that need the MRFs.
2484 */
2485 src_reg coordinate;
2486 if (ir->coordinate) {
2487 ir->coordinate->accept(this);
2488 coordinate = this->result;
2489 }
2490
2491 src_reg shadow_comparitor;
2492 if (ir->shadow_comparitor) {
2493 ir->shadow_comparitor->accept(this);
2494 shadow_comparitor = this->result;
2495 }
2496
2497 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2498 src_reg offset_value;
2499 if (has_nonconstant_offset) {
2500 ir->offset->accept(this);
2501 offset_value = src_reg(this->result);
2502 }
2503
2504 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2505 src_reg lod, dPdx, dPdy, sample_index, mcs;
2506 switch (ir->op) {
2507 case ir_tex:
2508 lod = src_reg(0.0f);
2509 lod_type = glsl_type::float_type;
2510 break;
2511 case ir_txf:
2512 case ir_txl:
2513 case ir_txs:
2514 ir->lod_info.lod->accept(this);
2515 lod = this->result;
2516 lod_type = ir->lod_info.lod->type;
2517 break;
2518 case ir_query_levels:
2519 lod = src_reg(0);
2520 lod_type = glsl_type::int_type;
2521 break;
2522 case ir_txf_ms:
2523 ir->lod_info.sample_index->accept(this);
2524 sample_index = this->result;
2525 sample_index_type = ir->lod_info.sample_index->type;
2526
2527 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2528 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2529 else
2530 mcs = src_reg(0u);
2531 break;
2532 case ir_txd:
2533 ir->lod_info.grad.dPdx->accept(this);
2534 dPdx = this->result;
2535
2536 ir->lod_info.grad.dPdy->accept(this);
2537 dPdy = this->result;
2538
2539 lod_type = ir->lod_info.grad.dPdx->type;
2540 break;
2541 case ir_txb:
2542 case ir_lod:
2543 case ir_tg4:
2544 break;
2545 }
2546
2547 enum opcode opcode;
2548 switch (ir->op) {
2549 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2550 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2551 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2552 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2553 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2554 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2555 case ir_tg4: opcode = has_nonconstant_offset
2556 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2557 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2558 case ir_txb:
2559 unreachable("TXB is not valid for vertex shaders.");
2560 case ir_lod:
2561 unreachable("LOD is not valid for vertex shaders.");
2562 default:
2563 unreachable("Unrecognized tex op");
2564 }
2565
2566 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2567
2568 if (ir->offset != NULL && !has_nonconstant_offset) {
2569 inst->offset =
2570 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2571 ir->offset->type->vector_elements);
2572 }
2573
2574 /* Stuff the channel select bits in the top of the texture offset */
2575 if (ir->op == ir_tg4)
2576 inst->offset |= gather_channel(ir, sampler) << 16;
2577
2578 /* The message header is necessary for:
2579 * - Gen4 (always)
2580 * - Texel offsets
2581 * - Gather channel selection
2582 * - Sampler indices too large to fit in a 4-bit value.
2583 */
2584 inst->header_present =
2585 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2586 is_high_sampler(brw, sampler_reg);
2587 inst->base_mrf = 2;
2588 inst->mlen = inst->header_present + 1; /* always at least one */
2589 inst->dst = dst_reg(this, ir->type);
2590 inst->dst.writemask = WRITEMASK_XYZW;
2591 inst->shadow_compare = ir->shadow_comparitor != NULL;
2592
2593 inst->src[1] = sampler_reg;
2594
2595 /* MRF for the first parameter */
2596 int param_base = inst->base_mrf + inst->header_present;
2597
2598 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2599 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2600 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2601 } else {
2602 /* Load the coordinate */
2603 /* FINISHME: gl_clamp_mask and saturate */
2604 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2605 int zero_mask = 0xf & ~coord_mask;
2606
2607 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2608 coordinate));
2609
2610 if (zero_mask != 0) {
2611 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2612 src_reg(0)));
2613 }
2614 /* Load the shadow comparitor */
2615 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2616 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2617 WRITEMASK_X),
2618 shadow_comparitor));
2619 inst->mlen++;
2620 }
2621
2622 /* Load the LOD info */
2623 if (ir->op == ir_tex || ir->op == ir_txl) {
2624 int mrf, writemask;
2625 if (brw->gen >= 5) {
2626 mrf = param_base + 1;
2627 if (ir->shadow_comparitor) {
2628 writemask = WRITEMASK_Y;
2629 /* mlen already incremented */
2630 } else {
2631 writemask = WRITEMASK_X;
2632 inst->mlen++;
2633 }
2634 } else /* brw->gen == 4 */ {
2635 mrf = param_base;
2636 writemask = WRITEMASK_W;
2637 }
2638 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2639 } else if (ir->op == ir_txf) {
2640 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2641 } else if (ir->op == ir_txf_ms) {
2642 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2643 sample_index));
2644 if (brw->gen >= 7) {
2645 /* MCS data is in the first channel of `mcs`, but we need to get it into
2646 * the .y channel of the second vec4 of params, so replicate .x across
2647 * the whole vec4 and then mask off everything except .y
2648 */
2649 mcs.swizzle = BRW_SWIZZLE_XXXX;
2650 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2651 mcs));
2652 }
2653 inst->mlen++;
2654 } else if (ir->op == ir_txd) {
2655 const glsl_type *type = lod_type;
2656
2657 if (brw->gen >= 5) {
2658 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2659 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2662 inst->mlen++;
2663
2664 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2665 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2666 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2667 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2669 inst->mlen++;
2670
2671 if (ir->shadow_comparitor) {
2672 emit(MOV(dst_reg(MRF, param_base + 2,
2673 ir->shadow_comparitor->type, WRITEMASK_Z),
2674 shadow_comparitor));
2675 }
2676 }
2677 } else /* brw->gen == 4 */ {
2678 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2680 inst->mlen += 2;
2681 }
2682 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2683 if (ir->shadow_comparitor) {
2684 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2685 shadow_comparitor));
2686 }
2687
2688 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2689 offset_value));
2690 inst->mlen++;
2691 }
2692 }
2693
2694 emit(inst);
2695
2696 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2697 * faces * layers; the spec requires layers.
2698 */
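   /* E.g. textureSize() on a samplerCubeArray with 4 layers reports 24
    * in .z here; dividing by 6 yields the 4 layers the spec requires.
    */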
2699 if (ir->op == ir_txs) {
2700 glsl_type const *type = ir->sampler->type;
2701 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2702 type->sampler_array) {
2703 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2704 writemask(inst->dst, WRITEMASK_Z),
2705 src_reg(inst->dst), src_reg(6));
2706 }
2707 }
2708
2709 if (brw->gen == 6 && ir->op == ir_tg4) {
2710 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2711 }
2712
2713 swizzle_result(ir, src_reg(inst->dst), sampler);
2714 }
2715
2716 /**
2717 * Apply workarounds for Gen6 gather with UINT/SINT
2718 */
2719 void
2720 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2721 {
2722 if (!wa)
2723 return;
2724
2725 int width = (wa & WA_8BIT) ? 8 : 16;
2726 dst_reg dst_f = dst;
2727 dst_f.type = BRW_REGISTER_TYPE_F;
2728
2729 /* Convert from UNORM to UINT */
2730 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2731 emit(MOV(dst, src_reg(dst_f)));
2732
2733 if (wa & WA_SIGN) {
2734 /* Reinterpret the UINT value as a signed INT value by
2735 * shifting the sign bit into place, then shifting back
2736 * preserving sign.
2737 */
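      /* E.g. for an 8-bit result both shifts below are by 24, moving bit 7
       * into the sign position and back, sign-extending the value.
       */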
2738 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2739 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2740 }
2741 }
2742
2743 /**
2744 * Set up the gather channel based on the swizzle, for gather4.
2745 */
2746 uint32_t
2747 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2748 {
2749 ir_constant *chan = ir->lod_info.component->as_constant();
2750 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2751 switch (swiz) {
2752 case SWIZZLE_X: return 0;
2753 case SWIZZLE_Y:
2754 /* gather4 sampler is broken for green channel on RG32F --
2755 * we must ask for blue instead.
2756 */
2757 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2758 return 2;
2759 return 1;
2760 case SWIZZLE_Z: return 2;
2761 case SWIZZLE_W: return 3;
2762 default:
2763 unreachable("Not reached"); /* zero, one swizzles handled already */
2764 }
2765 }
2766
2767 void
2768 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2769 {
2770 int s = key->tex.swizzles[sampler];
2771
2772 this->result = src_reg(this, ir->type);
2773 dst_reg swizzled_result(this->result);
2774
2775 if (ir->op == ir_query_levels) {
2776 /* # levels is in .w */
2777 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2778 emit(MOV(swizzled_result, orig_val));
2779 return;
2780 }
2781
2782 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2783 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2784 emit(MOV(swizzled_result, orig_val));
2785 return;
2786 }
2787
2788
2789 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2790 int swizzle[4] = {0};
2791
2792 for (int i = 0; i < 4; i++) {
2793 switch (GET_SWZ(s, i)) {
2794 case SWIZZLE_ZERO:
2795 zero_mask |= (1 << i);
2796 break;
2797 case SWIZZLE_ONE:
2798 one_mask |= (1 << i);
2799 break;
2800 default:
2801 copy_mask |= (1 << i);
2802 swizzle[i] = GET_SWZ(s, i);
2803 break;
2804 }
2805 }
2806
2807 if (copy_mask) {
2808 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2809 swizzled_result.writemask = copy_mask;
2810 emit(MOV(swizzled_result, orig_val));
2811 }
2812
2813 if (zero_mask) {
2814 swizzled_result.writemask = zero_mask;
2815 emit(MOV(swizzled_result, src_reg(0.0f)));
2816 }
2817
2818 if (one_mask) {
2819 swizzled_result.writemask = one_mask;
2820 emit(MOV(swizzled_result, src_reg(1.0f)));
2821 }
2822 }
2823
2824 void
2825 vec4_visitor::visit(ir_return *)
2826 {
2827 unreachable("not reached");
2828 }
2829
2830 void
2831 vec4_visitor::visit(ir_discard *)
2832 {
2833 unreachable("not reached");
2834 }
2835
2836 void
2837 vec4_visitor::visit(ir_if *ir)
2838 {
2839 /* Don't point the annotation at the if statement, because then it plus
2840 * the then and else blocks get printed.
2841 */
2842 this->base_ir = ir->condition;
2843
2844 if (brw->gen == 6) {
2845 emit_if_gen6(ir);
2846 } else {
2847 enum brw_predicate predicate;
2848 emit_bool_to_cond_code(ir->condition, &predicate);
2849 emit(IF(predicate));
2850 }
2851
2852 visit_instructions(&ir->then_instructions);
2853
2854 if (!ir->else_instructions.is_empty()) {
2855 this->base_ir = ir->condition;
2856 emit(BRW_OPCODE_ELSE);
2857
2858 visit_instructions(&ir->else_instructions);
2859 }
2860
2861 this->base_ir = ir->condition;
2862 emit(BRW_OPCODE_ENDIF);
2863 }
2864
2865 void
2866 vec4_visitor::visit(ir_emit_vertex *)
2867 {
2868 unreachable("not reached");
2869 }
2870
2871 void
2872 vec4_visitor::visit(ir_end_primitive *)
2873 {
2874 unreachable("not reached");
2875 }
2876
2877 void
2878 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2879 dst_reg dst, src_reg offset,
2880 src_reg src0, src_reg src1)
2881 {
2882 unsigned mlen = 0;
2883
2884 /* Set the atomic operation offset. */
2885 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2886 mlen++;
2887
2888 /* Set the atomic operation arguments. */
2889 if (src0.file != BAD_FILE) {
2890 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2891 mlen++;
2892 }
2893
2894 if (src1.file != BAD_FILE) {
2895 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2896 mlen++;
2897 }
2898
2899 /* Emit the instruction. Note that this maps to the normal SIMD8
2900 * untyped atomic message on Ivy Bridge, but that's OK because
2901 * unused channels will be masked out.
2902 */
2903 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2904 src_reg(atomic_op), src_reg(surf_index));
2905 inst->base_mrf = 0;
2906 inst->mlen = mlen;
2907 }
2908
2909 void
2910 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2911 src_reg offset)
2912 {
2913 /* Set the surface read offset. */
2914 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2915
2916 /* Emit the instruction. Note that this maps to the normal SIMD8
2917 * untyped surface read message, but that's OK because unused
2918 * channels will be masked out.
2919 */
2920 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2921 dst, src_reg(surf_index));
2922 inst->base_mrf = 0;
2923 inst->mlen = 1;
2924 }
2925
2926 void
2927 vec4_visitor::emit_ndc_computation()
2928 {
2929 /* Get the position */
2930 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2931
2932 /* Build NDC coords, which are (x/w, y/w, z/w, 1/w) */
2933 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2934 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2935
2936 current_annotation = "NDC";
2937 dst_reg ndc_w = ndc;
2938 ndc_w.writemask = WRITEMASK_W;
2939 src_reg pos_w = pos;
2940 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2941 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2942
2943 dst_reg ndc_xyz = ndc;
2944 ndc_xyz.writemask = WRITEMASK_XYZ;
2945
2946 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2947 }
2948
2949 void
2950 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2951 {
2952 if (brw->gen < 6 &&
2953 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2954 key->userclip_active || brw->has_negative_rhw_bug)) {
2955 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2956 dst_reg header1_w = header1;
2957 header1_w.writemask = WRITEMASK_W;
2958
2959 emit(MOV(header1, 0u));
2960
2961 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2962 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2963
2964 current_annotation = "Point size";
2965 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2966 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2967 }
2968
2969 if (key->userclip_active) {
2970 current_annotation = "Clipping flags";
2971 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2972 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2973
2974 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2975 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2976 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2977
2978 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2979 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2980 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2981 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2982 }
2983
2984 /* i965 clipping workaround:
2985 * 1) Test for -ve rhw
2986 * 2) If set,
2987 * set ndc = (0,0,0,0)
2988 * set ucp[6] = 1
2989 *
2990 * Later, clipping will detect ucp[6] and ensure the primitive is
2991 * clipped against all fixed planes.
2992 */
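      /* The CMP below sets the flag register for channels where rhw is
       * negative; the predicated OR and MOV then only take effect in
       * those channels.
       */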
2993 if (brw->has_negative_rhw_bug) {
2994 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2995 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2996 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2997 vec4_instruction *inst;
2998 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2999 inst->predicate = BRW_PREDICATE_NORMAL;
3000 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3001 inst->predicate = BRW_PREDICATE_NORMAL;
3002 }
3003
3004 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3005 } else if (brw->gen < 6) {
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3007 } else {
3008 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3009 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3010 dst_reg reg_w = reg;
3011 reg_w.writemask = WRITEMASK_W;
3012 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3013 }
3014 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3015 dst_reg reg_y = reg;
3016 reg_y.writemask = WRITEMASK_Y;
3017 reg_y.type = BRW_REGISTER_TYPE_D;
3018 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3019 }
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3021 dst_reg reg_z = reg;
3022 reg_z.writemask = WRITEMASK_Z;
3023 reg_z.type = BRW_REGISTER_TYPE_D;
3024 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3025 }
3026 }
3027 }
3028
3029 void
3030 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3031 {
3032 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3033 *
3034 * "If a linked set of shaders forming the vertex stage contains no
3035 * static write to gl_ClipVertex or gl_ClipDistance, but the
3036 * application has requested clipping against user clip planes through
3037 * the API, then the coordinate written to gl_Position is used for
3038 * comparison against the user clip planes."
3039 *
3040 * This function is only called if the shader didn't write to
3041 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3042 * if the user wrote to it; otherwise we use gl_Position.
3043 */
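   /* Each enabled plane gets one channel of the output vec4:
    * gl_ClipDistance[i + offset] = dot(clip_vertex, userplane[i + offset]),
    * emitted below as a DP4 with a single-channel writemask.
    */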
3044 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3045 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3046 clip_vertex = VARYING_SLOT_POS;
3047 }
3048
3049 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3050 ++i) {
3051 reg.writemask = 1 << i;
3052 emit(DP4(reg,
3053 src_reg(output_reg[clip_vertex]),
3054 src_reg(this->userplane[i + offset])));
3055 }
3056 }
3057
3058 void
3059 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3060 {
3061 assert (varying < VARYING_SLOT_MAX);
3062 reg.type = output_reg[varying].type;
3063 current_annotation = output_reg_annotation[varying];
3064 /* Copy the register, saturating if necessary */
3065 vec4_instruction *inst = emit(MOV(reg,
3066 src_reg(output_reg[varying])));
3067 if ((varying == VARYING_SLOT_COL0 ||
3068 varying == VARYING_SLOT_COL1 ||
3069 varying == VARYING_SLOT_BFC0 ||
3070 varying == VARYING_SLOT_BFC1) &&
3071 key->clamp_vertex_color) {
3072 inst->saturate = true;
3073 }
3074 }
3075
3076 void
3077 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3078 {
3079 reg.type = BRW_REGISTER_TYPE_F;
3080
3081 switch (varying) {
3082 case VARYING_SLOT_PSIZ:
3083 {
3084 /* PSIZ is always in slot 0, and is coupled with other flags. */
3085 current_annotation = "indices, point width, clip flags";
3086 emit_psiz_and_flags(reg);
3087 break;
3088 }
3089 case BRW_VARYING_SLOT_NDC:
3090 current_annotation = "NDC";
3091 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3092 break;
3093 case VARYING_SLOT_POS:
3094 current_annotation = "gl_Position";
3095 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3096 break;
3097 case VARYING_SLOT_EDGE:
3098 /* This is present when doing unfilled polygons. We're supposed to copy
3099 * the edge flag from the user-provided vertex array
3100 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3101 * of that attribute (starts as 1.0f). This is then used in clipping to
3102 * determine which edges should be drawn as wireframe.
3103 */
3104 current_annotation = "edge flag";
3105 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3106 glsl_type::float_type, WRITEMASK_XYZW))));
3107 break;
3108 case BRW_VARYING_SLOT_PAD:
3109 /* No need to write to this slot */
3110 break;
3111 default:
3112 emit_generic_urb_slot(reg, varying);
3113 break;
3114 }
3115 }
3116
3117 static int
3118 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3119 {
3120 if (brw->gen >= 6) {
3121 /* URB data written (does not include the message header reg) must
3122 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3123 * section 5.4.3.2.2: URB_INTERLEAVED.
3124 *
3125 * URB entries are allocated on a multiple of 1024 bits, so an
3126 * extra 128 bits written here to make the end align to 256 is
3127 * no problem.
3128 */
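      /* mlen includes the one-register message header, so the data
       * portion is mlen - 1; bumping mlen up to an odd value keeps that
       * data length even.
       */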
3129 if ((mlen % 2) != 1)
3130 mlen++;
3131 }
3132
3133 return mlen;
3134 }
3135
3136
3137 /**
3138 * Generates the VUE payload plus the necessary URB write instructions to
3139 * output it.
3140 *
3141 * The VUE layout is documented in Volume 2a.
3142 */
3143 void
3144 vec4_visitor::emit_vertex()
3145 {
3146 /* MRF 0 is reserved for the debugger, so start with message header
3147 * in MRF 1.
3148 */
3149 int base_mrf = 1;
3150 int mrf = base_mrf;
3151 /* In the process of generating our URB write message contents, we
3152 * may need to unspill a register or load from an array. Those
3153 * reads would use MRFs 14-15.
3154 */
3155 int max_usable_mrf = 13;
3156
3157 /* The following assertion verifies that max_usable_mrf causes an
3158 * even-numbered amount of URB write data, which will meet gen6's
3159 * requirements for length alignment.
3160 */
3161 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3162
3163 /* First mrf is the g0-based message header containing URB handles and
3164 * such.
3165 */
3166 emit_urb_write_header(mrf++);
3167
3168 if (brw->gen < 6) {
3169 emit_ndc_computation();
3170 }
3171
3172 /* Lower legacy ff and ClipVertex clipping to clip distances */
3173 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3174 current_annotation = "user clip distances";
3175
3176 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3177 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3178
3179 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3180 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3181 }
3182
3183 /* We may need to split this up into several URB writes, so do them in a
3184 * loop.
3185 */
3186 int slot = 0;
3187 bool complete = false;
3188 do {
3189 /* URB offset is in URB row increments, and each of our MRFs is half of
3190 * one of those, since we're doing interleaved writes.
3191 */
3192 int offset = slot / 2;
3193
3194 mrf = base_mrf + 1;
3195 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3196 emit_urb_slot(dst_reg(MRF, mrf++),
3197 prog_data->vue_map.slot_to_varying[slot]);
3198
3199 /* If this was max_usable_mrf, we can't fit anything more into this
3200 * URB WRITE.
3201 */
3202 if (mrf > max_usable_mrf) {
3203 slot++;
3204 break;
3205 }
3206 }
3207
3208 complete = slot >= prog_data->vue_map.num_slots;
3209 current_annotation = "URB write";
3210 vec4_instruction *inst = emit_urb_write_opcode(complete);
3211 inst->base_mrf = base_mrf;
3212 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3213 inst->offset += offset;
3214 } while(!complete);
3215 }
3216
3217
3218 src_reg
3219 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3220 src_reg *reladdr, int reg_offset)
3221 {
3222 /* Because we store the values to scratch interleaved like our
3223 * vertex data, we need to scale the vec4 index by 2.
3224 */
3225 int message_header_scale = 2;
3226
3227 /* Pre-gen6, the message header uses byte offsets instead of vec4
3228 * (16-byte) offset units.
3229 */
3230 if (brw->gen < 6)
3231 message_header_scale *= 16;
3232
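   /* E.g. a constant reg_offset of 3 yields a message offset of 6 on
    * Gen6+, or 96 bytes on earlier generations.
    */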
3233 if (reladdr) {
3234 src_reg index = src_reg(this, glsl_type::int_type);
3235
3236 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3237 src_reg(reg_offset)));
3238 emit_before(block, inst, MUL(dst_reg(index), index,
3239 src_reg(message_header_scale)));
3240
3241 return index;
3242 } else {
3243 return src_reg(reg_offset * message_header_scale);
3244 }
3245 }
3246
3247 src_reg
3248 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3249 src_reg *reladdr, int reg_offset)
3250 {
3251 if (reladdr) {
3252 src_reg index = src_reg(this, glsl_type::int_type);
3253
3254 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3255 src_reg(reg_offset)));
3256
3257 /* Pre-gen6, the message header uses byte offsets instead of vec4
3258 * (16-byte) offset units.
3259 */
3260 if (brw->gen < 6) {
3261 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3262 }
3263
3264 return index;
3265 } else if (brw->gen >= 8) {
3266 /* Store the offset in a GRF so we can send-from-GRF. */
3267 src_reg offset = src_reg(this, glsl_type::int_type);
3268 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3269 return offset;
3270 } else {
3271 int message_header_scale = brw->gen < 6 ? 16 : 1;
3272 return src_reg(reg_offset * message_header_scale);
3273 }
3274 }
3275
3276 /**
3277 * Emits an instruction before @inst to load the value named by @orig_src
3278 * from scratch space at @base_offset to @temp.
3279 *
3280 * @base_offset is measured in 32-byte units (the size of a register).
3281 */
3282 void
3283 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3284 dst_reg temp, src_reg orig_src,
3285 int base_offset)
3286 {
3287 int reg_offset = base_offset + orig_src.reg_offset;
3288 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3289 reg_offset);
3290
3291 emit_before(block, inst, SCRATCH_READ(temp, index));
3292 }
3293
3294 /**
3295 * Emits an instruction after @inst to store the value to be written
3296 * to @orig_dst to scratch space at @base_offset, from @temp.
3297 *
3298 * @base_offset is measured in 32-byte units (the size of a register).
3299 */
3300 void
3301 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3302 int base_offset)
3303 {
3304 int reg_offset = base_offset + inst->dst.reg_offset;
3305 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3306 reg_offset);
3307
3308 /* Create a temporary register to store *inst's result in.
3309 *
3310 * We have to be careful in MOVing from our temporary result register in
3311 * the scratch write. If we swizzle from channels of the temporary that
3312 * weren't initialized, it will confuse live interval analysis, which will
3313 * make spilling fail to make progress.
3314 */
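   /* E.g. an instruction that only writes .y gets the swizzle .yyyy here,
    * so the scratch write never reads an uninitialized channel.
    */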
3315 src_reg temp = src_reg(this, glsl_type::vec4_type);
3316 temp.type = inst->dst.type;
3317 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3318 int swizzles[4];
3319 for (int i = 0; i < 4; i++)
3320 if (inst->dst.writemask & (1 << i))
3321 swizzles[i] = i;
3322 else
3323 swizzles[i] = first_writemask_chan;
3324 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3325 swizzles[2], swizzles[3]);
3326
3327 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3328 inst->dst.writemask));
3329 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3330 write->predicate = inst->predicate;
3331 write->ir = inst->ir;
3332 write->annotation = inst->annotation;
3333 inst->insert_after(block, write);
3334
3335 inst->dst.file = temp.file;
3336 inst->dst.reg = temp.reg;
3337 inst->dst.reg_offset = temp.reg_offset;
3338 inst->dst.reladdr = NULL;
3339 }
3340
3341 /**
3342 * We can't generally support array access in GRF space, because a
3343 * single instruction's destination can only span 2 contiguous
3344 * registers. So, we send all GRF arrays that get variable index
3345 * access to scratch space.
3346 */
3347 void
3348 vec4_visitor::move_grf_array_access_to_scratch()
3349 {
3350 int scratch_loc[this->virtual_grf_count];
3351 memset(scratch_loc, -1, sizeof(scratch_loc));
3352
3353 /* First, calculate the set of virtual GRFs that need to be punted
3354 * to scratch due to having any array access on them, and where in
3355 * scratch.
3356 */
3357 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3358 if (inst->dst.file == GRF && inst->dst.reladdr &&
3359 scratch_loc[inst->dst.reg] == -1) {
3360 scratch_loc[inst->dst.reg] = c->last_scratch;
3361 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3362 }
3363
3364 for (int i = 0 ; i < 3; i++) {
3365 src_reg *src = &inst->src[i];
3366
3367 if (src->file == GRF && src->reladdr &&
3368 scratch_loc[src->reg] == -1) {
3369 scratch_loc[src->reg] = c->last_scratch;
3370 c->last_scratch += this->virtual_grf_sizes[src->reg];
3371 }
3372 }
3373 }
3374
3375 /* Now, for anything that will be accessed through scratch, rewrite
3376 * it to load/store. Note that this is a _safe list walk, because
3377 * we may generate a new scratch_write instruction after the one
3378 * we're processing.
3379 */
3380 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3381 /* Set up the annotation tracking for new generated instructions. */
3382 base_ir = inst->ir;
3383 current_annotation = inst->annotation;
3384
3385 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3386 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3387 }
3388
3389 for (int i = 0 ; i < 3; i++) {
3390 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3391 continue;
3392
3393 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3394
3395 emit_scratch_read(block, inst, temp, inst->src[i],
3396 scratch_loc[inst->src[i].reg]);
3397
3398 inst->src[i].file = temp.file;
3399 inst->src[i].reg = temp.reg;
3400 inst->src[i].reg_offset = temp.reg_offset;
3401 inst->src[i].reladdr = NULL;
3402 }
3403 }
3404 }
3405
3406 /**
3407 * Emits an instruction before @inst to load the value named by @orig_src
3408 * from the pull constant buffer (surface) at @base_offset to @temp.
3409 */
3410 void
3411 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3412 dst_reg temp, src_reg orig_src,
3413 int base_offset)
3414 {
3415 int reg_offset = base_offset + orig_src.reg_offset;
3416 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3417 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3418 reg_offset);
3419 vec4_instruction *load;
3420
3421 if (brw->gen >= 7) {
3422 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3423 grf_offset.type = offset.type;
3424 emit_before(block, inst, MOV(grf_offset, offset));
3425
3426 load = new(mem_ctx) vec4_instruction(this,
3427 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3428 temp, index, src_reg(grf_offset));
3429 } else {
3430 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3431 temp, index, offset);
3432 load->base_mrf = 14;
3433 load->mlen = 1;
3434 }
3435 emit_before(block, inst, load);
3436 }
3437
3438 /**
3439 * Implements array access of uniforms by inserting a
3440 * PULL_CONSTANT_LOAD instruction.
3441 *
3442 * Unlike temporary GRF array access (where we don't support it due to
3443 * the difficulty of doing relative addressing on instruction
3444 * destinations), we could potentially do array access of uniforms
3445 * that were loaded in GRF space as push constants. In real-world
3446 * usage we've seen, though, the arrays being used are always larger
3447 * than we could load as push constants, so just always move all
3448 * uniform array access out to a pull constant buffer.
3449 */
3450 void
3451 vec4_visitor::move_uniform_array_access_to_pull_constants()
3452 {
3453 int pull_constant_loc[this->uniforms];
3454 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3455 bool nested_reladdr;
3456
3457 /* Walk through and find array access of uniforms. Put a copy of that
3458 * uniform in the pull constant buffer.
3459 *
3460 * Note that we don't move constant-indexed accesses to arrays. No
3461 * testing has been done of the performance impact of this choice.
3462 */
3463 do {
3464 nested_reladdr = false;
3465
3466 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3467 for (int i = 0 ; i < 3; i++) {
3468 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3469 continue;
3470
3471 int uniform = inst->src[i].reg;
3472
3473 if (inst->src[i].reladdr->reladdr)
3474 nested_reladdr = true; /* will need another pass */
3475
3476 /* If this array isn't already present in the pull constant buffer,
3477 * add it.
3478 */
3479 if (pull_constant_loc[uniform] == -1) {
3480 const gl_constant_value **values =
3481 &stage_prog_data->param[uniform * 4];
3482
3483 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3484
3485 assert(uniform < uniform_array_size);
3486 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3487 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3488 = values[j];
3489 }
3490 }
3491
3492 /* Set up the annotation tracking for new generated instructions. */
3493 base_ir = inst->ir;
3494 current_annotation = inst->annotation;
3495
3496 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3497
3498 emit_pull_constant_load(block, inst, temp, inst->src[i],
3499 pull_constant_loc[uniform]);
3500
3501 inst->src[i].file = temp.file;
3502 inst->src[i].reg = temp.reg;
3503 inst->src[i].reg_offset = temp.reg_offset;
3504 inst->src[i].reladdr = NULL;
3505 }
3506 }
3507 } while (nested_reladdr);
3508
3509 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3510 * no need to track them as larger-than-vec4 objects. This will be
3511 * relied on in cutting out unused uniform vectors from push
3512 * constants.
3513 */
3514 split_uniform_registers();
3515 }
3516
3517 void
3518 vec4_visitor::resolve_ud_negate(src_reg *reg)
3519 {
3520 if (reg->type != BRW_REGISTER_TYPE_UD ||
3521 !reg->negate)
3522 return;
3523
3524 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3525 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3526 *reg = temp;
3527 }
3528
3529 vec4_visitor::vec4_visitor(struct brw_context *brw,
3530 struct brw_vec4_compile *c,
3531 struct gl_program *prog,
3532 const struct brw_vec4_prog_key *key,
3533 struct brw_vec4_prog_data *prog_data,
3534 struct gl_shader_program *shader_prog,
3535 gl_shader_stage stage,
3536 void *mem_ctx,
3537 bool debug_flag,
3538 bool no_spills,
3539 shader_time_shader_type st_base,
3540 shader_time_shader_type st_written,
3541 shader_time_shader_type st_reset)
3542 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3543 c(c),
3544 key(key),
3545 prog_data(prog_data),
3546 sanity_param_count(0),
3547 fail_msg(NULL),
3548 first_non_payload_grf(0),
3549 need_all_constants_in_pull_buffer(false),
3550 debug_flag(debug_flag),
3551 no_spills(no_spills),
3552 st_base(st_base),
3553 st_written(st_written),
3554 st_reset(st_reset)
3555 {
3556 this->mem_ctx = mem_ctx;
3557 this->failed = false;
3558
3559 this->base_ir = NULL;
3560 this->current_annotation = NULL;
3561 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3562
3563 this->variable_ht = hash_table_ctor(0,
3564 hash_table_pointer_hash,
3565 hash_table_pointer_compare);
3566
3567 this->virtual_grf_start = NULL;
3568 this->virtual_grf_end = NULL;
3569 this->virtual_grf_sizes = NULL;
3570 this->virtual_grf_count = 0;
3571 this->virtual_grf_reg_map = NULL;
3572 this->virtual_grf_reg_count = 0;
3573 this->virtual_grf_array_size = 0;
3574 this->live_intervals_valid = false;
3575
3576 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3577
3578 this->uniforms = 0;
3579
3580 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3581 * at least one. See setup_uniforms() in brw_vec4.cpp.
3582 */
3583 this->uniform_array_size = 1;
3584 if (prog_data) {
3585 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3586 }
3587
3588 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3589 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3590 }
3591
3592 vec4_visitor::~vec4_visitor()
3593 {
3594 hash_table_dtor(this->variable_ht);
3595 }
3596
3597
3598 void
3599 vec4_visitor::fail(const char *format, ...)
3600 {
3601 va_list va;
3602 char *msg;
3603
3604 if (failed)
3605 return;
3606
3607 failed = true;
3608
3609 va_start(va, format);
3610 msg = ralloc_vasprintf(mem_ctx, format, va);
3611 va_end(va);
3612 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3613
3614 this->fail_msg = msg;
3615
3616 if (debug_flag) {
3617 fprintf(stderr, "%s", msg);
3618 }
3619 }
3620
3621 } /* namespace brw */