i965: Use ~0 to represent true on all generations.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
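/* Each ALU1/ALU2/ALU2_ACC/ALU3 invocation below defines a builder method
 * that only constructs the instruction; the caller still has to pass it to
 * emit(). For example, ALU2(ADD) expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * which is why call sites typically read emit(ADD(dst, a, b)).
 */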
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* The original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
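/* (As a reminder of the region syntax used above: g3<0;4,1>:f is
 * <VertStride; Width, HorzStride>, so a vertical stride of zero would make
 * both halves of the SIMD4x2 pair read the same four floats -- exactly the
 * replication we want but can't express in a three-source instruction.)
 */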
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
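/* Emit the equivalent of GLSL packHalf2x16(): convert a vec2 to two half
 * floats and pack them into one 32-bit word, with X in the low 16 bits per
 * the GLSL packing rules. As a concrete example, packHalf2x16(vec2(1.0, 2.0))
 * should yield 0x40003c00, since 1.0 encodes as the half-float 0x3c00 and
 * 2.0 as 0x4000.
 */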
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
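/* (The immediate below is a packed vector-float: the VF bytes 0x00, 0x60,
 * 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0, so the type-converting
 * MOV into a UD register yields the integer shift counts <0, 8, 16, 24>.)
 */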
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
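/* Emit the equivalent of GLSL packUnorm4x8(): per component compute
 * round(clamp(c, 0, 1) * 255.0) and pack the four bytes (the GLSL rules put
 * X in the least significant byte). The saturating MOV does the clamp, RNDE
 * the round-to-even, and VEC4_OPCODE_PACK_BYTES the final byte packing.
 */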
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
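/* Size of a GLSL type in vec4 registers. For example: float, int and vec4
 * each take one register; mat4 takes four (one per column); float[10] takes
 * ten; and struct { vec3 v; float f; } takes two, since every member is
 * padded out to a full vec4.
 */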
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
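/* For example, "uniform vec3 v[2]" matches a single storage entry and
 * yields vector_count == 2 below: two vec4 slots, each with three live
 * components and the fourth pointing at the shared zero constant.
 */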
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
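/* Turn a boolean rvalue into a flag-register condition, plus an ALIGN16
 * ANY4H/ALL4H predicate for the vector comparison cases. The gen <= 5
 * special cases below exist because early hardware only defines the low
 * bit of a boolean (see try_emit_b2f_of_compare), so comparisons get
 * resolved first and logic results are ANDed with 1 before setting the
 * conditional mod; on gen6+ the logic op can set the flag directly.
 */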
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (brw->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (brw->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (brw->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (brw->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (brw->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (brw->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (brw->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (brw->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg[ir->data.location + i].type =
1062 brw_type_for_base_type(ir->type->get_scalar_type());
1063 output_reg_annotation[ir->data.location + i] = ir->name;
1064 }
1065 break;
1066
1067 case ir_var_auto:
1068 case ir_var_temporary:
1069 reg = new(mem_ctx) dst_reg(this, ir->type);
1070 break;
1071
1072 case ir_var_uniform:
1073 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1074
1075 /* Thanks to the lower_ubo_reference pass, we will see only
1076 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1077 * variables, so no need for them to be in variable_ht.
1078 *
1079 * Some uniforms, such as samplers and atomic counters, have no actual
1080 * storage, so we should ignore them.
1081 */
1082 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1083 return;
1084
1085 /* Track how big the whole uniform variable is, in case we need to put a
1086 * copy of its data into pull constants for array access.
1087 */
1088 assert(this->uniforms < uniform_array_size);
1089 this->uniform_size[this->uniforms] = type_size(ir->type);
1090
1091 if (!strncmp(ir->name, "gl_", 3)) {
1092 setup_builtin_uniform_values(ir);
1093 } else {
1094 setup_uniform_values(ir);
1095 }
1096 break;
1097
1098 case ir_var_system_value:
1099 reg = make_reg_for_system_value(ir);
1100 break;
1101
1102 default:
1103 unreachable("not reached");
1104 }
1105
1106 reg->type = brw_type_for_base_type(ir->type);
1107 hash_table_insert(this->variable_ht, reg, ir);
1108 }
1109
1110 void
1111 vec4_visitor::visit(ir_loop *ir)
1112 {
1113 /* We don't want debugging output to print the whole body of the
1114 * loop as the annotation.
1115 */
1116 this->base_ir = NULL;
1117
1118 emit(BRW_OPCODE_DO);
1119
1120 visit_instructions(&ir->body_instructions);
1121
1122 emit(BRW_OPCODE_WHILE);
1123 }
1124
1125 void
1126 vec4_visitor::visit(ir_loop_jump *ir)
1127 {
1128 switch (ir->mode) {
1129 case ir_loop_jump::jump_break:
1130 emit(BRW_OPCODE_BREAK);
1131 break;
1132 case ir_loop_jump::jump_continue:
1133 emit(BRW_OPCODE_CONTINUE);
1134 break;
1135 }
1136 }
1137
1138
1139 void
1140 vec4_visitor::visit(ir_function_signature *)
1141 {
1142 unreachable("not reached");
1143 }
1144
1145 void
1146 vec4_visitor::visit(ir_function *ir)
1147 {
1148 /* Ignore function bodies other than main() -- we shouldn't see calls to
1149 * them since they should all be inlined.
1150 */
1151 if (strcmp(ir->name, "main") == 0) {
1152 const ir_function_signature *sig;
1153 exec_list empty;
1154
1155 sig = ir->matching_signature(NULL, &empty, false);
1156
1157 assert(sig);
1158
1159 visit_instructions(&sig->body);
1160 }
1161 }
1162
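/* Try to fuse an add whose left or right operand is a multiply into a
 * single MAD, emitting the addend as the first source so the result is
 * a * b + c. Returns false -- leaving the caller to emit a plain ADD --
 * when 3-src instructions aren't available (gen < 6), the type isn't
 * float, or neither operand is a multiply.
 */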
1163 bool
1164 vec4_visitor::try_emit_mad(ir_expression *ir)
1165 {
1166 /* 3-src instructions were introduced in gen6. */
1167 if (brw->gen < 6)
1168 return false;
1169
1170 /* MAD can only handle floating-point data. */
1171 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1172 return false;
1173
1174 ir_rvalue *nonmul = ir->operands[1];
1175 ir_expression *mul = ir->operands[0]->as_expression();
1176
1177 if (!mul || mul->operation != ir_binop_mul) {
1178 nonmul = ir->operands[0];
1179 mul = ir->operands[1]->as_expression();
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183 }
1184
1185 nonmul->accept(this);
1186 src_reg src0 = fix_3src_operand(this->result);
1187
1188 mul->operands[0]->accept(this);
1189 src_reg src1 = fix_3src_operand(this->result);
1190
1191 mul->operands[1]->accept(this);
1192 src_reg src2 = fix_3src_operand(this->result);
1193
1194 this->result = src_reg(this, ir->type);
1195 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1196
1197 return true;
1198 }
1199
1200 bool
1201 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1202 {
1203 /* This optimization relies on CMP setting the destination to 0 when
1204 * false. Early hardware only sets the least significant bit, and
1205 * leaves the other bits undefined. So we can't use it.
1206 */
1207 if (brw->gen < 6)
1208 return false;
1209
1210 ir_expression *const cmp = ir->operands[0]->as_expression();
1211
1212 if (cmp == NULL)
1213 return false;
1214
1215 switch (cmp->operation) {
1216 case ir_binop_less:
1217 case ir_binop_greater:
1218 case ir_binop_lequal:
1219 case ir_binop_gequal:
1220 case ir_binop_equal:
1221 case ir_binop_nequal:
1222 break;
1223
1224 default:
1225 return false;
1226 }
1227
1228 cmp->operands[0]->accept(this);
1229 const src_reg cmp_src0 = this->result;
1230
1231 cmp->operands[1]->accept(this);
1232 const src_reg cmp_src1 = this->result;
1233
1234 this->result = src_reg(this, ir->type);
1235
1236 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1237 brw_conditional_for_comparison(cmp->operation)));
1238
1239 /* If the comparison is false, this->result will just happen to be zero.
1240 */
1241 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1242 this->result, src_reg(1.0f));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244 inst->predicate_inverse = true;
1245
1246 return true;
1247 }
1248
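/* min()/max() helper: on gen6+ a single SEL with a conditional mod picks
 * src0 whenever src0 <cond> src1 holds (BRW_CONDITIONAL_L gives min,
 * BRW_CONDITIONAL_G gives max); before gen6 the same effect takes a CMP to
 * set the flag followed by a predicated SEL.
 */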
1249 void
1250 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1251 src_reg src0, src_reg src1)
1252 {
1253 vec4_instruction *inst;
1254
1255 if (brw->gen >= 6) {
1256 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1257 inst->conditional_mod = conditionalmod;
1258 } else {
1259 emit(CMP(dst, src0, src1, conditionalmod));
1260
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->predicate = BRW_PREDICATE_NORMAL;
1263 }
1264 }
1265
1266 void
1267 vec4_visitor::emit_lrp(const dst_reg &dst,
1268 const src_reg &x, const src_reg &y, const src_reg &a)
1269 {
1270 if (brw->gen >= 6) {
1271 /* Note that the instruction's argument order is reversed from GLSL
1272 * and the IR.
1273 */
1274 emit(LRP(dst,
1275 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1276 } else {
1277 /* Earlier generations don't support three source operations, so we
1278 * need to emit x*(1-a) + y*a.
1279 */
1280 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1281 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1282 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1283 y_times_a.writemask = dst.writemask;
1284 one_minus_a.writemask = dst.writemask;
1285 x_times_one_minus_a.writemask = dst.writemask;
1286
1287 emit(MUL(y_times_a, y, a));
1288 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1289 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1290 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1291 }
1292 }
1293
1294 void
1295 vec4_visitor::visit(ir_expression *ir)
1296 {
1297 unsigned int operand;
1298 src_reg op[Elements(ir->operands)];
1299 vec4_instruction *inst;
1300
1301 if (ir->operation == ir_binop_add) {
1302 if (try_emit_mad(ir))
1303 return;
1304 }
1305
1306 if (ir->operation == ir_unop_b2f) {
1307 if (try_emit_b2f_of_compare(ir))
1308 return;
1309 }
1310
1311 /* Storage for our result. Ideally for an assignment we'd be using
1312 * the actual storage for the result here, instead.
1313 */
1314 dst_reg result_dst(this, ir->type);
1315 src_reg result_src(result_dst);
1316
1317 if (ir->operation == ir_triop_csel) {
1318 ir->operands[1]->accept(this);
1319 op[1] = this->result;
1320 ir->operands[2]->accept(this);
1321 op[2] = this->result;
1322
1323 enum brw_predicate predicate;
1324 emit_bool_to_cond_code(ir->operands[0], &predicate);
1325 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1326 inst->predicate = predicate;
1327 this->result = result_src;
1328 return;
1329 }
1330
1331 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1332 this->result.file = BAD_FILE;
1333 ir->operands[operand]->accept(this);
1334 if (this->result.file == BAD_FILE) {
1335 fprintf(stderr, "Failed to get tree for expression operand:\n");
1336 ir->operands[operand]->fprint(stderr);
1337 exit(1);
1338 }
1339 op[operand] = this->result;
1340
1341 /* Matrix expression operands should have been broken down to vector
1342 * operations already.
1343 */
1344 assert(!ir->operands[operand]->type->is_matrix());
1345 }
1346
1347 /* If nothing special happens, this is the result. */
1348 this->result = result_src;
1349
1350 switch (ir->operation) {
1351 case ir_unop_logic_not:
1352 emit(NOT(result_dst, op[0]));
1353 break;
1354 case ir_unop_neg:
1355 op[0].negate = !op[0].negate;
1356 emit(MOV(result_dst, op[0]));
1357 break;
1358 case ir_unop_abs:
1359 op[0].abs = true;
1360 op[0].negate = false;
1361 emit(MOV(result_dst, op[0]));
1362 break;
1363
1364 case ir_unop_sign:
1365 if (ir->type->is_float()) {
1366 /* AND(val, 0x80000000) gives the sign bit.
1367 *
1368 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1369 * zero.
1370 */
1371 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1372
1373 op[0].type = BRW_REGISTER_TYPE_UD;
1374 result_dst.type = BRW_REGISTER_TYPE_UD;
1375 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1376
1377 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379
1380 this->result.type = BRW_REGISTER_TYPE_F;
1381 } else {
1382 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1383 * -> non-negative val generates 0x00000000.
1384 * Predicated OR sets 1 if val is positive.
1385 */
1386 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1387
1388 emit(ASR(result_dst, op[0], src_reg(31)));
1389
1390 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 }
1393 break;
1394
1395 case ir_unop_rcp:
1396 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1397 break;
1398
1399 case ir_unop_exp2:
1400 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1401 break;
1402 case ir_unop_log2:
1403 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1404 break;
1405 case ir_unop_exp:
1406 case ir_unop_log:
1407 unreachable("not reached: should be handled by ir_explog_to_explog2");
1408 case ir_unop_sin:
1409 case ir_unop_sin_reduced:
1410 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1411 break;
1412 case ir_unop_cos:
1413 case ir_unop_cos_reduced:
1414 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1415 break;
1416
1417 case ir_unop_dFdx:
1418 case ir_unop_dFdx_coarse:
1419 case ir_unop_dFdx_fine:
1420 case ir_unop_dFdy:
1421 case ir_unop_dFdy_coarse:
1422 case ir_unop_dFdy_fine:
1423 unreachable("derivatives not valid in vertex shader");
1424
1425 case ir_unop_bitfield_reverse:
1426 emit(BFREV(result_dst, op[0]));
1427 break;
1428 case ir_unop_bit_count:
1429 emit(CBIT(result_dst, op[0]));
1430 break;
1431 case ir_unop_find_msb: {
1432 src_reg temp = src_reg(this, glsl_type::uint_type);
1433
1434 inst = emit(FBH(dst_reg(temp), op[0]));
1435 inst->dst.writemask = WRITEMASK_XYZW;
1436
1437 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1438 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1439 * subtract the result from 31 to convert the MSB count into an LSB count.
1440 */
1441
1442 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1443 temp.swizzle = BRW_SWIZZLE_NOOP;
1444 emit(MOV(result_dst, temp));
1445
1446 src_reg src_tmp = src_reg(result_dst);
1447 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1448
1449 src_tmp.negate = true;
1450 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1451 inst->predicate = BRW_PREDICATE_NORMAL;
1452 break;
1453 }
1454 case ir_unop_find_lsb:
1455 emit(FBL(result_dst, op[0]));
1456 break;
1457 case ir_unop_saturate:
1458 inst = emit(MOV(result_dst, op[0]));
1459 inst->saturate = true;
1460 break;
1461
1462 case ir_unop_noise:
1463 unreachable("not reached: should be handled by lower_noise");
1464
1465 case ir_binop_add:
1466 emit(ADD(result_dst, op[0], op[1]));
1467 break;
1468 case ir_binop_sub:
1469 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1470
1471 case ir_binop_mul:
1472 if (brw->gen < 8 && ir->type->is_integer()) {
1473 /* For integer multiplication, the MUL uses the low 16 bits of one of
1474 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1475 * accumulates the contribution of the upper 16 bits of that
1476 * operand. If we can determine that one of the args is in the low
1477 * 16 bits, though, we can just emit a single MUL.
1478 */
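/* (That is why the general case below takes three instructions: MUL
 * computes a partial product and primes the accumulator, MACH folds in the
 * remaining contribution so the accumulator ends up holding the low 32
 * bits of the product -- MACH's own destination would receive the high 32
 * bits, as ir_binop_imul_high uses below -- and the final MOV copies the
 * accumulator into the destination.)
 */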
1479 if (ir->operands[0]->is_uint16_constant()) {
1480 if (brw->gen < 7)
1481 emit(MUL(result_dst, op[0], op[1]));
1482 else
1483 emit(MUL(result_dst, op[1], op[0]));
1484 } else if (ir->operands[1]->is_uint16_constant()) {
1485 if (brw->gen < 7)
1486 emit(MUL(result_dst, op[1], op[0]));
1487 else
1488 emit(MUL(result_dst, op[0], op[1]));
1489 } else {
1490 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1491
1492 emit(MUL(acc, op[0], op[1]));
1493 emit(MACH(dst_null_d(), op[0], op[1]));
1494 emit(MOV(result_dst, src_reg(acc)));
1495 }
1496 } else {
1497 emit(MUL(result_dst, op[0], op[1]));
1498 }
1499 break;
1500 case ir_binop_imul_high: {
1501 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1502
1503 emit(MUL(acc, op[0], op[1]));
1504 emit(MACH(result_dst, op[0], op[1]));
1505 break;
1506 }
1507 case ir_binop_div:
1508 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1509 assert(ir->type->is_integer());
1510 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1511 break;
1512 case ir_binop_carry: {
1513 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1514
1515 emit(ADDC(dst_null_ud(), op[0], op[1]));
1516 emit(MOV(result_dst, src_reg(acc)));
1517 break;
1518 }
1519 case ir_binop_borrow: {
1520 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1521
1522 emit(SUBB(dst_null_ud(), op[0], op[1]));
1523 emit(MOV(result_dst, src_reg(acc)));
1524 break;
1525 }
1526 case ir_binop_mod:
1527 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1528 assert(ir->type->is_integer());
1529 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1530 break;
1531
1532 case ir_binop_less:
1533 case ir_binop_greater:
1534 case ir_binop_lequal:
1535 case ir_binop_gequal:
1536 case ir_binop_equal:
1537 case ir_binop_nequal: {
1538 if (brw->gen <= 5) {
1539 resolve_bool_comparison(ir->operands[0], &op[0]);
1540 resolve_bool_comparison(ir->operands[1], &op[1]);
1541 }
1542 emit(CMP(result_dst, op[0], op[1],
1543 brw_conditional_for_comparison(ir->operation)));
1544 break;
1545 }
1546
1547 case ir_binop_all_equal:
1548 /* "==" operator producing a scalar boolean. */
1549 if (ir->operands[0]->type->is_vector() ||
1550 ir->operands[1]->type->is_vector()) {
1551 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1552 emit(MOV(result_dst, src_reg(0)));
1553 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1554 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1555 } else {
1556 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1557 }
1558 break;
1559 case ir_binop_any_nequal:
1560 /* "!=" operator producing a scalar boolean. */
1561 if (ir->operands[0]->type->is_vector() ||
1562 ir->operands[1]->type->is_vector()) {
1563 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1564
1565 emit(MOV(result_dst, src_reg(0)));
1566 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1567 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1568 } else {
1569 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1570 }
1571 break;
1572
1573 case ir_unop_any:
1574 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1575 emit(MOV(result_dst, src_reg(0)));
1576
1577 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1578 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1579 break;
1580
1581 case ir_binop_logic_xor:
1582 emit(XOR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_logic_or:
1586 emit(OR(result_dst, op[0], op[1]));
1587 break;
1588
1589 case ir_binop_logic_and:
1590 emit(AND(result_dst, op[0], op[1]));
1591 break;
1592
1593 case ir_binop_dot:
1594 assert(ir->operands[0]->type->is_vector());
1595 assert(ir->operands[0]->type == ir->operands[1]->type);
1596 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1597 break;
1598
1599 case ir_unop_sqrt:
1600 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1601 break;
1602 case ir_unop_rsq:
1603 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1604 break;
1605
1606 case ir_unop_bitcast_i2f:
1607 case ir_unop_bitcast_u2f:
1608 this->result = op[0];
1609 this->result.type = BRW_REGISTER_TYPE_F;
1610 break;
1611
1612 case ir_unop_bitcast_f2i:
1613 this->result = op[0];
1614 this->result.type = BRW_REGISTER_TYPE_D;
1615 break;
1616
1617 case ir_unop_bitcast_f2u:
1618 this->result = op[0];
1619 this->result.type = BRW_REGISTER_TYPE_UD;
1620 break;
1621
1622 case ir_unop_i2f:
1623 case ir_unop_i2u:
1624 case ir_unop_u2i:
1625 case ir_unop_u2f:
1626 case ir_unop_f2i:
1627 case ir_unop_f2u:
1628 emit(MOV(result_dst, op[0]));
1629 break;
1630 case ir_unop_b2i:
1631 emit(AND(result_dst, op[0], src_reg(1)));
1632 break;
1633 case ir_unop_b2f:
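/* A boolean here is 0 or ~0 (after resolve_bool_comparison on gen <= 5),
 * so ANDing with 0x3f800000 -- the bit pattern of 1.0f -- produces 0.0f or
 * 1.0f without needing a compare.
 */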
1634 if (brw->gen <= 5) {
1635 resolve_bool_comparison(ir->operands[0], &op[0]);
1636 }
1637 op[0].type = BRW_REGISTER_TYPE_D;
1638 result_dst.type = BRW_REGISTER_TYPE_D;
1639 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1640 result_dst.type = BRW_REGISTER_TYPE_F;
1641 break;
1642 case ir_unop_f2b:
1643 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1644 break;
1645 case ir_unop_i2b:
1646 emit(AND(result_dst, op[0], src_reg(1)));
1647 break;
1648
1649 case ir_unop_trunc:
1650 emit(RNDZ(result_dst, op[0]));
1651 break;
1652 case ir_unop_ceil:
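/* ceil(x) == -floor(-x): negate the source, round down, then mark the
 * result as negated.
 */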
1653 op[0].negate = !op[0].negate;
1654 inst = emit(RNDD(result_dst, op[0]));
1655 this->result.negate = true;
1656 break;
1657 case ir_unop_floor:
1658 inst = emit(RNDD(result_dst, op[0]));
1659 break;
1660 case ir_unop_fract:
1661 inst = emit(FRC(result_dst, op[0]));
1662 break;
1663 case ir_unop_round_even:
1664 emit(RNDE(result_dst, op[0]));
1665 break;
1666
1667 case ir_binop_min:
1668 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1669 break;
1670 case ir_binop_max:
1671 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1672 break;
1673
1674 case ir_binop_pow:
1675 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1676 break;
1677
1678 case ir_unop_bit_not:
1679 inst = emit(NOT(result_dst, op[0]));
1680 break;
1681 case ir_binop_bit_and:
1682 inst = emit(AND(result_dst, op[0], op[1]));
1683 break;
1684 case ir_binop_bit_xor:
1685 inst = emit(XOR(result_dst, op[0], op[1]));
1686 break;
1687 case ir_binop_bit_or:
1688 inst = emit(OR(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_lshift:
1692 inst = emit(SHL(result_dst, op[0], op[1]));
1693 break;
1694
1695 case ir_binop_rshift:
1696 if (ir->type->base_type == GLSL_TYPE_INT)
1697 inst = emit(ASR(result_dst, op[0], op[1]));
1698 else
1699 inst = emit(SHR(result_dst, op[0], op[1]));
1700 break;
1701
1702 case ir_binop_bfm:
1703 emit(BFI1(result_dst, op[0], op[1]));
1704 break;
1705
1706 case ir_binop_ubo_load: {
1707 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1708 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1709 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1710 src_reg offset;
1711
1712 /* Now, load the vector from that offset. */
1713 assert(ir->type->is_vector() || ir->type->is_scalar());
1714
1715 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1716 packed_consts.type = result.type;
1717 src_reg surf_index;
1718
1719 if (const_uniform_block) {
1720 /* The block index is a constant, so just emit the binding table entry
1721 * as an immediate.
1722 */
1723 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1724 const_uniform_block->value.u[0]);
1725 } else {
1726 /* The block index is not a constant. Evaluate the index expression
1727 * per-channel and add the base UBO index; the generator will select
1728 * a value from any live channel.
1729 */
1730 surf_index = src_reg(this, glsl_type::uint_type);
1731 emit(ADD(dst_reg(surf_index), op[0],
1732 src_reg(prog_data->base.binding_table.ubo_start)));
1733
1734 /* Assume this may touch any UBO. It would be nice to provide
1735 * a tighter bound, but the array information is already lowered away.
1736 */
1737 brw_mark_surface_used(&prog_data->base,
1738 prog_data->base.binding_table.ubo_start +
1739 shader_prog->NumUniformBlocks - 1);
1740 }
1741
1742 if (const_offset_ir) {
1743 if (brw->gen >= 8) {
1744 /* Store the offset in a GRF so we can send-from-GRF. */
1745 offset = src_reg(this, glsl_type::int_type);
1746 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1747 } else {
1748 /* Immediates are fine on older generations since they'll be moved
1749 * to a (potentially fake) MRF at the generator level.
1750 */
1751 offset = src_reg(const_offset / 16);
1752 }
1753 } else {
1754 offset = src_reg(this, glsl_type::uint_type);
1755 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1756 }
1757
1758 if (brw->gen >= 7) {
1759 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1760 grf_offset.type = offset.type;
1761
1762 emit(MOV(grf_offset, offset));
1763
1764 emit(new(mem_ctx) vec4_instruction(this,
1765 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1766 dst_reg(packed_consts),
1767 surf_index,
1768 src_reg(grf_offset)));
1769 } else {
1770 vec4_instruction *pull =
1771 emit(new(mem_ctx) vec4_instruction(this,
1772 VS_OPCODE_PULL_CONSTANT_LOAD,
1773 dst_reg(packed_consts),
1774 surf_index,
1775 offset));
1776 pull->base_mrf = 14;
1777 pull->mlen = 1;
1778 }
1779
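/* A pull-constant load fetches an aligned 16-byte vec4, so when the
 * constant offset isn't 16-byte aligned the swizzle below picks out the
 * right dwords. For example, a float at const_offset == 20 reads the vec4
 * at byte 16 and then broadcasts component 1 (20 % 16 / 4) to every
 * channel.
 */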
1780 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1781 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1782 const_offset % 16 / 4,
1783 const_offset % 16 / 4,
1784 const_offset % 16 / 4);
1785
1786 /* UBO bools are any nonzero int. We need to convert them to use the
1787 * value of true stored in ctx->Const.UniformBooleanTrue.
1788 */
1789 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1790 emit(CMP(result_dst, packed_consts, src_reg(0u),
1791 BRW_CONDITIONAL_NZ));
1792 } else {
1793 emit(MOV(result_dst, packed_consts));
1794 }
1795 break;
1796 }
1797
1798 case ir_binop_vector_extract:
1799 unreachable("should have been lowered by vec_index_to_cond_assign");
1800
1801 case ir_triop_fma:
1802 op[0] = fix_3src_operand(op[0]);
1803 op[1] = fix_3src_operand(op[1]);
1804 op[2] = fix_3src_operand(op[2]);
1805 /* Note that the instruction's argument order is reversed from GLSL
1806 * and the IR.
1807 */
1808 emit(MAD(result_dst, op[2], op[1], op[0]));
1809 break;
1810
1811 case ir_triop_lrp:
1812 emit_lrp(result_dst, op[0], op[1], op[2]);
1813 break;
1814
1815 case ir_triop_csel:
1816 unreachable("already handled above");
1817 break;
1818
1819 case ir_triop_bfi:
1820 op[0] = fix_3src_operand(op[0]);
1821 op[1] = fix_3src_operand(op[1]);
1822 op[2] = fix_3src_operand(op[2]);
1823 emit(BFI2(result_dst, op[0], op[1], op[2]));
1824 break;
1825
1826 case ir_triop_bitfield_extract:
1827 op[0] = fix_3src_operand(op[0]);
1828 op[1] = fix_3src_operand(op[1]);
1829 op[2] = fix_3src_operand(op[2]);
1830 /* Note that the instruction's argument order is reversed from GLSL
1831 * and the IR.
1832 */
1833 emit(BFE(result_dst, op[2], op[1], op[0]));
1834 break;
1835
1836 case ir_triop_vector_insert:
1837 unreachable("should have been lowered by lower_vector_insert");
1838
1839 case ir_quadop_bitfield_insert:
1840 unreachable("not reached: should be handled by "
1841 "bitfield_insert_to_bfm_bfi\n");
1842
1843 case ir_quadop_vector:
1844 unreachable("not reached: should be handled by lower_quadop_vector");
1845
1846 case ir_unop_pack_half_2x16:
1847 emit_pack_half_2x16(result_dst, op[0]);
1848 break;
1849 case ir_unop_unpack_half_2x16:
1850 emit_unpack_half_2x16(result_dst, op[0]);
1851 break;
1852 case ir_unop_unpack_unorm_4x8:
1853 emit_unpack_unorm_4x8(result_dst, op[0]);
1854 break;
1855 case ir_unop_unpack_snorm_4x8:
1856 emit_unpack_snorm_4x8(result_dst, op[0]);
1857 break;
1858 case ir_unop_pack_unorm_4x8:
1859 emit_pack_unorm_4x8(result_dst, op[0]);
1860 break;
1861 case ir_unop_pack_snorm_4x8:
1862 emit_pack_snorm_4x8(result_dst, op[0]);
1863 break;
1864 case ir_unop_pack_snorm_2x16:
1865 case ir_unop_pack_unorm_2x16:
1866 case ir_unop_unpack_snorm_2x16:
1867 case ir_unop_unpack_unorm_2x16:
1868 unreachable("not reached: should be handled by lower_packing_builtins");
1869 case ir_unop_unpack_half_2x16_split_x:
1870 case ir_unop_unpack_half_2x16_split_y:
1871 case ir_binop_pack_half_2x16_split:
1872 case ir_unop_interpolate_at_centroid:
1873 case ir_binop_interpolate_at_sample:
1874 case ir_binop_interpolate_at_offset:
1875 unreachable("not reached: should not occur in vertex shader");
1876 case ir_binop_ldexp:
1877 unreachable("not reached: should be handled by ldexp_to_arith()");
1878 }
1879 }
1880
1881
1882 void
1883 vec4_visitor::visit(ir_swizzle *ir)
1884 {
1885 src_reg src;
1886 int i = 0;
1887 int swizzle[4];
1888
1889 /* Note that this only handles swizzles in expressions, not those on the left
1890 * hand side of an assignment, which do write masking. See ir_assignment
1891 * for that.
1892 */
1893
1894 ir->val->accept(this);
1895 src = this->result;
1896 assert(src.file != BAD_FILE);
1897
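/* Compose the IR swizzle with whatever swizzle is already on the source;
 * e.g. taking .y of a value that is itself v.zw reads channel w of v.
 */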
1898 for (i = 0; i < ir->type->vector_elements; i++) {
1899 switch (i) {
1900 case 0:
1901 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1902 break;
1903 case 1:
1904 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1905 break;
1906 case 2:
1907 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1908 break;
1909 case 3:
1910 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1911 break;
1912 }
1913 }
1914 for (; i < 4; i++) {
1915 /* Replicate the last channel out. */
1916 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1917 }
1918
1919 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1920
1921 this->result = src;
1922 }
1923
1924 void
1925 vec4_visitor::visit(ir_dereference_variable *ir)
1926 {
1927 const struct glsl_type *type = ir->type;
1928 dst_reg *reg = variable_storage(ir->var);
1929
1930 if (!reg) {
1931 fail("Failed to find variable storage for %s\n", ir->var->name);
1932 this->result = src_reg(brw_null_reg());
1933 return;
1934 }
1935
1936 this->result = src_reg(*reg);
1937
1938 /* System values get their swizzle from the dst_reg writemask */
1939 if (ir->var->data.mode == ir_var_system_value)
1940 return;
1941
1942 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1943 this->result.swizzle = swizzle_for_size(type->vector_elements);
1944 }
1945
1946
1947 int
1948 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1949 {
1950 /* Under normal circumstances array elements are stored consecutively, so
1951 * the stride is equal to the size of the array element.
1952 */
1953 return type_size(ir->type);
1954 }
1955
1956
1957 void
1958 vec4_visitor::visit(ir_dereference_array *ir)
1959 {
1960 ir_constant *constant_index;
1961 src_reg src;
1962 int array_stride = compute_array_stride(ir);
1963
1964 constant_index = ir->array_index->constant_expression_value();
1965
1966 ir->array->accept(this);
1967 src = this->result;
1968
1969 if (constant_index) {
1970 src.reg_offset += constant_index->value.i[0] * array_stride;
1971 } else {
1972 /* Variable index array dereference. It eats the "vec4" of the
1973 * base of the array and an index that offsets the Mesa register
1974 * index.
1975 */
1976 ir->array_index->accept(this);
1977
1978 src_reg index_reg;
1979
1980 if (array_stride == 1) {
1981 index_reg = this->result;
1982 } else {
1983 index_reg = src_reg(this, glsl_type::int_type);
1984
1985 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1986 }
1987
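/* If the base of the array was itself variably indexed (e.g. a[i][j]),
 * fold its existing relative offset into the new index.
 */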
1988 if (src.reladdr) {
1989 src_reg temp = src_reg(this, glsl_type::int_type);
1990
1991 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1992
1993 index_reg = temp;
1994 }
1995
1996 src.reladdr = ralloc(mem_ctx, src_reg);
1997 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1998 }
1999
2000 /* If the type is smaller than a vec4, replicate the last channel out. */
2001 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2002 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2003 else
2004 src.swizzle = BRW_SWIZZLE_NOOP;
2005 src.type = brw_type_for_base_type(ir->type);
2006
2007 this->result = src;
2008 }
2009
2010 void
2011 vec4_visitor::visit(ir_dereference_record *ir)
2012 {
2013 unsigned int i;
2014 const glsl_type *struct_type = ir->record->type;
2015 int offset = 0;
2016
2017 ir->record->accept(this);
2018
2019 for (i = 0; i < struct_type->length; i++) {
2020 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2021 break;
2022 offset += type_size(struct_type->fields.structure[i].type);
2023 }
2024
2025 /* If the type is smaller than a vec4, replicate the last channel out. */
2026 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2027 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2028 else
2029 this->result.swizzle = BRW_SWIZZLE_NOOP;
2030 this->result.type = brw_type_for_base_type(ir->type);
2031
2032 this->result.reg_offset += offset;
2033 }
2034
2035 /**
2036 * We want to be careful in assignment setup to hit the actual storage
2037 * instead of potentially using a temporary like we might with the
2038 * ir_dereference handler.
2039 */
2040 static dst_reg
2041 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2042 {
2043 /* The LHS must be a dereference. If the LHS is a variable indexed array
2044 * access of a vector, it must be separated into a series of conditional moves
2045 * before reaching this point (see ir_vec_index_to_cond_assign).
2046 */
2047 assert(ir->as_dereference());
2048 ir_dereference_array *deref_array = ir->as_dereference_array();
2049 if (deref_array) {
2050 assert(!deref_array->array->type->is_vector());
2051 }
2052
2053 /* Use the rvalue deref handler for the most part. We'll ignore
2054 * swizzles in it and write swizzles using writemask, though.
2055 */
2056 ir->accept(v);
2057 return dst_reg(v->result);
2058 }
2059
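/**
 * Copy a (possibly aggregate) value one vec4 at a time, recursing through
 * struct fields, array elements and matrix columns until scalar/vector
 * leaves are reached, and advancing the register offsets as it goes.
 */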
2060 void
2061 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2062 const struct glsl_type *type,
2063 enum brw_predicate predicate)
2064 {
2065 if (type->base_type == GLSL_TYPE_STRUCT) {
2066 for (unsigned int i = 0; i < type->length; i++) {
2067 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2068 }
2069 return;
2070 }
2071
2072 if (type->is_array()) {
2073 for (unsigned int i = 0; i < type->length; i++) {
2074 emit_block_move(dst, src, type->fields.array, predicate);
2075 }
2076 return;
2077 }
2078
2079 if (type->is_matrix()) {
2080 const struct glsl_type *vec_type;
2081
2082 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2083 type->vector_elements, 1);
2084
2085 for (int i = 0; i < type->matrix_columns; i++) {
2086 emit_block_move(dst, src, vec_type, predicate);
2087 }
2088 return;
2089 }
2090
2091 assert(type->is_scalar() || type->is_vector());
2092
2093 dst->type = brw_type_for_base_type(type);
2094 src->type = dst->type;
2095
2096 dst->writemask = (1 << type->vector_elements) - 1;
2097
2098 src->swizzle = swizzle_for_size(type->vector_elements);
2099
2100 vec4_instruction *inst = emit(MOV(*dst, *src));
2101 inst->predicate = predicate;
2102
2103 dst->reg_offset++;
2104 src->reg_offset++;
2105 }
2106
2107
2108 /* If the RHS processing resulted in an instruction generating a
2109 * temporary value, and it would be easy to rewrite the instruction to
2110 * generate its result right into the LHS instead, do so. This ends
2111 * up reliably removing instructions where it can be tricky to do so
2112 * later without real UD chain information.
2113 */
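/* For example, "x = a + b" generates "ADD temp, a, b"; rather than following
 * it with "MOV x, temp", we retarget the ADD to write x directly and skip
 * the MOV entirely.
 */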
2114 bool
2115 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2116 dst_reg dst,
2117 src_reg src,
2118 vec4_instruction *pre_rhs_inst,
2119 vec4_instruction *last_rhs_inst)
2120 {
2121 /* This could be supported, but it would take more smarts. */
2122 if (ir->condition)
2123 return false;
2124
2125 if (pre_rhs_inst == last_rhs_inst)
2126 return false; /* No instructions generated to work with. */
2127
2128 /* Make sure the last instruction generated our source reg. */
2129 if (src.file != GRF ||
2130 src.file != last_rhs_inst->dst.file ||
2131 src.reg != last_rhs_inst->dst.reg ||
2132 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2133 src.reladdr ||
2134 src.abs ||
2135 src.negate ||
2136 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2137 return false;
2138
2139 /* Check that the last instruction fully initialized the channels
2140 * we want to use, in the order we want to use them. We could
2141 * potentially reswizzle the operands of many instructions so that
2142 * we could handle out of order channels, but don't yet.
2143 */
2144
2145 for (unsigned i = 0; i < 4; i++) {
2146 if (dst.writemask & (1 << i)) {
2147 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2148 return false;
2149
2150 if (BRW_GET_SWZ(src.swizzle, i) != i)
2151 return false;
2152 }
2153 }
2154
2155 /* Success! Rewrite the instruction. */
2156 last_rhs_inst->dst.file = dst.file;
2157 last_rhs_inst->dst.reg = dst.reg;
2158 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2159 last_rhs_inst->dst.reladdr = dst.reladdr;
2160 last_rhs_inst->dst.writemask &= dst.writemask;
2161
2162 return true;
2163 }
2164
2165 void
2166 vec4_visitor::visit(ir_assignment *ir)
2167 {
2168 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2169 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2170
2171 if (!ir->lhs->type->is_scalar() &&
2172 !ir->lhs->type->is_vector()) {
2173 ir->rhs->accept(this);
2174 src_reg src = this->result;
2175
2176 if (ir->condition) {
2177 emit_bool_to_cond_code(ir->condition, &predicate);
2178 }
2179
2180 /* emit_block_move doesn't account for swizzles in the source register.
2181 * This should be ok, since the source register is a structure or an
2182 * array, and those can't be swizzled. But double-check to be sure.
2183 */
2184 assert(src.swizzle ==
2185 (ir->rhs->type->is_matrix()
2186 ? swizzle_for_size(ir->rhs->type->vector_elements)
2187 : BRW_SWIZZLE_NOOP));
2188
2189 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2190 return;
2191 }
2192
2193 /* Now we're down to just a scalar/vector with writemasks. */
2194 int i;
2195
2196 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2197 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2198
2199 ir->rhs->accept(this);
2200
2201 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2202
2203 src_reg src = this->result;
2204
2205 int swizzles[4];
2206 int first_enabled_chan = 0;
2207 int src_chan = 0;
2208
2209 assert(ir->lhs->type->is_vector() ||
2210 ir->lhs->type->is_scalar());
2211 dst.writemask = ir->write_mask;
2212
2213 for (int i = 0; i < 4; i++) {
2214 if (dst.writemask & (1 << i)) {
2215 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2216 break;
2217 }
2218 }
2219
2220 /* Swizzle a small RHS vector into the channels being written.
2221 *
2222 * glsl ir treats write_mask as dictating how many channels are
2223 * present on the RHS, while in our instructions we need to make
2224 * those channels appear in the slots of the vec4 they're written to.
2225 */
2226 for (int i = 0; i < 4; i++) {
2227 if (dst.writemask & (1 << i))
2228 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2229 else
2230 swizzles[i] = first_enabled_chan;
2231 }
2232 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2233 swizzles[2], swizzles[3]);
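/* For example, for "v.zw = foo.xy" the write mask is ZW and the vec2 RHS
 * (read as .xyyy) becomes .yyxy, so RHS .x lands in the z slot and .y in
 * the w slot; the unwritten slots just replicate an enabled channel.
 */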
2234
2235 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2236 return;
2237 }
2238
2239 if (ir->condition) {
2240 emit_bool_to_cond_code(ir->condition, &predicate);
2241 }
2242
2243 for (i = 0; i < type_size(ir->lhs->type); i++) {
2244 vec4_instruction *inst = emit(MOV(dst, src));
2245 inst->predicate = predicate;
2246
2247 dst.reg_offset++;
2248 src.reg_offset++;
2249 }
2250 }
2251
2252 void
2253 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2254 {
2255 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2256 foreach_in_list(ir_constant, field_value, &ir->components) {
2257 emit_constant_values(dst, field_value);
2258 }
2259 return;
2260 }
2261
2262 if (ir->type->is_array()) {
2263 for (unsigned int i = 0; i < ir->type->length; i++) {
2264 emit_constant_values(dst, ir->array_elements[i]);
2265 }
2266 return;
2267 }
2268
2269 if (ir->type->is_matrix()) {
2270 for (int i = 0; i < ir->type->matrix_columns; i++) {
2271 float *vec = &ir->value.f[i * ir->type->vector_elements];
2272
2273 for (int j = 0; j < ir->type->vector_elements; j++) {
2274 dst->writemask = 1 << j;
2275 dst->type = BRW_REGISTER_TYPE_F;
2276
2277 emit(MOV(*dst, src_reg(vec[j])));
2278 }
2279 dst->reg_offset++;
2280 }
2281 return;
2282 }
2283
2284 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2285
2286 for (int i = 0; i < ir->type->vector_elements; i++) {
2287 if (!(remaining_writemask & (1 << i)))
2288 continue;
2289
2290 dst->writemask = 1 << i;
2291 dst->type = brw_type_for_base_type(ir->type);
2292
2293 /* Find other components that match the one we're about to
2294 * write. Emits fewer instructions for things like vec4(0.5,
2295 * 1.5, 1.5, 1.5).
2296 */
2297 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2298 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2299 if (ir->value.b[i] == ir->value.b[j])
2300 dst->writemask |= (1 << j);
2301 } else {
2302 /* u, i, and f storage all line up, so no need for a
2303 * switch case for comparing each type.
2304 */
2305 if (ir->value.u[i] == ir->value.u[j])
2306 dst->writemask |= (1 << j);
2307 }
2308 }
2309
2310 switch (ir->type->base_type) {
2311 case GLSL_TYPE_FLOAT:
2312 emit(MOV(*dst, src_reg(ir->value.f[i])));
2313 break;
2314 case GLSL_TYPE_INT:
2315 emit(MOV(*dst, src_reg(ir->value.i[i])));
2316 break;
2317 case GLSL_TYPE_UINT:
2318 emit(MOV(*dst, src_reg(ir->value.u[i])));
2319 break;
2320 case GLSL_TYPE_BOOL:
2321 emit(MOV(*dst,
2322 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2323 : 0)));
2324 break;
2325 default:
2326 unreachable("Non-float/uint/int/bool constant");
2327 }
2328
2329 remaining_writemask &= ~dst->writemask;
2330 }
2331 dst->reg_offset++;
2332 }
2333
2334 void
2335 vec4_visitor::visit(ir_constant *ir)
2336 {
2337 dst_reg dst = dst_reg(this, ir->type);
2338 this->result = src_reg(dst);
2339
2340 emit_constant_values(&dst, ir);
2341 }
2342
2343 void
2344 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2345 {
2346 ir_dereference *deref = static_cast<ir_dereference *>(
2347 ir->actual_parameters.get_head());
2348 ir_variable *location = deref->variable_referenced();
2349 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2350 location->data.binding);
2351
2352 /* Calculate the surface offset */
2353 src_reg offset(this, glsl_type::uint_type);
2354 ir_dereference_array *deref_array = deref->as_dereference_array();
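/* For an array of counters the offset is array_index * ATOMIC_COUNTER_SIZE
 * plus the counter's declared offset; a non-array counter just uses its
 * declared offset directly.
 */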
2355 if (deref_array) {
2356 deref_array->array_index->accept(this);
2357
2358 src_reg tmp(this, glsl_type::uint_type);
2359 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2360 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2361 } else {
2362 offset = location->data.atomic.offset;
2363 }
2364
2365 /* Emit the appropriate machine instruction */
2366 const char *callee = ir->callee->function_name();
2367 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2368
2369 if (!strcmp("__intrinsic_atomic_read", callee)) {
2370 emit_untyped_surface_read(surf_index, dst, offset);
2371
2372 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2373 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2374 src_reg(), src_reg());
2375
2376 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2377 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2378 src_reg(), src_reg());
2379 }
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_call *ir)
2384 {
2385 const char *callee = ir->callee->function_name();
2386
2387 if (!strcmp("__intrinsic_atomic_read", callee) ||
2388 !strcmp("__intrinsic_atomic_increment", callee) ||
2389 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2390 visit_atomic_counter_intrinsic(ir);
2391 } else {
2392 unreachable("Unsupported intrinsic.");
2393 }
2394 }
2395
2396 src_reg
2397 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2398 {
2399 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2400 inst->base_mrf = 2;
2401 inst->mlen = 1;
2402 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2403 inst->dst.writemask = WRITEMASK_XYZW;
2404
2405 inst->src[1] = sampler;
2406
2407 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2408 int param_base = inst->base_mrf;
2409 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2410 int zero_mask = 0xf & ~coord_mask;
2411
2412 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2413 coordinate));
2414
2415 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2416 src_reg(0)));
2417
2418 emit(inst);
2419 return src_reg(inst->dst);
2420 }
2421
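/**
 * A "high" sampler is one whose index may be 16 or greater (any
 * non-immediate index might be); such samplers need the message header
 * and are only usable on Haswell and Gen8+.
 */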
2422 static bool
2423 is_high_sampler(struct brw_context *brw, src_reg sampler)
2424 {
2425 if (brw->gen < 8 && !brw->is_haswell)
2426 return false;
2427
2428 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2429 }
2430
2431 void
2432 vec4_visitor::visit(ir_texture *ir)
2433 {
2434 uint32_t sampler =
2435 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2436
2437 ir_rvalue *nonconst_sampler_index =
2438 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2439
2440 /* Handle non-constant sampler array indexing */
2441 src_reg sampler_reg;
2442 if (nonconst_sampler_index) {
2443 /* The highest sampler which may be used by this operation is
2444 * the last element of the array. Mark it here, because the generator
2445 * doesn't have enough information to determine the bound.
2446 */
2447 uint32_t array_size = ir->sampler->as_dereference_array()
2448 ->array->type->array_size();
2449
2450 uint32_t max_used = sampler + array_size - 1;
2451 if (ir->op == ir_tg4 && brw->gen < 8) {
2452 max_used += prog_data->base.binding_table.gather_texture_start;
2453 } else {
2454 max_used += prog_data->base.binding_table.texture_start;
2455 }
2456
2457 brw_mark_surface_used(&prog_data->base, max_used);
2458
2459 /* Emit code to evaluate the actual indexing expression */
2460 nonconst_sampler_index->accept(this);
2461 dst_reg temp(this, glsl_type::uint_type);
2462 emit(ADD(temp, this->result, src_reg(sampler)))
2463 ->force_writemask_all = true;
2464 sampler_reg = src_reg(temp);
2465 } else {
2466 /* Single sampler, or constant array index; the indexing expression
2467 * is just an immediate.
2468 */
2469 sampler_reg = src_reg(sampler);
2470 }
2471
2472 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2473 * emitting anything other than setting up the constant result.
2474 */
2475 if (ir->op == ir_tg4) {
2476 ir_constant *chan = ir->lod_info.component->as_constant();
2477 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2478 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2479 dst_reg result(this, ir->type);
2480 this->result = src_reg(result);
2481 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2482 return;
2483 }
2484 }
2485
2486 /* Should be lowered by do_lower_texture_projection */
2487 assert(!ir->projector);
2488
2489 /* Should be lowered */
2490 assert(!ir->offset || !ir->offset->type->is_array());
2491
2492 /* Generate code to compute all the subexpression trees. This has to be
2493 * done before loading any values into MRFs for the sampler message since
2494 * generating these values may involve SEND messages that need the MRFs.
2495 */
2496 src_reg coordinate;
2497 if (ir->coordinate) {
2498 ir->coordinate->accept(this);
2499 coordinate = this->result;
2500 }
2501
2502 src_reg shadow_comparitor;
2503 if (ir->shadow_comparitor) {
2504 ir->shadow_comparitor->accept(this);
2505 shadow_comparitor = this->result;
2506 }
2507
2508 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2509 src_reg offset_value;
2510 if (has_nonconstant_offset) {
2511 ir->offset->accept(this);
2512 offset_value = src_reg(this->result);
2513 }
2514
2515 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2516 src_reg lod, dPdx, dPdy, sample_index, mcs;
2517 switch (ir->op) {
2518 case ir_tex:
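/* Vertex shaders have no implicit derivatives, so plain texture() is
 * implemented as an explicit-LOD sample with lod forced to 0 (TXL below).
 */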
2519 lod = src_reg(0.0f);
2520 lod_type = glsl_type::float_type;
2521 break;
2522 case ir_txf:
2523 case ir_txl:
2524 case ir_txs:
2525 ir->lod_info.lod->accept(this);
2526 lod = this->result;
2527 lod_type = ir->lod_info.lod->type;
2528 break;
2529 case ir_query_levels:
2530 lod = src_reg(0);
2531 lod_type = glsl_type::int_type;
2532 break;
2533 case ir_txf_ms:
2534 ir->lod_info.sample_index->accept(this);
2535 sample_index = this->result;
2536 sample_index_type = ir->lod_info.sample_index->type;
2537
2538 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2539 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2540 else
2541 mcs = src_reg(0u);
2542 break;
2543 case ir_txd:
2544 ir->lod_info.grad.dPdx->accept(this);
2545 dPdx = this->result;
2546
2547 ir->lod_info.grad.dPdy->accept(this);
2548 dPdy = this->result;
2549
2550 lod_type = ir->lod_info.grad.dPdx->type;
2551 break;
2552 case ir_txb:
2553 case ir_lod:
2554 case ir_tg4:
2555 break;
2556 }
2557
2558 enum opcode opcode;
2559 switch (ir->op) {
2560 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2561 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2562 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2563 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2564 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2565 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2566 case ir_tg4: opcode = has_nonconstant_offset
2567 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2568 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2569 case ir_txb:
2570 unreachable("TXB is not valid for vertex shaders.");
2571 case ir_lod:
2572 unreachable("LOD is not valid for vertex shaders.");
2573 default:
2574 unreachable("Unrecognized tex op");
2575 }
2576
2577 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2578
2579 if (ir->offset != NULL && !has_nonconstant_offset) {
2580 inst->offset =
2581 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2582 ir->offset->type->vector_elements);
2583 }
2584
2585 /* Stuff the channel select bits in the top of the texture offset */
2586 if (ir->op == ir_tg4)
2587 inst->offset |= gather_channel(ir, sampler) << 16;
2588
2589 /* The message header is necessary for:
2590 * - Gen4 (always)
2591 * - Texel offsets
2592 * - Gather channel selection
2593 * - Sampler indices too large to fit in a 4-bit value.
2594 */
2595 inst->header_present =
2596 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2597 is_high_sampler(brw, sampler_reg);
2598 inst->base_mrf = 2;
2599 inst->mlen = inst->header_present + 1; /* always at least one */
2600 inst->dst = dst_reg(this, ir->type);
2601 inst->dst.writemask = WRITEMASK_XYZW;
2602 inst->shadow_compare = ir->shadow_comparitor != NULL;
2603
2604 inst->src[1] = sampler_reg;
2605
2606 /* MRF for the first parameter */
2607 int param_base = inst->base_mrf + inst->header_present;
2608
2609 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2610 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2611 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2612 } else {
2613 /* Load the coordinate */
2614 /* FINISHME: gl_clamp_mask and saturate */
2615 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2616 int zero_mask = 0xf & ~coord_mask;
2617
2618 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2619 coordinate));
2620
2621 if (zero_mask != 0) {
2622 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2623 src_reg(0)));
2624 }
2625 /* Load the shadow comparitor */
2626 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2627 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2628 WRITEMASK_X),
2629 shadow_comparitor));
2630 inst->mlen++;
2631 }
2632
2633 /* Load the LOD info */
2634 if (ir->op == ir_tex || ir->op == ir_txl) {
2635 int mrf, writemask;
2636 if (brw->gen >= 5) {
2637 mrf = param_base + 1;
2638 if (ir->shadow_comparitor) {
2639 writemask = WRITEMASK_Y;
2640 /* mlen already incremented */
2641 } else {
2642 writemask = WRITEMASK_X;
2643 inst->mlen++;
2644 }
2645 } else /* brw->gen == 4 */ {
2646 mrf = param_base;
2647 writemask = WRITEMASK_W;
2648 }
2649 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2650 } else if (ir->op == ir_txf) {
2651 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2652 } else if (ir->op == ir_txf_ms) {
2653 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2654 sample_index));
2655 if (brw->gen >= 7) {
2656 /* MCS data is in the first channel of `mcs`, but we need to get it into
2657 * the .y channel of the second vec4 of params, so replicate .x across
2658 * the whole vec4 and then mask off everything except .y
2659 */
2660 mcs.swizzle = BRW_SWIZZLE_XXXX;
2661 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2662 mcs));
2663 }
2664 inst->mlen++;
2665 } else if (ir->op == ir_txd) {
2666 const glsl_type *type = lod_type;
2667
2668 if (brw->gen >= 5) {
2669 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2670 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2671 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2672 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2673 inst->mlen++;
2674
2675 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2676 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2677 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2678 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2680 inst->mlen++;
2681
2682 if (ir->shadow_comparitor) {
2683 emit(MOV(dst_reg(MRF, param_base + 2,
2684 ir->shadow_comparitor->type, WRITEMASK_Z),
2685 shadow_comparitor));
2686 }
2687 }
2688 } else /* brw->gen == 4 */ {
2689 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2690 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2691 inst->mlen += 2;
2692 }
2693 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2694 if (ir->shadow_comparitor) {
2695 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2696 shadow_comparitor));
2697 }
2698
2699 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2700 offset_value));
2701 inst->mlen++;
2702 }
2703 }
2704
2705 emit(inst);
2706
2707 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2708 * spec requires layers.
2709 */
2710 if (ir->op == ir_txs) {
2711 glsl_type const *type = ir->sampler->type;
2712 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2713 type->sampler_array) {
2714 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2715 writemask(inst->dst, WRITEMASK_Z),
2716 src_reg(inst->dst), src_reg(6));
2717 }
2718 }
2719
2720 if (brw->gen == 6 && ir->op == ir_tg4) {
2721 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2722 }
2723
2724 swizzle_result(ir, src_reg(inst->dst), sampler);
2725 }
2726
2727 /**
2728 * Apply workarounds for Gen6 gather with UINT/SINT
2729 */
2730 void
2731 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2732 {
2733 if (!wa)
2734 return;
2735
2736 int width = (wa & WA_8BIT) ? 8 : 16;
2737 dst_reg dst_f = dst;
2738 dst_f.type = BRW_REGISTER_TYPE_F;
2739
2740 /* Convert from UNORM to UINT */
2741 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2742 emit(MOV(dst, src_reg(dst_f)));
2743
2744 if (wa & WA_SIGN) {
2745 /* Reinterpret the UINT value as a signed INT value by
2746 * shifting the sign bit into place, then shifting back
2747 * preserving sign.
2748 */
2749 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2750 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2751 }
2752 }
2753
2754 /**
2755 * Set up the gather channel based on the swizzle, for gather4.
2756 */
2757 uint32_t
2758 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2759 {
2760 ir_constant *chan = ir->lod_info.component->as_constant();
2761 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2762 switch (swiz) {
2763 case SWIZZLE_X: return 0;
2764 case SWIZZLE_Y:
2765 /* gather4 sampler is broken for green channel on RG32F --
2766 * we must ask for blue instead.
2767 */
2768 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2769 return 2;
2770 return 1;
2771 case SWIZZLE_Z: return 2;
2772 case SWIZZLE_W: return 3;
2773 default:
2774 unreachable("Not reached"); /* zero, one swizzles handled already */
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2780 {
2781 int s = key->tex.swizzles[sampler];
2782
2783 this->result = src_reg(this, ir->type);
2784 dst_reg swizzled_result(this->result);
2785
2786 if (ir->op == ir_query_levels) {
2787 /* # levels is in .w */
2788 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2789 emit(MOV(swizzled_result, orig_val));
2790 return;
2791 }
2792
2793 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2794 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2795 emit(MOV(swizzled_result, orig_val));
2796 return;
2797 }
2798
2799
2800 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2801 int swizzle[4] = {0};
2802
2803 for (int i = 0; i < 4; i++) {
2804 switch (GET_SWZ(s, i)) {
2805 case SWIZZLE_ZERO:
2806 zero_mask |= (1 << i);
2807 break;
2808 case SWIZZLE_ONE:
2809 one_mask |= (1 << i);
2810 break;
2811 default:
2812 copy_mask |= (1 << i);
2813 swizzle[i] = GET_SWZ(s, i);
2814 break;
2815 }
2816 }
2817
2818 if (copy_mask) {
2819 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2820 swizzled_result.writemask = copy_mask;
2821 emit(MOV(swizzled_result, orig_val));
2822 }
2823
2824 if (zero_mask) {
2825 swizzled_result.writemask = zero_mask;
2826 emit(MOV(swizzled_result, src_reg(0.0f)));
2827 }
2828
2829 if (one_mask) {
2830 swizzled_result.writemask = one_mask;
2831 emit(MOV(swizzled_result, src_reg(1.0f)));
2832 }
2833 }
2834
2835 void
2836 vec4_visitor::visit(ir_return *)
2837 {
2838 unreachable("not reached");
2839 }
2840
2841 void
2842 vec4_visitor::visit(ir_discard *)
2843 {
2844 unreachable("not reached");
2845 }
2846
2847 void
2848 vec4_visitor::visit(ir_if *ir)
2849 {
2850 /* Don't point the annotation at the if statement, because then it plus
2851 * the then and else blocks get printed.
2852 */
2853 this->base_ir = ir->condition;
2854
2855 if (brw->gen == 6) {
2856 emit_if_gen6(ir);
2857 } else {
2858 enum brw_predicate predicate;
2859 emit_bool_to_cond_code(ir->condition, &predicate);
2860 emit(IF(predicate));
2861 }
2862
2863 visit_instructions(&ir->then_instructions);
2864
2865 if (!ir->else_instructions.is_empty()) {
2866 this->base_ir = ir->condition;
2867 emit(BRW_OPCODE_ELSE);
2868
2869 visit_instructions(&ir->else_instructions);
2870 }
2871
2872 this->base_ir = ir->condition;
2873 emit(BRW_OPCODE_ENDIF);
2874 }
2875
2876 void
2877 vec4_visitor::visit(ir_emit_vertex *)
2878 {
2879 unreachable("not reached");
2880 }
2881
2882 void
2883 vec4_visitor::visit(ir_end_primitive *)
2884 {
2885 unreachable("not reached");
2886 }
2887
2888 void
2889 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2890 dst_reg dst, src_reg offset,
2891 src_reg src0, src_reg src1)
2892 {
2893 unsigned mlen = 0;
2894
2895 /* Set the atomic operation offset. */
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2897 mlen++;
2898
2899 /* Set the atomic operation arguments. */
2900 if (src0.file != BAD_FILE) {
2901 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2902 mlen++;
2903 }
2904
2905 if (src1.file != BAD_FILE) {
2906 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2907 mlen++;
2908 }
2909
2910 /* Emit the instruction. Note that this maps to the normal SIMD8
2911 * untyped atomic message on Ivy Bridge, but that's OK because
2912 * unused channels will be masked out.
2913 */
2914 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2915 src_reg(atomic_op), src_reg(surf_index));
2916 inst->base_mrf = 0;
2917 inst->mlen = mlen;
2918 }
2919
2920 void
2921 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2922 src_reg offset)
2923 {
2924 /* Set the surface read offset. */
2925 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2926
2927 /* Emit the instruction. Note that this maps to the normal SIMD8
2928 * untyped surface read message, but that's OK because unused
2929 * channels will be masked out.
2930 */
2931 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2932 dst, src_reg(surf_index));
2933 inst->base_mrf = 0;
2934 inst->mlen = 1;
2935 }
2936
2937 void
2938 vec4_visitor::emit_ndc_computation()
2939 {
2940 /* Get the position */
2941 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2942
2943 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2944 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2945 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2946
2947 current_annotation = "NDC";
2948 dst_reg ndc_w = ndc;
2949 ndc_w.writemask = WRITEMASK_W;
2950 src_reg pos_w = pos;
2951 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2952 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2953
2954 dst_reg ndc_xyz = ndc;
2955 ndc_xyz.writemask = WRITEMASK_XYZ;
2956
2957 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2958 }
2959
2960 void
2961 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2962 {
2963 if (brw->gen < 6 &&
2964 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2965 key->userclip_active || brw->has_negative_rhw_bug)) {
2966 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2967 dst_reg header1_w = header1;
2968 header1_w.writemask = WRITEMASK_W;
2969
2970 emit(MOV(header1, 0u));
2971
2972 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2973 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2974
2975 current_annotation = "Point size";
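/* Scale the float point size by 2^11 and keep bits 8..18 of the result,
 * packing it into the header's point width field.
 */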
2976 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2977 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2978 }
2979
2980 if (key->userclip_active) {
2981 current_annotation = "Clipping flags";
2982 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2983 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2984
2985 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2986 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2987 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2988
2989 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2990 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2991 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2992 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2993 }
2994
2995 /* i965 clipping workaround:
2996 * 1) Test for -ve rhw
2997 * 2) If set,
2998 * set ndc = (0,0,0,0)
2999 * set ucp[6] = 1
3000 *
3001 * Later, clipping will detect ucp[6] and ensure the primitive is
3002 * clipped against all fixed planes.
3003 */
3004 if (brw->has_negative_rhw_bug) {
3005 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3006 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3007 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3008 vec4_instruction *inst;
3009 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3010 inst->predicate = BRW_PREDICATE_NORMAL;
3011 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3012 inst->predicate = BRW_PREDICATE_NORMAL;
3013 }
3014
3015 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3016 } else if (brw->gen < 6) {
3017 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3018 } else {
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3021 dst_reg reg_w = reg;
3022 reg_w.writemask = WRITEMASK_W;
3023 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3024 }
3025 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3026 dst_reg reg_y = reg;
3027 reg_y.writemask = WRITEMASK_Y;
3028 reg_y.type = BRW_REGISTER_TYPE_D;
3029 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3030 }
3031 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3032 dst_reg reg_z = reg;
3033 reg_z.writemask = WRITEMASK_Z;
3034 reg_z.type = BRW_REGISTER_TYPE_D;
3035 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3036 }
3037 }
3038 }
3039
3040 void
3041 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3042 {
3043 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3044 *
3045 * "If a linked set of shaders forming the vertex stage contains no
3046 * static write to gl_ClipVertex or gl_ClipDistance, but the
3047 * application has requested clipping against user clip planes through
3048 * the API, then the coordinate written to gl_Position is used for
3049 * comparison against the user clip planes."
3050 *
3051 * This function is only called if the shader didn't write to
3052 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3053 * if the user wrote to it; otherwise we use gl_Position.
3054 */
3055 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3056 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3057 clip_vertex = VARYING_SLOT_POS;
3058 }
3059
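/* Each clip distance is the DP4 of the clip vertex with the corresponding
 * user clip plane; up to four distances are written here, one per channel
 * of reg.
 */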
3060 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3061 ++i) {
3062 reg.writemask = 1 << i;
3063 emit(DP4(reg,
3064 src_reg(output_reg[clip_vertex]),
3065 src_reg(this->userplane[i + offset])));
3066 }
3067 }
3068
3069 vec4_instruction *
3070 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3071 {
3072 assert (varying < VARYING_SLOT_MAX);
3073 reg.type = output_reg[varying].type;
3074 current_annotation = output_reg_annotation[varying];
3075 /* Copy the register, saturating if necessary */
3076 return emit(MOV(reg, src_reg(output_reg[varying])));
3077 }
3078
3079 void
3080 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3081 {
3082 reg.type = BRW_REGISTER_TYPE_F;
3083
3084 switch (varying) {
3085 case VARYING_SLOT_PSIZ:
3086 {
3087 /* PSIZ is always in slot 0, and is coupled with other flags. */
3088 current_annotation = "indices, point width, clip flags";
3089 emit_psiz_and_flags(reg);
3090 break;
3091 }
3092 case BRW_VARYING_SLOT_NDC:
3093 current_annotation = "NDC";
3094 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3095 break;
3096 case VARYING_SLOT_POS:
3097 current_annotation = "gl_Position";
3098 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3099 break;
3100 case VARYING_SLOT_EDGE:
3101 /* This is present when doing unfilled polygons. We're supposed to copy
3102 * the edge flag from the user-provided vertex array
3103 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3104 * of that attribute (starts as 1.0f). This is then used in clipping to
3105 * determine which edges should be drawn as wireframe.
3106 */
3107 current_annotation = "edge flag";
3108 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3109 glsl_type::float_type, WRITEMASK_XYZW))));
3110 break;
3111 case BRW_VARYING_SLOT_PAD:
3112 /* No need to write to this slot */
3113 break;
3114 case VARYING_SLOT_COL0:
3115 case VARYING_SLOT_COL1:
3116 case VARYING_SLOT_BFC0:
3117 case VARYING_SLOT_BFC1: {
3118 /* These built-in varyings are only supported in compatibility mode,
3119 * and we only support GS in core profile. So, this must be a vertex
3120 * shader.
3121 */
3122 assert(stage == MESA_SHADER_VERTEX);
3123 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3124 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3125 inst->saturate = true;
3126 break;
3127 }
3128
3129 default:
3130 emit_generic_urb_slot(reg, varying);
3131 break;
3132 }
3133 }
3134
3135 static int
3136 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3137 {
3138 if (brw->gen >= 6) {
3139 /* URB data written (does not include the message header reg) must
3140 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3141 * section 5.4.3.2.2: URB_INTERLEAVED.
3142 *
3143 * URB entries are allocated on a multiple of 1024 bits, so an
3144 * extra 128 bits written here to make the end align to 256 is
3145 * no problem.
3146 */
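/* mlen includes the one-register message header, so the data length is
 * mlen - 1; bumping even values of mlen to odd keeps the data length a
 * multiple of two registers (e.g. header + 3 data regs is padded to
 * header + 4).
 */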
3147 if ((mlen % 2) != 1)
3148 mlen++;
3149 }
3150
3151 return mlen;
3152 }
3153
3154
3155 /**
3156 * Generates the VUE payload plus the necessary URB write instructions to
3157 * output it.
3158 *
3159 * The VUE layout is documented in Volume 2a.
3160 */
3161 void
3162 vec4_visitor::emit_vertex()
3163 {
3164 /* MRF 0 is reserved for the debugger, so start with message header
3165 * in MRF 1.
3166 */
3167 int base_mrf = 1;
3168 int mrf = base_mrf;
3169 /* In the process of generating our URB write message contents, we
3170 * may need to unspill a register or load from an array. Those
3171 * reads would use MRFs 14-15.
3172 */
3173 int max_usable_mrf = 13;
3174
3175 /* The following assertion verifies that max_usable_mrf causes an
3176 * even-numbered amount of URB write data, which will meet gen6's
3177 * requirements for length alignment.
3178 */
3179 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3180
3181 /* First mrf is the g0-based message header containing URB handles and
3182 * such.
3183 */
3184 emit_urb_write_header(mrf++);
3185
3186 if (brw->gen < 6) {
3187 emit_ndc_computation();
3188 }
3189
3190 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3191 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3192 current_annotation = "user clip distances";
3193
3194 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3195 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3196
3197 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3198 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3199 }
3200
3201 /* We may need to split this up into several URB writes, so do them in a
3202 * loop.
3203 */
3204 int slot = 0;
3205 bool complete = false;
3206 do {
3207 /* URB offset is in URB row increments, and each of our MRFs is half of
3208 * one of those, since we're doing interleaved writes.
3209 */
3210 int offset = slot / 2;
3211
3212 mrf = base_mrf + 1;
3213 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3214 emit_urb_slot(dst_reg(MRF, mrf++),
3215 prog_data->vue_map.slot_to_varying[slot]);
3216
3217 /* If this was max_usable_mrf, we can't fit anything more into this
3218 * URB WRITE.
3219 */
3220 if (mrf > max_usable_mrf) {
3221 slot++;
3222 break;
3223 }
3224 }
3225
3226 complete = slot >= prog_data->vue_map.num_slots;
3227 current_annotation = "URB write";
3228 vec4_instruction *inst = emit_urb_write_opcode(complete);
3229 inst->base_mrf = base_mrf;
3230 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3231 inst->offset += offset;
3232 } while (!complete);
3233 }
3234
3235
3236 src_reg
3237 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3238 src_reg *reladdr, int reg_offset)
3239 {
3240 /* Because we store the values to scratch interleaved like our
3241 * vertex data, we need to scale the vec4 index by 2.
3242 */
3243 int message_header_scale = 2;
3244
3245 /* Pre-gen6, the message header uses byte offsets instead of vec4
3246 * (16-byte) offset units.
3247 */
3248 if (brw->gen < 6)
3249 message_header_scale *= 16;
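/* For example, vec4 slot 3 becomes offset 6 in vec4 (16-byte) units on
 * Gen6+, or byte offset 96 on earlier generations.
 */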
3250
3251 if (reladdr) {
3252 src_reg index = src_reg(this, glsl_type::int_type);
3253
3254 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3255 src_reg(reg_offset)));
3256 emit_before(block, inst, MUL(dst_reg(index), index,
3257 src_reg(message_header_scale)));
3258
3259 return index;
3260 } else {
3261 return src_reg(reg_offset * message_header_scale);
3262 }
3263 }
3264
3265 src_reg
3266 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3267 src_reg *reladdr, int reg_offset)
3268 {
3269 if (reladdr) {
3270 src_reg index = src_reg(this, glsl_type::int_type);
3271
3272 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3273 src_reg(reg_offset)));
3274
3275 /* Pre-gen6, the message header uses byte offsets instead of vec4
3276 * (16-byte) offset units.
3277 */
3278 if (brw->gen < 6) {
3279 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3280 }
3281
3282 return index;
3283 } else if (brw->gen >= 8) {
3284 /* Store the offset in a GRF so we can send-from-GRF. */
3285 src_reg offset = src_reg(this, glsl_type::int_type);
3286 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3287 return offset;
3288 } else {
3289 int message_header_scale = brw->gen < 6 ? 16 : 1;
3290 return src_reg(reg_offset * message_header_scale);
3291 }
3292 }
3293
3294 /**
3295 * Emits an instruction before @inst to load the value named by @orig_src
3296 * from scratch space at @base_offset to @temp.
3297 *
3298 * @base_offset is measured in 32-byte units (the size of a register).
3299 */
3300 void
3301 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3302 dst_reg temp, src_reg orig_src,
3303 int base_offset)
3304 {
3305 int reg_offset = base_offset + orig_src.reg_offset;
3306 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3307 reg_offset);
3308
3309 emit_before(block, inst, SCRATCH_READ(temp, index));
3310 }
3311
3312 /**
3313 * Emits an instruction after @inst to store the value to be written
3314 * to @orig_dst to scratch space at @base_offset, from @temp.
3315 *
3316 * @base_offset is measured in 32-byte units (the size of a register).
3317 */
3318 void
3319 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3320 int base_offset)
3321 {
3322 int reg_offset = base_offset + inst->dst.reg_offset;
3323 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3324 reg_offset);
3325
3326 /* Create a temporary register to store *inst's result in.
3327 *
3328 * We have to be careful when MOVing from our temporary result register in
3329 * the scratch write. If we swizzle from channels of the temporary that
3330 * weren't initialized, it will confuse live interval analysis, which will
3331 * make spilling fail to make progress.
3332 */
3333 src_reg temp = src_reg(this, glsl_type::vec4_type);
3334 temp.type = inst->dst.type;
3335 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3336 int swizzles[4];
3337 for (int i = 0; i < 4; i++)
3338 if (inst->dst.writemask & (1 << i))
3339 swizzles[i] = i;
3340 else
3341 swizzles[i] = first_writemask_chan;
3342 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3343 swizzles[2], swizzles[3]);
3344
3345 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3346 inst->dst.writemask));
3347 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3348 write->predicate = inst->predicate;
3349 write->ir = inst->ir;
3350 write->annotation = inst->annotation;
3351 inst->insert_after(block, write);
3352
3353 inst->dst.file = temp.file;
3354 inst->dst.reg = temp.reg;
3355 inst->dst.reg_offset = temp.reg_offset;
3356 inst->dst.reladdr = NULL;
3357 }
3358
3359 /**
3360 * We can't generally support array access in GRF space, because a
3361 * single instruction's destination can only span 2 contiguous
3362 * registers. So, we send all GRF arrays that get variable index
3363 * access to scratch space.
3364 */
3365 void
3366 vec4_visitor::move_grf_array_access_to_scratch()
3367 {
3368 int scratch_loc[this->virtual_grf_count];
3369 memset(scratch_loc, -1, sizeof(scratch_loc));
3370
3371 /* First, calculate the set of virtual GRFs that need to be punted
3372 * to scratch due to having any array access on them, and where in
3373 * scratch.
3374 */
3375 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3376 if (inst->dst.file == GRF && inst->dst.reladdr &&
3377 scratch_loc[inst->dst.reg] == -1) {
3378 scratch_loc[inst->dst.reg] = c->last_scratch;
3379 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3380 }
3381
3382 for (int i = 0 ; i < 3; i++) {
3383 src_reg *src = &inst->src[i];
3384
3385 if (src->file == GRF && src->reladdr &&
3386 scratch_loc[src->reg] == -1) {
3387 scratch_loc[src->reg] = c->last_scratch;
3388 c->last_scratch += this->virtual_grf_sizes[src->reg];
3389 }
3390 }
3391 }
3392
3393 /* Now, for anything that will be accessed through scratch, rewrite
3394 * it to load/store. Note that this is a _safe list walk, because
3395 * we may generate a new scratch_write instruction after the one
3396 * we're processing.
3397 */
3398 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3399 /* Set up the annotation tracking for new generated instructions. */
3400 base_ir = inst->ir;
3401 current_annotation = inst->annotation;
3402
3403 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3404 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3405 }
3406
3407 for (int i = 0 ; i < 3; i++) {
3408 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3409 continue;
3410
3411 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3412
3413 emit_scratch_read(block, inst, temp, inst->src[i],
3414 scratch_loc[inst->src[i].reg]);
3415
3416 inst->src[i].file = temp.file;
3417 inst->src[i].reg = temp.reg;
3418 inst->src[i].reg_offset = temp.reg_offset;
3419 inst->src[i].reladdr = NULL;
3420 }
3421 }
3422 }
3423
3424 /**
3425 * Emits an instruction before @inst to load the value named by @orig_src
3426 * from the pull constant buffer (surface) at @base_offset to @temp.
3427 */
3428 void
3429 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3430 dst_reg temp, src_reg orig_src,
3431 int base_offset)
3432 {
3433 int reg_offset = base_offset + orig_src.reg_offset;
3434 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3435 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3436 reg_offset);
3437 vec4_instruction *load;
3438
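/* On Gen7+ the offset is first moved into a GRF so the load can be sent
 * from the GRF; older generations use an MRF-based message with a
 * one-register payload starting at MRF 14.
 */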
3439 if (brw->gen >= 7) {
3440 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3441 grf_offset.type = offset.type;
3442 emit_before(block, inst, MOV(grf_offset, offset));
3443
3444 load = new(mem_ctx) vec4_instruction(this,
3445 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3446 temp, index, src_reg(grf_offset));
3447 } else {
3448 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3449 temp, index, offset);
3450 load->base_mrf = 14;
3451 load->mlen = 1;
3452 }
3453 emit_before(block, inst, load);
3454 }
3455
3456 /**
3457 * Implements array access of uniforms by inserting a
3458 * PULL_CONSTANT_LOAD instruction.
3459 *
3460 * Unlike temporary GRF array access (where we don't support it due to
3461 * the difficulty of doing relative addressing on instruction
3462 * destinations), we could potentially do array access of uniforms
3463 * that were loaded in GRF space as push constants. In real-world
3464 * usage we've seen, though, the arrays being used are always larger
3465 * than we could load as push constants, so just always move all
3466 * uniform array access out to a pull constant buffer.
3467 */
3468 void
3469 vec4_visitor::move_uniform_array_access_to_pull_constants()
3470 {
3471 int pull_constant_loc[this->uniforms];
3472 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3473 bool nested_reladdr;
3474
3475 /* Walk through and find array access of uniforms. Put a copy of that
3476 * uniform in the pull constant buffer.
3477 *
3478 * Note that we don't move constant-indexed accesses to arrays. No
3479 * testing has been done of the performance impact of this choice.
3480 */
3481 do {
3482 nested_reladdr = false;
3483
3484 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3485 for (int i = 0 ; i < 3; i++) {
3486 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3487 continue;
3488
3489 int uniform = inst->src[i].reg;
3490
3491 if (inst->src[i].reladdr->reladdr)
3492 nested_reladdr = true; /* will need another pass */
3493
3494 /* If this array isn't already present in the pull constant buffer,
3495 * add it.
3496 */
3497 if (pull_constant_loc[uniform] == -1) {
3498 const gl_constant_value **values =
3499 &stage_prog_data->param[uniform * 4];
3500
3501 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3502
3503 assert(uniform < uniform_array_size);
3504 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3505 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3506 = values[j];
3507 }
3508 }
3509
3510 /* Set up the annotation tracking for new generated instructions. */
3511 base_ir = inst->ir;
3512 current_annotation = inst->annotation;
3513
3514 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3515
3516 emit_pull_constant_load(block, inst, temp, inst->src[i],
3517 pull_constant_loc[uniform]);
3518
3519 inst->src[i].file = temp.file;
3520 inst->src[i].reg = temp.reg;
3521 inst->src[i].reg_offset = temp.reg_offset;
3522 inst->src[i].reladdr = NULL;
3523 }
3524 }
3525 } while (nested_reladdr);
3526
3527 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3528 * no need to track them as larger-than-vec4 objects. This will be
3529 * relied on in cutting out unused uniform vectors from push
3530 * constants.
3531 */
3532 split_uniform_registers();
3533 }
3534
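/**
 * Negation on an unsigned-typed source is resolved by applying it with an
 * explicit MOV into a temporary and rewriting the source to use that
 * temporary instead.
 */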
3535 void
3536 vec4_visitor::resolve_ud_negate(src_reg *reg)
3537 {
3538 if (reg->type != BRW_REGISTER_TYPE_UD ||
3539 !reg->negate)
3540 return;
3541
3542 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3543 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3544 *reg = temp;
3545 }
3546
3547 /**
3548 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3549 *
3550 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3551 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3552 */
3553 void
3554 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3555 {
3556 assert(brw->gen <= 5);
3557
3558 if (!rvalue->type->is_boolean())
3559 return;
3560
3561 src_reg and_result = src_reg(this, rvalue->type);
3562 src_reg neg_result = src_reg(this, rvalue->type);
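/* Keep only the defined low bit, then negate it: 0 stays 0 and 1 becomes
 * ~0, the canonical boolean values.
 */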
3563 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3564 emit(MOV(dst_reg(neg_result), negate(and_result)));
3565 *reg = neg_result;
3566 }
3567
3568 vec4_visitor::vec4_visitor(struct brw_context *brw,
3569 struct brw_vec4_compile *c,
3570 struct gl_program *prog,
3571 const struct brw_vec4_prog_key *key,
3572 struct brw_vec4_prog_data *prog_data,
3573 struct gl_shader_program *shader_prog,
3574 gl_shader_stage stage,
3575 void *mem_ctx,
3576 bool debug_flag,
3577 bool no_spills,
3578 shader_time_shader_type st_base,
3579 shader_time_shader_type st_written,
3580 shader_time_shader_type st_reset)
3581 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3582 c(c),
3583 key(key),
3584 prog_data(prog_data),
3585 sanity_param_count(0),
3586 fail_msg(NULL),
3587 first_non_payload_grf(0),
3588 need_all_constants_in_pull_buffer(false),
3589 debug_flag(debug_flag),
3590 no_spills(no_spills),
3591 st_base(st_base),
3592 st_written(st_written),
3593 st_reset(st_reset)
3594 {
3595 this->mem_ctx = mem_ctx;
3596 this->failed = false;
3597
3598 this->base_ir = NULL;
3599 this->current_annotation = NULL;
3600 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3601
3602 this->variable_ht = hash_table_ctor(0,
3603 hash_table_pointer_hash,
3604 hash_table_pointer_compare);
3605
3606 this->virtual_grf_start = NULL;
3607 this->virtual_grf_end = NULL;
3608 this->virtual_grf_sizes = NULL;
3609 this->virtual_grf_count = 0;
3610 this->virtual_grf_reg_map = NULL;
3611 this->virtual_grf_reg_count = 0;
3612 this->virtual_grf_array_size = 0;
3613 this->live_intervals = NULL;
3614
3615 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3616
3617 this->uniforms = 0;
3618
3619 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3620 * at least one. See setup_uniforms() in brw_vec4.cpp.
3621 */
3622 this->uniform_array_size = 1;
3623 if (prog_data) {
3624 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3625 }
3626
3627 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3628 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3629 }
3630
3631 vec4_visitor::~vec4_visitor()
3632 {
3633 hash_table_dtor(this->variable_ht);
3634 }
3635
3636
3637 void
3638 vec4_visitor::fail(const char *format, ...)
3639 {
3640 va_list va;
3641 char *msg;
3642
3643 if (failed)
3644 return;
3645
3646 failed = true;
3647
3648 va_start(va, format);
3649 msg = ralloc_vasprintf(mem_ctx, format, va);
3650 va_end(va);
3651 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3652
3653 this->fail_msg = msg;
3654
3655 if (debug_flag) {
3656 fprintf(stderr, "%s", msg);
3657 }
3658 }
3659
3660 } /* namespace brw */