i965/vec4: Allow CSE on uniform-vec4 expansion MOVs.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
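/* The ALU1/ALU2/ALU3 macros build (but do not emit) a vec4_instruction for the
 * corresponding one-, two- or three-source opcode; ALU2_ACC additionally marks
 * the instruction as writing the accumulator.
 */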
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
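/* Gen4-style scratch reads are a two-register message: a header at m[base_mrf]
 * plus the per-channel offsets at m[base_mrf + 1]; the write variant below adds
 * a third register for the data payload.
 */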
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
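/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0, so the type-converting MOV above leaves
 * <0, 8, 16, 24> in the uvec4 shift register.
 */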
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
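/* packUnorm4x8: clamp each component to [0, 1], scale by 255, round to even,
 * convert to integer, then pack the low byte of each channel.
 */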
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 inst = emit(XOR(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_or:
827 inst = emit(OR(dst_null_d(), op[0], op[1]));
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_unop_f2b:
837 if (brw->gen >= 6) {
838 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
839 } else {
840 inst = emit(MOV(dst_null_f(), op[0]));
841 inst->conditional_mod = BRW_CONDITIONAL_NZ;
842 }
843 break;
844
845 case ir_unop_i2b:
846 if (brw->gen >= 6) {
847 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 } else {
849 inst = emit(MOV(dst_null_d(), op[0]));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 break;
853
854 case ir_binop_all_equal:
855 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
856 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
857 break;
858
859 case ir_binop_any_nequal:
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
861 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
862 break;
863
864 case ir_unop_any:
865 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
867 break;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(CMP(dst_null_d(), op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 break;
878
879 case ir_triop_csel: {
880 /* Expand the boolean condition into the flag register. */
881 inst = emit(MOV(dst_null_d(), op[0]));
882 inst->conditional_mod = BRW_CONDITIONAL_NZ;
883
884 /* Select which boolean to return. */
885 dst_reg temp(this, expr->operands[1]->type);
886 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
887 inst->predicate = BRW_PREDICATE_NORMAL;
888
889 /* Expand the result to a condition code. */
890 inst = emit(MOV(dst_null_d(), src_reg(temp)));
891 inst->conditional_mod = BRW_CONDITIONAL_NZ;
892 break;
893 }
894
895 default:
896 unreachable("not reached");
897 }
898 return;
899 }
900
901 ir->accept(this);
902
903 resolve_ud_negate(&this->result);
904
905 if (brw->gen >= 6) {
906 vec4_instruction *inst = emit(AND(dst_null_d(),
907 this->result, src_reg(1)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 } else {
910 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 }
913 }
914
915 /**
916 * Emit a gen6 IF statement with the comparison folded into the IF
917 * instruction.
918 */
919 void
920 vec4_visitor::emit_if_gen6(ir_if *ir)
921 {
922 ir_expression *expr = ir->condition->as_expression();
923
924 if (expr && expr->operation != ir_binop_ubo_load) {
925 src_reg op[3];
926 dst_reg temp;
927
928 assert(expr->get_num_operands() <= 3);
929 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
930 expr->operands[i]->accept(this);
931 op[i] = this->result;
932 }
933
934 switch (expr->operation) {
935 case ir_unop_logic_not:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
937 return;
938
939 case ir_binop_logic_xor:
940 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
941 return;
942
943 case ir_binop_logic_or:
944 temp = dst_reg(this, glsl_type::bool_type);
945 emit(OR(temp, op[0], op[1]));
946 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_binop_logic_and:
950 temp = dst_reg(this, glsl_type::bool_type);
951 emit(AND(temp, op[0], op[1]));
952 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_unop_f2b:
956 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_unop_i2b:
960 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_greater:
964 case ir_binop_gequal:
965 case ir_binop_less:
966 case ir_binop_lequal:
967 case ir_binop_equal:
968 case ir_binop_nequal:
969 emit(IF(op[0], op[1],
970 brw_conditional_for_comparison(expr->operation)));
971 return;
972
973 case ir_binop_all_equal:
974 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
975 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
976 return;
977
978 case ir_binop_any_nequal:
979 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
980 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
981 return;
982
983 case ir_unop_any:
984 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
985 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
986 return;
987
988 case ir_triop_csel: {
989 /* Expand the boolean condition into the flag register. */
990 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
991 inst->conditional_mod = BRW_CONDITIONAL_NZ;
992
993 /* Select which boolean to return. */
994 dst_reg temp(this, expr->operands[1]->type);
995 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
996 inst->predicate = BRW_PREDICATE_NORMAL;
997
998 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
999 return;
1000 }
1001
1002 default:
1003 unreachable("not reached");
1004 }
1005 return;
1006 }
1007
1008 ir->condition->accept(this);
1009
1010 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1011 }
1012
1013 void
1014 vec4_visitor::visit(ir_variable *ir)
1015 {
1016 dst_reg *reg = NULL;
1017
1018 if (variable_storage(ir))
1019 return;
1020
1021 switch (ir->data.mode) {
1022 case ir_var_shader_in:
1023 assert(ir->data.location != -1);
1024 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1025 break;
1026
1027 case ir_var_shader_out:
1028 assert(ir->data.location != -1);
1029 reg = new(mem_ctx) dst_reg(this, ir->type);
1030
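/* Outputs that span several vec4 slots (arrays, matrices, structs) get one
 * output_reg entry per slot.
 */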
1031 for (int i = 0; i < type_size(ir->type); i++) {
1032 output_reg[ir->data.location + i] = *reg;
1033 output_reg[ir->data.location + i].reg_offset = i;
1034 output_reg[ir->data.location + i].type =
1035 brw_type_for_base_type(ir->type->get_scalar_type());
1036 output_reg_annotation[ir->data.location + i] = ir->name;
1037 }
1038 break;
1039
1040 case ir_var_auto:
1041 case ir_var_temporary:
1042 reg = new(mem_ctx) dst_reg(this, ir->type);
1043 break;
1044
1045 case ir_var_uniform:
1046 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1047
1048 /* Thanks to the lower_ubo_reference pass, we will see only
1049 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1050 * variables, so no need for them to be in variable_ht.
1051 *
1052 * Some uniforms, such as samplers and atomic counters, have no actual
1053 * storage, so we should ignore them.
1054 */
1055 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1056 return;
1057
1058 /* Track how big the whole uniform variable is, in case we need to put a
1059 * copy of its data into pull constants for array access.
1060 */
1061 assert(this->uniforms < uniform_array_size);
1062 this->uniform_size[this->uniforms] = type_size(ir->type);
1063
1064 if (!strncmp(ir->name, "gl_", 3)) {
1065 setup_builtin_uniform_values(ir);
1066 } else {
1067 setup_uniform_values(ir);
1068 }
1069 break;
1070
1071 case ir_var_system_value:
1072 reg = make_reg_for_system_value(ir);
1073 break;
1074
1075 default:
1076 unreachable("not reached");
1077 }
1078
1079 reg->type = brw_type_for_base_type(ir->type);
1080 hash_table_insert(this->variable_ht, reg, ir);
1081 }
1082
1083 void
1084 vec4_visitor::visit(ir_loop *ir)
1085 {
1086 /* We don't want debugging output to print the whole body of the
1087 * loop as the annotation.
1088 */
1089 this->base_ir = NULL;
1090
1091 emit(BRW_OPCODE_DO);
1092
1093 visit_instructions(&ir->body_instructions);
1094
1095 emit(BRW_OPCODE_WHILE);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop_jump *ir)
1100 {
1101 switch (ir->mode) {
1102 case ir_loop_jump::jump_break:
1103 emit(BRW_OPCODE_BREAK);
1104 break;
1105 case ir_loop_jump::jump_continue:
1106 emit(BRW_OPCODE_CONTINUE);
1107 break;
1108 }
1109 }
1110
1111
1112 void
1113 vec4_visitor::visit(ir_function_signature *)
1114 {
1115 unreachable("not reached");
1116 }
1117
1118 void
1119 vec4_visitor::visit(ir_function *ir)
1120 {
1121 /* Ignore function bodies other than main() -- we shouldn't see calls to
1122 * them since they should all be inlined.
1123 */
1124 if (strcmp(ir->name, "main") == 0) {
1125 const ir_function_signature *sig;
1126 exec_list empty;
1127
1128 sig = ir->matching_signature(NULL, &empty, false);
1129
1130 assert(sig);
1131
1132 visit_instructions(&sig->body);
1133 }
1134 }
1135
1136 bool
1137 vec4_visitor::try_emit_mad(ir_expression *ir)
1138 {
1139 /* 3-src instructions were introduced in gen6. */
1140 if (brw->gen < 6)
1141 return false;
1142
1143 /* MAD can only handle floating-point data. */
1144 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1145 return false;
1146
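/* MAD needs exactly one multiply operand; accept the mul on either side of
 * the add.
 */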
1147 ir_rvalue *nonmul = ir->operands[1];
1148 ir_expression *mul = ir->operands[0]->as_expression();
1149
1150 if (!mul || mul->operation != ir_binop_mul) {
1151 nonmul = ir->operands[0];
1152 mul = ir->operands[1]->as_expression();
1153
1154 if (!mul || mul->operation != ir_binop_mul)
1155 return false;
1156 }
1157
1158 nonmul->accept(this);
1159 src_reg src0 = fix_3src_operand(this->result);
1160
1161 mul->operands[0]->accept(this);
1162 src_reg src1 = fix_3src_operand(this->result);
1163
1164 mul->operands[1]->accept(this);
1165 src_reg src2 = fix_3src_operand(this->result);
1166
1167 this->result = src_reg(this, ir->type);
1168 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1169
1170 return true;
1171 }
1172
1173 bool
1174 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1175 {
1176 /* This optimization relies on CMP setting the destination to 0 when
1177 * false. Early hardware only sets the least significant bit, and
1178 * leaves the other bits undefined. So we can't use it.
1179 */
1180 if (brw->gen < 6)
1181 return false;
1182
1183 ir_expression *const cmp = ir->operands[0]->as_expression();
1184
1185 if (cmp == NULL)
1186 return false;
1187
1188 switch (cmp->operation) {
1189 case ir_binop_less:
1190 case ir_binop_greater:
1191 case ir_binop_lequal:
1192 case ir_binop_gequal:
1193 case ir_binop_equal:
1194 case ir_binop_nequal:
1195 break;
1196
1197 default:
1198 return false;
1199 }
1200
1201 cmp->operands[0]->accept(this);
1202 const src_reg cmp_src0 = this->result;
1203
1204 cmp->operands[1]->accept(this);
1205 const src_reg cmp_src1 = this->result;
1206
1207 this->result = src_reg(this, ir->type);
1208
1209 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1210 brw_conditional_for_comparison(cmp->operation)));
1211
1212 /* If the comparison is false, this->result will just happen to be zero.
1213 */
1214 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1215 this->result, src_reg(1.0f));
1216 inst->predicate = BRW_PREDICATE_NORMAL;
1217 inst->predicate_inverse = true;
1218
1219 return true;
1220 }
1221
1222 void
1223 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1224 src_reg src0, src_reg src1)
1225 {
1226 vec4_instruction *inst;
1227
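/* Gen6+ can fold the comparison into a SEL with a conditional modifier;
 * older parts need an explicit CMP followed by a predicated SEL.
 */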
1228 if (brw->gen >= 6) {
1229 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1230 inst->conditional_mod = conditionalmod;
1231 } else {
1232 emit(CMP(dst, src0, src1, conditionalmod));
1233
1234 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236 }
1237 }
1238
1239 void
1240 vec4_visitor::emit_lrp(const dst_reg &dst,
1241 const src_reg &x, const src_reg &y, const src_reg &a)
1242 {
1243 if (brw->gen >= 6) {
1244 /* Note that the instruction's argument order is reversed from GLSL
1245 * and the IR.
1246 */
1247 emit(LRP(dst,
1248 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1249 } else {
1250 /* Earlier generations don't support three source operations, so we
1251 * need to emit x*(1-a) + y*a.
1252 */
1253 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1254 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1255 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1256 y_times_a.writemask = dst.writemask;
1257 one_minus_a.writemask = dst.writemask;
1258 x_times_one_minus_a.writemask = dst.writemask;
1259
1260 emit(MUL(y_times_a, y, a));
1261 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1262 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1263 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1264 }
1265 }
1266
1267 void
1268 vec4_visitor::visit(ir_expression *ir)
1269 {
1270 unsigned int operand;
1271 src_reg op[Elements(ir->operands)];
1272 vec4_instruction *inst;
1273
1274 if (ir->operation == ir_binop_add) {
1275 if (try_emit_mad(ir))
1276 return;
1277 }
1278
1279 if (ir->operation == ir_unop_b2f) {
1280 if (try_emit_b2f_of_compare(ir))
1281 return;
1282 }
1283
1284 /* Storage for our result. Ideally for an assignment we'd be using
1285 * the actual storage for the result here, instead.
1286 */
1287 dst_reg result_dst(this, ir->type);
1288 src_reg result_src(result_dst);
1289
1290 if (ir->operation == ir_triop_csel) {
1291 ir->operands[1]->accept(this);
1292 op[1] = this->result;
1293 ir->operands[2]->accept(this);
1294 op[2] = this->result;
1295
1296 enum brw_predicate predicate;
1297 emit_bool_to_cond_code(ir->operands[0], &predicate);
1298 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1299 inst->predicate = predicate;
1300 this->result = result_src;
1301 return;
1302 }
1303
1304 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1305 this->result.file = BAD_FILE;
1306 ir->operands[operand]->accept(this);
1307 if (this->result.file == BAD_FILE) {
1308 fprintf(stderr, "Failed to get tree for expression operand:\n");
1309 ir->operands[operand]->fprint(stderr);
1310 exit(1);
1311 }
1312 op[operand] = this->result;
1313
1314 /* Matrix expression operands should have been broken down to vector
1315 * operations already.
1316 */
1317 assert(!ir->operands[operand]->type->is_matrix());
1318 }
1319
1320 /* If nothing special happens, this is the result. */
1321 this->result = result_src;
1322
1323 switch (ir->operation) {
1324 case ir_unop_logic_not:
1325 if (ctx->Const.UniformBooleanTrue != 1) {
1326 emit(NOT(result_dst, op[0]));
1327 } else {
1328 emit(XOR(result_dst, op[0], src_reg(1u)));
1329 }
1330 break;
1331 case ir_unop_neg:
1332 op[0].negate = !op[0].negate;
1333 emit(MOV(result_dst, op[0]));
1334 break;
1335 case ir_unop_abs:
1336 op[0].abs = true;
1337 op[0].negate = false;
1338 emit(MOV(result_dst, op[0]));
1339 break;
1340
1341 case ir_unop_sign:
1342 if (ir->type->is_float()) {
1343 /* AND(val, 0x80000000) gives the sign bit.
1344 *
1345 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1346 * zero.
1347 */
1348 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1349
1350 op[0].type = BRW_REGISTER_TYPE_UD;
1351 result_dst.type = BRW_REGISTER_TYPE_UD;
1352 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1353
1354 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1355 inst->predicate = BRW_PREDICATE_NORMAL;
1356
1357 this->result.type = BRW_REGISTER_TYPE_F;
1358 } else {
1359 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1360 * -> non-negative val generates 0x00000000.
1361 * Predicated OR sets 1 if val is positive.
1362 */
1363 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1364
1365 emit(ASR(result_dst, op[0], src_reg(31)));
1366
1367 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1368 inst->predicate = BRW_PREDICATE_NORMAL;
1369 }
1370 break;
1371
1372 case ir_unop_rcp:
1373 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1374 break;
1375
1376 case ir_unop_exp2:
1377 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1378 break;
1379 case ir_unop_log2:
1380 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1381 break;
1382 case ir_unop_exp:
1383 case ir_unop_log:
1384 unreachable("not reached: should be handled by ir_explog_to_explog2");
1385 case ir_unop_sin:
1386 case ir_unop_sin_reduced:
1387 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1388 break;
1389 case ir_unop_cos:
1390 case ir_unop_cos_reduced:
1391 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1392 break;
1393
1394 case ir_unop_dFdx:
1395 case ir_unop_dFdx_coarse:
1396 case ir_unop_dFdx_fine:
1397 case ir_unop_dFdy:
1398 case ir_unop_dFdy_coarse:
1399 case ir_unop_dFdy_fine:
1400 unreachable("derivatives not valid in vertex shader");
1401
1402 case ir_unop_bitfield_reverse:
1403 emit(BFREV(result_dst, op[0]));
1404 break;
1405 case ir_unop_bit_count:
1406 emit(CBIT(result_dst, op[0]));
1407 break;
1408 case ir_unop_find_msb: {
1409 src_reg temp = src_reg(this, glsl_type::uint_type);
1410
1411 inst = emit(FBH(dst_reg(temp), op[0]));
1412 inst->dst.writemask = WRITEMASK_XYZW;
1413
1414 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1415 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1416 * subtract the result from 31 to convert the MSB count into an LSB count.
1417 */
1418
1419 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1420 temp.swizzle = BRW_SWIZZLE_NOOP;
1421 emit(MOV(result_dst, temp));
1422
1423 src_reg src_tmp = src_reg(result_dst);
1424 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1425
1426 src_tmp.negate = true;
1427 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1428 inst->predicate = BRW_PREDICATE_NORMAL;
1429 break;
1430 }
1431 case ir_unop_find_lsb:
1432 emit(FBL(result_dst, op[0]));
1433 break;
1434 case ir_unop_saturate:
1435 inst = emit(MOV(result_dst, op[0]));
1436 inst->saturate = true;
1437 break;
1438
1439 case ir_unop_noise:
1440 unreachable("not reached: should be handled by lower_noise");
1441
1442 case ir_binop_add:
1443 emit(ADD(result_dst, op[0], op[1]));
1444 break;
1445 case ir_binop_sub:
1446 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1447
1448 case ir_binop_mul:
1449 if (brw->gen < 8 && ir->type->is_integer()) {
1450 /* For integer multiplication, the MUL uses the low 16 bits of one of
1451 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1452 * accumulates in the contribution of the upper 16 bits of that
1453 * operand. If we can determine that one of the args is in the low
1454 * 16 bits, though, we can just emit a single MUL.
1455 */
1456 if (ir->operands[0]->is_uint16_constant()) {
1457 if (brw->gen < 7)
1458 emit(MUL(result_dst, op[0], op[1]));
1459 else
1460 emit(MUL(result_dst, op[1], op[0]));
1461 } else if (ir->operands[1]->is_uint16_constant()) {
1462 if (brw->gen < 7)
1463 emit(MUL(result_dst, op[1], op[0]));
1464 else
1465 emit(MUL(result_dst, op[0], op[1]));
1466 } else {
1467 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1468
1469 emit(MUL(acc, op[0], op[1]));
1470 emit(MACH(dst_null_d(), op[0], op[1]));
1471 emit(MOV(result_dst, src_reg(acc)));
1472 }
1473 } else {
1474 emit(MUL(result_dst, op[0], op[1]));
1475 }
1476 break;
1477 case ir_binop_imul_high: {
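/* MUL primes the accumulator with the product; MACH then yields the high
 * 32 bits of the 64-bit result.
 */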
1478 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1479
1480 emit(MUL(acc, op[0], op[1]));
1481 emit(MACH(result_dst, op[0], op[1]));
1482 break;
1483 }
1484 case ir_binop_div:
1485 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1486 assert(ir->type->is_integer());
1487 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1488 break;
1489 case ir_binop_carry: {
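/* ADDC writes the per-channel carry bits of the unsigned add into the
 * accumulator; copy them out with a MOV.
 */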
1490 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1491
1492 emit(ADDC(dst_null_ud(), op[0], op[1]));
1493 emit(MOV(result_dst, src_reg(acc)));
1494 break;
1495 }
1496 case ir_binop_borrow: {
1497 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1498
1499 emit(SUBB(dst_null_ud(), op[0], op[1]));
1500 emit(MOV(result_dst, src_reg(acc)));
1501 break;
1502 }
1503 case ir_binop_mod:
1504 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1505 assert(ir->type->is_integer());
1506 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1507 break;
1508
1509 case ir_binop_less:
1510 case ir_binop_greater:
1511 case ir_binop_lequal:
1512 case ir_binop_gequal:
1513 case ir_binop_equal:
1514 case ir_binop_nequal: {
1515 emit(CMP(result_dst, op[0], op[1],
1516 brw_conditional_for_comparison(ir->operation)));
1517 if (ctx->Const.UniformBooleanTrue == 1) {
1518 emit(AND(result_dst, result_src, src_reg(1u)));
1519 }
1520 break;
1521 }
1522
1523 case ir_binop_all_equal:
1524 /* "==" operator producing a scalar boolean. */
1525 if (ir->operands[0]->type->is_vector() ||
1526 ir->operands[1]->type->is_vector()) {
1527 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1528 emit(MOV(result_dst, src_reg(0)));
1529 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1530 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1531 } else {
1532 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1533 if (ctx->Const.UniformBooleanTrue == 1) {
1534 emit(AND(result_dst, result_src, src_reg(1u)));
1535 }
1536 }
1537 break;
1538 case ir_binop_any_nequal:
1539 /* "!=" operator producing a scalar boolean. */
1540 if (ir->operands[0]->type->is_vector() ||
1541 ir->operands[1]->type->is_vector()) {
1542 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1543
1544 emit(MOV(result_dst, src_reg(0)));
1545 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1546 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1547 } else {
1548 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1549 if (ctx->Const.UniformBooleanTrue == 1) {
1550 emit(AND(result_dst, result_src, src_reg(1u)));
1551 }
1552 }
1553 break;
1554
1555 case ir_unop_any:
1556 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1557 emit(MOV(result_dst, src_reg(0)));
1558
1559 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1560 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1561 break;
1562
1563 case ir_binop_logic_xor:
1564 emit(XOR(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_logic_or:
1568 emit(OR(result_dst, op[0], op[1]));
1569 break;
1570
1571 case ir_binop_logic_and:
1572 emit(AND(result_dst, op[0], op[1]));
1573 break;
1574
1575 case ir_binop_dot:
1576 assert(ir->operands[0]->type->is_vector());
1577 assert(ir->operands[0]->type == ir->operands[1]->type);
1578 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1579 break;
1580
1581 case ir_unop_sqrt:
1582 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1583 break;
1584 case ir_unop_rsq:
1585 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1586 break;
1587
1588 case ir_unop_bitcast_i2f:
1589 case ir_unop_bitcast_u2f:
1590 this->result = op[0];
1591 this->result.type = BRW_REGISTER_TYPE_F;
1592 break;
1593
1594 case ir_unop_bitcast_f2i:
1595 this->result = op[0];
1596 this->result.type = BRW_REGISTER_TYPE_D;
1597 break;
1598
1599 case ir_unop_bitcast_f2u:
1600 this->result = op[0];
1601 this->result.type = BRW_REGISTER_TYPE_UD;
1602 break;
1603
1604 case ir_unop_i2f:
1605 case ir_unop_i2u:
1606 case ir_unop_u2i:
1607 case ir_unop_u2f:
1608 case ir_unop_f2i:
1609 case ir_unop_f2u:
1610 emit(MOV(result_dst, op[0]));
1611 break;
1612 case ir_unop_b2i:
1613 if (ctx->Const.UniformBooleanTrue != 1) {
1614 emit(AND(result_dst, op[0], src_reg(1u)));
1615 } else {
1616 emit(MOV(result_dst, op[0]));
1617 }
1618 break;
1619 case ir_unop_b2f:
1620 if (ctx->Const.UniformBooleanTrue != 1) {
1621 op[0].type = BRW_REGISTER_TYPE_UD;
1622 result_dst.type = BRW_REGISTER_TYPE_UD;
1623 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1624 result_dst.type = BRW_REGISTER_TYPE_F;
1625 } else {
1626 emit(MOV(result_dst, op[0]));
1627 }
1628 break;
1629 case ir_unop_f2b:
1630 case ir_unop_i2b:
1631 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1632 if (ctx->Const.UniformBooleanTrue == 1) {
1633 emit(AND(result_dst, result_src, src_reg(1u)));
1634 }
1635 break;
1636
1637 case ir_unop_trunc:
1638 emit(RNDZ(result_dst, op[0]));
1639 break;
1640 case ir_unop_ceil:
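/* ceil(x) == -floor(-x): negate the operand, round down with RNDD, then
 * negate the result via the source modifier.
 */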
1641 op[0].negate = !op[0].negate;
1642 inst = emit(RNDD(result_dst, op[0]));
1643 this->result.negate = true;
1644 break;
1645 case ir_unop_floor:
1646 inst = emit(RNDD(result_dst, op[0]));
1647 break;
1648 case ir_unop_fract:
1649 inst = emit(FRC(result_dst, op[0]));
1650 break;
1651 case ir_unop_round_even:
1652 emit(RNDE(result_dst, op[0]));
1653 break;
1654
1655 case ir_binop_min:
1656 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1657 break;
1658 case ir_binop_max:
1659 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1660 break;
1661
1662 case ir_binop_pow:
1663 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1664 break;
1665
1666 case ir_unop_bit_not:
1667 inst = emit(NOT(result_dst, op[0]));
1668 break;
1669 case ir_binop_bit_and:
1670 inst = emit(AND(result_dst, op[0], op[1]));
1671 break;
1672 case ir_binop_bit_xor:
1673 inst = emit(XOR(result_dst, op[0], op[1]));
1674 break;
1675 case ir_binop_bit_or:
1676 inst = emit(OR(result_dst, op[0], op[1]));
1677 break;
1678
1679 case ir_binop_lshift:
1680 inst = emit(SHL(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_rshift:
1684 if (ir->type->base_type == GLSL_TYPE_INT)
1685 inst = emit(ASR(result_dst, op[0], op[1]));
1686 else
1687 inst = emit(SHR(result_dst, op[0], op[1]));
1688 break;
1689
1690 case ir_binop_bfm:
1691 emit(BFI1(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_ubo_load: {
1695 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1696 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1697 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1698 src_reg offset;
1699
1700 /* Now, load the vector from that offset. */
1701 assert(ir->type->is_vector() || ir->type->is_scalar());
1702
1703 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1704 packed_consts.type = result.type;
1705 src_reg surf_index;
1706
1707 if (const_uniform_block) {
1708 /* The block index is a constant, so just emit the binding table entry
1709 * as an immediate.
1710 */
1711 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1712 const_uniform_block->value.u[0]);
1713 } else {
1714 /* The block index is not a constant. Evaluate the index expression
1715 * per-channel and add the base UBO index; the generator will select
1716 * a value from any live channel.
1717 */
1718 surf_index = src_reg(this, glsl_type::uint_type);
1719 emit(ADD(dst_reg(surf_index), op[0],
1720 src_reg(prog_data->base.binding_table.ubo_start)));
1721
1722 /* Assume this may touch any UBO. It would be nice to provide
1723 * a tighter bound, but the array information is already lowered away.
1724 */
1725 brw_mark_surface_used(&prog_data->base,
1726 prog_data->base.binding_table.ubo_start +
1727 shader_prog->NumUniformBlocks - 1);
1728 }
1729
1730 if (const_offset_ir) {
1731 if (brw->gen >= 8) {
1732 /* Store the offset in a GRF so we can send-from-GRF. */
1733 offset = src_reg(this, glsl_type::int_type);
1734 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1735 } else {
1736 /* Immediates are fine on older generations since they'll be moved
1737 * to a (potentially fake) MRF at the generator level.
1738 */
1739 offset = src_reg(const_offset / 16);
1740 }
1741 } else {
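/* Dynamic offset: shift the byte offset right by 4 to get the 16-byte (vec4)
 * units used by the constant-offset path above.
 */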
1742 offset = src_reg(this, glsl_type::uint_type);
1743 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1744 }
1745
1746 if (brw->gen >= 7) {
1747 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1748 grf_offset.type = offset.type;
1749
1750 emit(MOV(grf_offset, offset));
1751
1752 emit(new(mem_ctx) vec4_instruction(this,
1753 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1754 dst_reg(packed_consts),
1755 surf_index,
1756 src_reg(grf_offset)));
1757 } else {
1758 vec4_instruction *pull =
1759 emit(new(mem_ctx) vec4_instruction(this,
1760 VS_OPCODE_PULL_CONSTANT_LOAD,
1761 dst_reg(packed_consts),
1762 surf_index,
1763 offset));
1764 pull->base_mrf = 14;
1765 pull->mlen = 1;
1766 }
1767
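/* Select the loaded components: start from the size-based swizzle, then bias
 * every channel by the dword position of const_offset within its 16-byte block.
 */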
1768 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1769 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1770 const_offset % 16 / 4,
1771 const_offset % 16 / 4,
1772 const_offset % 16 / 4);
1773
1774 /* UBO bools are any nonzero int. We need to convert them to use the
1775 * value of true stored in ctx->Const.UniformBooleanTrue.
1776 */
1777 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1778 emit(CMP(result_dst, packed_consts, src_reg(0u),
1779 BRW_CONDITIONAL_NZ));
1780 if (ctx->Const.UniformBooleanTrue == 1) {
1781 emit(AND(result_dst, result, src_reg(1u)));
1782 }
1783 } else {
1784 emit(MOV(result_dst, packed_consts));
1785 }
1786 break;
1787 }
1788
1789 case ir_binop_vector_extract:
1790 unreachable("should have been lowered by vec_index_to_cond_assign");
1791
1792 case ir_triop_fma:
1793 op[0] = fix_3src_operand(op[0]);
1794 op[1] = fix_3src_operand(op[1]);
1795 op[2] = fix_3src_operand(op[2]);
1796 /* Note that the instruction's argument order is reversed from GLSL
1797 * and the IR.
1798 */
1799 emit(MAD(result_dst, op[2], op[1], op[0]));
1800 break;
1801
1802 case ir_triop_lrp:
1803 emit_lrp(result_dst, op[0], op[1], op[2]);
1804 break;
1805
1806 case ir_triop_csel:
1807 unreachable("already handled above");
1808 break;
1809
1810 case ir_triop_bfi:
1811 op[0] = fix_3src_operand(op[0]);
1812 op[1] = fix_3src_operand(op[1]);
1813 op[2] = fix_3src_operand(op[2]);
1814 emit(BFI2(result_dst, op[0], op[1], op[2]));
1815 break;
1816
1817 case ir_triop_bitfield_extract:
1818 op[0] = fix_3src_operand(op[0]);
1819 op[1] = fix_3src_operand(op[1]);
1820 op[2] = fix_3src_operand(op[2]);
1821 /* Note that the instruction's argument order is reversed from GLSL
1822 * and the IR.
1823 */
1824 emit(BFE(result_dst, op[2], op[1], op[0]));
1825 break;
1826
1827 case ir_triop_vector_insert:
1828 unreachable("should have been lowered by lower_vector_insert");
1829
1830 case ir_quadop_bitfield_insert:
1831 unreachable("not reached: should be handled by "
1832 "bitfield_insert_to_bfm_bfi\n");
1833
1834 case ir_quadop_vector:
1835 unreachable("not reached: should be handled by lower_quadop_vector");
1836
1837 case ir_unop_pack_half_2x16:
1838 emit_pack_half_2x16(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_half_2x16:
1841 emit_unpack_half_2x16(result_dst, op[0]);
1842 break;
1843 case ir_unop_unpack_unorm_4x8:
1844 emit_unpack_unorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_unpack_snorm_4x8:
1847 emit_unpack_snorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_unorm_4x8:
1850 emit_pack_unorm_4x8(result_dst, op[0]);
1851 break;
1852 case ir_unop_pack_snorm_4x8:
1853 emit_pack_snorm_4x8(result_dst, op[0]);
1854 break;
1855 case ir_unop_pack_snorm_2x16:
1856 case ir_unop_pack_unorm_2x16:
1857 case ir_unop_unpack_snorm_2x16:
1858 case ir_unop_unpack_unorm_2x16:
1859 unreachable("not reached: should be handled by lower_packing_builtins");
1860 case ir_unop_unpack_half_2x16_split_x:
1861 case ir_unop_unpack_half_2x16_split_y:
1862 case ir_binop_pack_half_2x16_split:
1863 case ir_unop_interpolate_at_centroid:
1864 case ir_binop_interpolate_at_sample:
1865 case ir_binop_interpolate_at_offset:
1866 unreachable("not reached: should not occur in vertex shader");
1867 case ir_binop_ldexp:
1868 unreachable("not reached: should be handled by ldexp_to_arith()");
1869 }
1870 }
1871
1872
1873 void
1874 vec4_visitor::visit(ir_swizzle *ir)
1875 {
1876 src_reg src;
1877 int i = 0;
1878 int swizzle[4];
1879
1880 /* Note that this is only swizzles in expressions, not those on the left
1881 * hand side of an assignment, which do write masking. See ir_assignment
1882 * for that.
1883 */
1884
1885 ir->val->accept(this);
1886 src = this->result;
1887 assert(src.file != BAD_FILE);
1888
1889 for (i = 0; i < ir->type->vector_elements; i++) {
1890 switch (i) {
1891 case 0:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1893 break;
1894 case 1:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1896 break;
1897 case 2:
1898 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1899 break;
1900 case 3:
1901 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1902 break;
1903 }
1904 }
1905 for (; i < 4; i++) {
1906 /* Replicate the last channel out. */
1907 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1908 }
1909
1910 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1911
1912 this->result = src;
1913 }
1914
1915 void
1916 vec4_visitor::visit(ir_dereference_variable *ir)
1917 {
1918 const struct glsl_type *type = ir->type;
1919 dst_reg *reg = variable_storage(ir->var);
1920
1921 if (!reg) {
1922 fail("Failed to find variable storage for %s\n", ir->var->name);
1923 this->result = src_reg(brw_null_reg());
1924 return;
1925 }
1926
1927 this->result = src_reg(*reg);
1928
1929 /* System values get their swizzle from the dst_reg writemask */
1930 if (ir->var->data.mode == ir_var_system_value)
1931 return;
1932
1933 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1934 this->result.swizzle = swizzle_for_size(type->vector_elements);
1935 }
1936
1937
1938 int
1939 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1940 {
1941 /* Under normal circumstances array elements are stored consecutively, so
1942 * the stride is equal to the size of the array element.
1943 */
1944 return type_size(ir->type);
1945 }
1946
1947
1948 void
1949 vec4_visitor::visit(ir_dereference_array *ir)
1950 {
1951 ir_constant *constant_index;
1952 src_reg src;
1953 int array_stride = compute_array_stride(ir);
1954
1955 constant_index = ir->array_index->constant_expression_value();
1956
1957 ir->array->accept(this);
1958 src = this->result;
1959
1960 if (constant_index) {
1961 src.reg_offset += constant_index->value.i[0] * array_stride;
1962 } else {
1963 /* Variable index array dereference. It eats the "vec4" of the
1964 * base of the array and an index that offsets the Mesa register
1965 * index.
1966 */
1967 ir->array_index->accept(this);
1968
1969 src_reg index_reg;
1970
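/* Example (illustrative): an array of mat4 has array_stride 4 (four
 * registers per element), so the GLSL index is multiplied by 4 below
 * before being used as a relative address; a vec4 array has stride 1
 * and needs no MUL.
 */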
1971 if (array_stride == 1) {
1972 index_reg = this->result;
1973 } else {
1974 index_reg = src_reg(this, glsl_type::int_type);
1975
1976 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1977 }
1978
1979 if (src.reladdr) {
1980 src_reg temp = src_reg(this, glsl_type::int_type);
1981
1982 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1983
1984 index_reg = temp;
1985 }
1986
1987 src.reladdr = ralloc(mem_ctx, src_reg);
1988 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1989 }
1990
1991 /* If the type is smaller than a vec4, replicate the last channel out. */
1992 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1993 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1994 else
1995 src.swizzle = BRW_SWIZZLE_NOOP;
1996 src.type = brw_type_for_base_type(ir->type);
1997
1998 this->result = src;
1999 }
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_record *ir)
2003 {
2004 unsigned int i;
2005 const glsl_type *struct_type = ir->record->type;
2006 int offset = 0;
2007
2008 ir->record->accept(this);
2009
2010 for (i = 0; i < struct_type->length; i++) {
2011 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2012 break;
2013 offset += type_size(struct_type->fields.structure[i].type);
2014 }
2015
2016 /* If the type is smaller than a vec4, replicate the last channel out. */
2017 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2018 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2019 else
2020 this->result.swizzle = BRW_SWIZZLE_NOOP;
2021 this->result.type = brw_type_for_base_type(ir->type);
2022
2023 this->result.reg_offset += offset;
2024 }
2025
2026 /**
2027 * We want to be careful in assignment setup to hit the actual storage
2028 * instead of potentially using a temporary like we might with the
2029 * ir_dereference handler.
2030 */
2031 static dst_reg
2032 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2033 {
2034 /* The LHS must be a dereference. If the LHS is a variable indexed array
2035 * access of a vector, it must be separated into a series of conditional moves
2036 * before reaching this point (see ir_vec_index_to_cond_assign).
2037 */
2038 assert(ir->as_dereference());
2039 ir_dereference_array *deref_array = ir->as_dereference_array();
2040 if (deref_array) {
2041 assert(!deref_array->array->type->is_vector());
2042 }
2043
2044 /* Use the rvalue deref handler for the most part. We'll ignore
2045 * swizzles in it and write swizzles using writemask, though.
2046 */
2047 ir->accept(v);
2048 return dst_reg(v->result);
2049 }
2050
2051 void
2052 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2053 const struct glsl_type *type,
2054 enum brw_predicate predicate)
2055 {
2056 if (type->base_type == GLSL_TYPE_STRUCT) {
2057 for (unsigned int i = 0; i < type->length; i++) {
2058 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2059 }
2060 return;
2061 }
2062
2063 if (type->is_array()) {
2064 for (unsigned int i = 0; i < type->length; i++) {
2065 emit_block_move(dst, src, type->fields.array, predicate);
2066 }
2067 return;
2068 }
2069
2070 if (type->is_matrix()) {
2071 const struct glsl_type *vec_type;
2072
2073 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2074 type->vector_elements, 1);
2075
2076 for (int i = 0; i < type->matrix_columns; i++) {
2077 emit_block_move(dst, src, vec_type, predicate);
2078 }
2079 return;
2080 }
2081
2082 assert(type->is_scalar() || type->is_vector());
2083
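/* Example (illustrative): a vec3 gets writemask XYZ ((1 << 3) - 1) and a
 * swizzle that replicates the last channel (XYZZ), so the unused fourth
 * source channel is never referenced.
 */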
2084 dst->type = brw_type_for_base_type(type);
2085 src->type = dst->type;
2086
2087 dst->writemask = (1 << type->vector_elements) - 1;
2088
2089 src->swizzle = swizzle_for_size(type->vector_elements);
2090
2091 vec4_instruction *inst = emit(MOV(*dst, *src));
2092 inst->predicate = predicate;
2093
2094 dst->reg_offset++;
2095 src->reg_offset++;
2096 }
2097
2098
2099 /* If the RHS processing resulted in an instruction generating a
2100 * temporary value, and it would be easy to rewrite the instruction to
2101 * generate its result right into the LHS instead, do so. This ends
2102 * up reliably removing instructions where it can be tricky to do so
2103 * later without real UD chain information.
2104 */
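/* For example, for a plain `v = a + b;` the ADD emitted while visiting the
 * RHS initially writes a temporary; when the rewrite succeeds, that ADD is
 * retargeted at v's storage and visit(ir_assignment) skips the copy MOV.
 */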
2105 bool
2106 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2107 dst_reg dst,
2108 src_reg src,
2109 vec4_instruction *pre_rhs_inst,
2110 vec4_instruction *last_rhs_inst)
2111 {
2112 /* This could be supported, but it would take more smarts. */
2113 if (ir->condition)
2114 return false;
2115
2116 if (pre_rhs_inst == last_rhs_inst)
2117 return false; /* No instructions generated to work with. */
2118
2119 /* Make sure the last instruction generated our source reg. */
2120 if (src.file != GRF ||
2121 src.file != last_rhs_inst->dst.file ||
2122 src.reg != last_rhs_inst->dst.reg ||
2123 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2124 src.reladdr ||
2125 src.abs ||
2126 src.negate ||
2127 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2128 return false;
2129
2130 /* Check that the last instruction fully initialized the channels
2131 * we want to use, in the order we want to use them. We could
2132 * potentially reswizzle the operands of many instructions so that
2133 * we could handle out of order channels, but don't yet.
2134 */
2135
2136 for (unsigned i = 0; i < 4; i++) {
2137 if (dst.writemask & (1 << i)) {
2138 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2139 return false;
2140
2141 if (BRW_GET_SWZ(src.swizzle, i) != i)
2142 return false;
2143 }
2144 }
2145
2146 /* Success! Rewrite the instruction. */
2147 last_rhs_inst->dst.file = dst.file;
2148 last_rhs_inst->dst.reg = dst.reg;
2149 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2150 last_rhs_inst->dst.reladdr = dst.reladdr;
2151 last_rhs_inst->dst.writemask &= dst.writemask;
2152
2153 return true;
2154 }
2155
2156 void
2157 vec4_visitor::visit(ir_assignment *ir)
2158 {
2159 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2160 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2161
2162 if (!ir->lhs->type->is_scalar() &&
2163 !ir->lhs->type->is_vector()) {
2164 ir->rhs->accept(this);
2165 src_reg src = this->result;
2166
2167 if (ir->condition) {
2168 emit_bool_to_cond_code(ir->condition, &predicate);
2169 }
2170
2171 /* emit_block_move doesn't account for swizzles in the source register.
2172 * This should be ok, since the source register is a structure or an
2173 * array, and those can't be swizzled. But double-check to be sure.
2174 */
2175 assert(src.swizzle ==
2176 (ir->rhs->type->is_matrix()
2177 ? swizzle_for_size(ir->rhs->type->vector_elements)
2178 : BRW_SWIZZLE_NOOP));
2179
2180 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2181 return;
2182 }
2183
2184 /* Now we're down to just a scalar/vector with writemasks. */
2185 int i;
2186
2187 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2188 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2189
2190 ir->rhs->accept(this);
2191
2192 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2193
2194 src_reg src = this->result;
2195
2196 int swizzles[4];
2197 int first_enabled_chan = 0;
2198 int src_chan = 0;
2199
2200 assert(ir->lhs->type->is_vector() ||
2201 ir->lhs->type->is_scalar());
2202 dst.writemask = ir->write_mask;
2203
2204 for (int i = 0; i < 4; i++) {
2205 if (dst.writemask & (1 << i)) {
2206 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2207 break;
2208 }
2209 }
2210
2211 /* Swizzle a small RHS vector into the channels being written.
2212 *
2213 * GLSL IR treats write_mask as dictating how many channels are
2214 * present on the RHS, while in our instructions we need to make
2215 * those channels appear in the slots of the vec4 they're written to.
2216 */
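/* Example (illustrative): for `v.yz = u.xy` the RHS arrives swizzled XYYY
 * and write_mask is YZ; the loop builds YXYY, so dst.y reads src.x, dst.z
 * reads src.y, and the unwritten channels point at a defined component.
 */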
2217 for (int i = 0; i < 4; i++) {
2218 if (dst.writemask & (1 << i))
2219 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2220 else
2221 swizzles[i] = first_enabled_chan;
2222 }
2223 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2224 swizzles[2], swizzles[3]);
2225
2226 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2227 return;
2228 }
2229
2230 if (ir->condition) {
2231 emit_bool_to_cond_code(ir->condition, &predicate);
2232 }
2233
2234 for (i = 0; i < type_size(ir->lhs->type); i++) {
2235 vec4_instruction *inst = emit(MOV(dst, src));
2236 inst->predicate = predicate;
2237
2238 dst.reg_offset++;
2239 src.reg_offset++;
2240 }
2241 }
2242
2243 void
2244 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2245 {
2246 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2247 foreach_in_list(ir_constant, field_value, &ir->components) {
2248 emit_constant_values(dst, field_value);
2249 }
2250 return;
2251 }
2252
2253 if (ir->type->is_array()) {
2254 for (unsigned int i = 0; i < ir->type->length; i++) {
2255 emit_constant_values(dst, ir->array_elements[i]);
2256 }
2257 return;
2258 }
2259
2260 if (ir->type->is_matrix()) {
2261 for (int i = 0; i < ir->type->matrix_columns; i++) {
2262 float *vec = &ir->value.f[i * ir->type->vector_elements];
2263
2264 for (int j = 0; j < ir->type->vector_elements; j++) {
2265 dst->writemask = 1 << j;
2266 dst->type = BRW_REGISTER_TYPE_F;
2267
2268 emit(MOV(*dst, src_reg(vec[j])));
2269 }
2270 dst->reg_offset++;
2271 }
2272 return;
2273 }
2274
2275 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2276
2277 for (int i = 0; i < ir->type->vector_elements; i++) {
2278 if (!(remaining_writemask & (1 << i)))
2279 continue;
2280
2281 dst->writemask = 1 << i;
2282 dst->type = brw_type_for_base_type(ir->type);
2283
2284 /* Find other components that match the one we're about to
2285 * write. Emits fewer instructions for things like vec4(0.5,
2286 * 1.5, 1.5, 1.5).
2287 */
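/* In that vec4(0.5, 1.5, 1.5, 1.5) example, the first pass writes .x with
 * 0.5 and the second writes .yzw with 1.5, so two MOVs replace four.
 */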
2288 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2289 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2290 if (ir->value.b[i] == ir->value.b[j])
2291 dst->writemask |= (1 << j);
2292 } else {
2293 /* u, i, and f storage all line up, so no need for a
2294 * switch case for comparing each type.
2295 */
2296 if (ir->value.u[i] == ir->value.u[j])
2297 dst->writemask |= (1 << j);
2298 }
2299 }
2300
2301 switch (ir->type->base_type) {
2302 case GLSL_TYPE_FLOAT:
2303 emit(MOV(*dst, src_reg(ir->value.f[i])));
2304 break;
2305 case GLSL_TYPE_INT:
2306 emit(MOV(*dst, src_reg(ir->value.i[i])));
2307 break;
2308 case GLSL_TYPE_UINT:
2309 emit(MOV(*dst, src_reg(ir->value.u[i])));
2310 break;
2311 case GLSL_TYPE_BOOL:
2312 emit(MOV(*dst,
2313 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2314 : 0u)));
2315 break;
2316 default:
2317 unreachable("Non-float/uint/int/bool constant");
2318 }
2319
2320 remaining_writemask &= ~dst->writemask;
2321 }
2322 dst->reg_offset++;
2323 }
2324
2325 void
2326 vec4_visitor::visit(ir_constant *ir)
2327 {
2328 dst_reg dst = dst_reg(this, ir->type);
2329 this->result = src_reg(dst);
2330
2331 emit_constant_values(&dst, ir);
2332 }
2333
2334 void
2335 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2336 {
2337 ir_dereference *deref = static_cast<ir_dereference *>(
2338 ir->actual_parameters.get_head());
2339 ir_variable *location = deref->variable_referenced();
2340 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2341 location->data.binding);
2342
2343 /* Calculate the surface offset */
2344 src_reg offset(this, glsl_type::uint_type);
2345 ir_dereference_array *deref_array = deref->as_dereference_array();
2346 if (deref_array) {
2347 deref_array->array_index->accept(this);
2348
2349 src_reg tmp(this, glsl_type::uint_type);
2350 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2351 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2352 } else {
2353 offset = location->data.atomic.offset;
2354 }
2355
2356 /* Emit the appropriate machine instruction */
2357 const char *callee = ir->callee->function_name();
2358 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2359
2360 if (!strcmp("__intrinsic_atomic_read", callee)) {
2361 emit_untyped_surface_read(surf_index, dst, offset);
2362
2363 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2364 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2365 src_reg(), src_reg());
2366
2367 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2368 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2369 src_reg(), src_reg());
2370 }
2371 }
2372
2373 void
2374 vec4_visitor::visit(ir_call *ir)
2375 {
2376 const char *callee = ir->callee->function_name();
2377
2378 if (!strcmp("__intrinsic_atomic_read", callee) ||
2379 !strcmp("__intrinsic_atomic_increment", callee) ||
2380 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2381 visit_atomic_counter_intrinsic(ir);
2382 } else {
2383 unreachable("Unsupported intrinsic.");
2384 }
2385 }
2386
2387 src_reg
2388 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2389 {
2390 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2391 inst->base_mrf = 2;
2392 inst->mlen = 1;
2393 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2394 inst->dst.writemask = WRITEMASK_XYZW;
2395
2396 inst->src[1] = sampler;
2397
2398 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2399 int param_base = inst->base_mrf;
2400 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2401 int zero_mask = 0xf & ~coord_mask;
2402
2403 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2404 coordinate));
2405
2406 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2407 src_reg(0)));
2408
2409 emit(inst);
2410 return src_reg(inst->dst);
2411 }
2412
2413 static bool
2414 is_high_sampler(struct brw_context *brw, src_reg sampler)
2415 {
2416 if (brw->gen < 8 && !brw->is_haswell)
2417 return false;
2418
2419 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_texture *ir)
2424 {
2425 uint32_t sampler =
2426 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2427
2428 ir_rvalue *nonconst_sampler_index =
2429 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2430
2431 /* Handle non-constant sampler array indexing */
2432 src_reg sampler_reg;
2433 if (nonconst_sampler_index) {
2434 /* The highest sampler which may be used by this operation is
2435 * the last element of the array. Mark it here, because the generator
2436 * doesn't have enough information to determine the bound.
2437 */
2438 uint32_t array_size = ir->sampler->as_dereference_array()
2439 ->array->type->array_size();
2440
2441 uint32_t max_used = sampler + array_size - 1;
2442 if (ir->op == ir_tg4 && brw->gen < 8) {
2443 max_used += prog_data->base.binding_table.gather_texture_start;
2444 } else {
2445 max_used += prog_data->base.binding_table.texture_start;
2446 }
2447
2448 brw_mark_surface_used(&prog_data->base, max_used);
2449
2450 /* Emit code to evaluate the actual indexing expression */
2451 nonconst_sampler_index->accept(this);
2452 dst_reg temp(this, glsl_type::uint_type);
2453 emit(ADD(temp, this->result, src_reg(sampler)))
2454 ->force_writemask_all = true;
2455 sampler_reg = src_reg(temp);
2456 } else {
2457 /* Single sampler, or constant array index; the indexing expression
2458 * is just an immediate.
2459 */
2460 sampler_reg = src_reg(sampler);
2461 }
2462
2463 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2464 * emitting anything other than setting up the constant result.
2465 */
2466 if (ir->op == ir_tg4) {
2467 ir_constant *chan = ir->lod_info.component->as_constant();
2468 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2469 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2470 dst_reg result(this, ir->type);
2471 this->result = src_reg(result);
2472 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2473 return;
2474 }
2475 }
2476
2477 /* Should be lowered by do_lower_texture_projection */
2478 assert(!ir->projector);
2479
2480 /* Should be lowered */
2481 assert(!ir->offset || !ir->offset->type->is_array());
2482
2483 /* Generate code to compute all the subexpression trees. This has to be
2484 * done before loading any values into MRFs for the sampler message since
2485 * generating these values may involve SEND messages that need the MRFs.
2486 */
2487 src_reg coordinate;
2488 if (ir->coordinate) {
2489 ir->coordinate->accept(this);
2490 coordinate = this->result;
2491 }
2492
2493 src_reg shadow_comparitor;
2494 if (ir->shadow_comparitor) {
2495 ir->shadow_comparitor->accept(this);
2496 shadow_comparitor = this->result;
2497 }
2498
2499 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2500 src_reg offset_value;
2501 if (has_nonconstant_offset) {
2502 ir->offset->accept(this);
2503 offset_value = src_reg(this->result);
2504 }
2505
2506 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2507 src_reg lod, dPdx, dPdy, sample_index, mcs;
2508 switch (ir->op) {
2509 case ir_tex:
2510 lod = src_reg(0.0f);
2511 lod_type = glsl_type::float_type;
2512 break;
2513 case ir_txf:
2514 case ir_txl:
2515 case ir_txs:
2516 ir->lod_info.lod->accept(this);
2517 lod = this->result;
2518 lod_type = ir->lod_info.lod->type;
2519 break;
2520 case ir_query_levels:
2521 lod = src_reg(0);
2522 lod_type = glsl_type::int_type;
2523 break;
2524 case ir_txf_ms:
2525 ir->lod_info.sample_index->accept(this);
2526 sample_index = this->result;
2527 sample_index_type = ir->lod_info.sample_index->type;
2528
2529 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2530 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2531 else
2532 mcs = src_reg(0u);
2533 break;
2534 case ir_txd:
2535 ir->lod_info.grad.dPdx->accept(this);
2536 dPdx = this->result;
2537
2538 ir->lod_info.grad.dPdy->accept(this);
2539 dPdy = this->result;
2540
2541 lod_type = ir->lod_info.grad.dPdx->type;
2542 break;
2543 case ir_txb:
2544 case ir_lod:
2545 case ir_tg4:
2546 break;
2547 }
2548
2549 enum opcode opcode;
2550 switch (ir->op) {
2551 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2552 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2553 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2554 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2555 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2556 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2557 case ir_tg4: opcode = has_nonconstant_offset
2558 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2559 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2560 case ir_txb:
2561 unreachable("TXB is not valid for vertex shaders.");
2562 case ir_lod:
2563 unreachable("LOD is not valid for vertex shaders.");
2564 default:
2565 unreachable("Unrecognized tex op");
2566 }
2567
2568 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2569
2570 if (ir->offset != NULL && !has_nonconstant_offset) {
2571 inst->offset =
2572 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2573 ir->offset->type->vector_elements);
2574 }
2575
2576 /* Stuff the channel select bits in the top of the texture offset */
2577 if (ir->op == ir_tg4)
2578 inst->offset |= gather_channel(ir, sampler) << 16;
2579
2580 /* The message header is necessary for:
2581 * - Gen4 (always)
2582 * - Texel offsets
2583 * - Gather channel selection
2584 * - Sampler indices too large to fit in a 4-bit value.
2585 */
2586 inst->header_present =
2587 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2588 is_high_sampler(brw, sampler_reg);
2589 inst->base_mrf = 2;
2590 inst->mlen = inst->header_present + 1; /* always at least one */
2591 inst->dst = dst_reg(this, ir->type);
2592 inst->dst.writemask = WRITEMASK_XYZW;
2593 inst->shadow_compare = ir->shadow_comparitor != NULL;
2594
2595 inst->src[1] = sampler_reg;
2596
2597 /* MRF for the first parameter */
2598 int param_base = inst->base_mrf + inst->header_present;
2599
2600 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2601 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2602 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2603 } else {
2604 /* Load the coordinate */
2605 /* FINISHME: gl_clamp_mask and saturate */
2606 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2607 int zero_mask = 0xf & ~coord_mask;
2608
2609 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2610 coordinate));
2611
2612 if (zero_mask != 0) {
2613 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2614 src_reg(0)));
2615 }
2616 /* Load the shadow comparitor */
2617 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2618 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2619 WRITEMASK_X),
2620 shadow_comparitor));
2621 inst->mlen++;
2622 }
2623
2624 /* Load the LOD info */
2625 if (ir->op == ir_tex || ir->op == ir_txl) {
2626 int mrf, writemask;
2627 if (brw->gen >= 5) {
2628 mrf = param_base + 1;
2629 if (ir->shadow_comparitor) {
2630 writemask = WRITEMASK_Y;
2631 /* mlen already incremented */
2632 } else {
2633 writemask = WRITEMASK_X;
2634 inst->mlen++;
2635 }
2636 } else /* brw->gen == 4 */ {
2637 mrf = param_base;
2638 writemask = WRITEMASK_W;
2639 }
2640 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2641 } else if (ir->op == ir_txf) {
2642 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2643 } else if (ir->op == ir_txf_ms) {
2644 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2645 sample_index));
2646 if (brw->gen >= 7) {
2647 /* MCS data is in the first channel of `mcs`, but we need to get it into
2648 * the .y channel of the second vec4 of params, so replicate .x across
2649 * the whole vec4 and then mask off everything except .y
2650 */
2651 mcs.swizzle = BRW_SWIZZLE_XXXX;
2652 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2653 mcs));
2654 }
2655 inst->mlen++;
2656 } else if (ir->op == ir_txd) {
2657 const glsl_type *type = lod_type;
2658
2659 if (brw->gen >= 5) {
2660 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2661 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2663 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2664 inst->mlen++;
2665
2666 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2667 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2668 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2670 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2671 inst->mlen++;
2672
2673 if (ir->shadow_comparitor) {
2674 emit(MOV(dst_reg(MRF, param_base + 2,
2675 ir->shadow_comparitor->type, WRITEMASK_Z),
2676 shadow_comparitor));
2677 }
2678 }
2679 } else /* brw->gen == 4 */ {
2680 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2681 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2682 inst->mlen += 2;
2683 }
2684 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2685 if (ir->shadow_comparitor) {
2686 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2687 shadow_comparitor));
2688 }
2689
2690 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2691 offset_value));
2692 inst->mlen++;
2693 }
2694 }
2695
2696 emit(inst);
2697
2698 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2699 * faces * layers, but the spec requires just layers.
2700 */
2701 if (ir->op == ir_txs) {
2702 glsl_type const *type = ir->sampler->type;
2703 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2704 type->sampler_array) {
2705 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2706 writemask(inst->dst, WRITEMASK_Z),
2707 src_reg(inst->dst), src_reg(6));
2708 }
2709 }
2710
2711 if (brw->gen == 6 && ir->op == ir_tg4) {
2712 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2713 }
2714
2715 swizzle_result(ir, src_reg(inst->dst), sampler);
2716 }
2717
2718 /**
2719 * Apply workarounds for Gen6 gather with UINT/SINT
2720 */
2721 void
2722 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2723 {
2724 if (!wa)
2725 return;
2726
2727 int width = (wa & WA_8BIT) ? 8 : 16;
2728 dst_reg dst_f = dst;
2729 dst_f.type = BRW_REGISTER_TYPE_F;
2730
2731 /* Convert from UNORM to UINT */
2732 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2733 emit(MOV(dst, src_reg(dst_f)));
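/* With an 8-bit surface the scale above is (1 << 8) - 1 = 255.0f, turning
 * the 0..1 UNORM result back into the 0..255 integer the gather should
 * have returned.
 */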
2734
2735 if (wa & WA_SIGN) {
2736 /* Reinterpret the UINT value as a signed INT value by
2737 * shifting the sign bit into place, then shifting back
2738 * preserving sign.
2739 */
2740 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2741 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2742 }
2743 }
2744
2745 /**
2746 * Set up the gather channel based on the swizzle, for gather4.
2747 */
2748 uint32_t
2749 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2750 {
2751 ir_constant *chan = ir->lod_info.component->as_constant();
2752 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2753 switch (swiz) {
2754 case SWIZZLE_X: return 0;
2755 case SWIZZLE_Y:
2756 /* gather4 sampler is broken for green channel on RG32F --
2757 * we must ask for blue instead.
2758 */
2759 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2760 return 2;
2761 return 1;
2762 case SWIZZLE_Z: return 2;
2763 case SWIZZLE_W: return 3;
2764 default:
2765 unreachable("Not reached"); /* zero, one swizzles handled already */
2766 }
2767 }
2768
2769 void
2770 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2771 {
2772 int s = key->tex.swizzles[sampler];
2773
2774 this->result = src_reg(this, ir->type);
2775 dst_reg swizzled_result(this->result);
2776
2777 if (ir->op == ir_query_levels) {
2778 /* # levels is in .w */
2779 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2780 emit(MOV(swizzled_result, orig_val));
2781 return;
2782 }
2783
2784 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2785 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2786 emit(MOV(swizzled_result, orig_val));
2787 return;
2788 }
2789
2790
2791 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2792 int swizzle[4] = {0};
2793
2794 for (int i = 0; i < 4; i++) {
2795 switch (GET_SWZ(s, i)) {
2796 case SWIZZLE_ZERO:
2797 zero_mask |= (1 << i);
2798 break;
2799 case SWIZZLE_ONE:
2800 one_mask |= (1 << i);
2801 break;
2802 default:
2803 copy_mask |= (1 << i);
2804 swizzle[i] = GET_SWZ(s, i);
2805 break;
2806 }
2807 }
2808
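/* Example (illustrative): a swizzle of (R, R, R, ONE) yields copy_mask XYZ
 * with the source read as XXXX, plus one_mask W, so one data MOV and one
 * immediate MOV cover all four channels.
 */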
2809 if (copy_mask) {
2810 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2811 swizzled_result.writemask = copy_mask;
2812 emit(MOV(swizzled_result, orig_val));
2813 }
2814
2815 if (zero_mask) {
2816 swizzled_result.writemask = zero_mask;
2817 emit(MOV(swizzled_result, src_reg(0.0f)));
2818 }
2819
2820 if (one_mask) {
2821 swizzled_result.writemask = one_mask;
2822 emit(MOV(swizzled_result, src_reg(1.0f)));
2823 }
2824 }
2825
2826 void
2827 vec4_visitor::visit(ir_return *)
2828 {
2829 unreachable("not reached");
2830 }
2831
2832 void
2833 vec4_visitor::visit(ir_discard *)
2834 {
2835 unreachable("not reached");
2836 }
2837
2838 void
2839 vec4_visitor::visit(ir_if *ir)
2840 {
2841 /* Don't point the annotation at the if statement, because then it, plus
2842 * the then and else blocks, all get printed.
2843 */
2844 this->base_ir = ir->condition;
2845
2846 if (brw->gen == 6) {
2847 emit_if_gen6(ir);
2848 } else {
2849 enum brw_predicate predicate;
2850 emit_bool_to_cond_code(ir->condition, &predicate);
2851 emit(IF(predicate));
2852 }
2853
2854 visit_instructions(&ir->then_instructions);
2855
2856 if (!ir->else_instructions.is_empty()) {
2857 this->base_ir = ir->condition;
2858 emit(BRW_OPCODE_ELSE);
2859
2860 visit_instructions(&ir->else_instructions);
2861 }
2862
2863 this->base_ir = ir->condition;
2864 emit(BRW_OPCODE_ENDIF);
2865 }
2866
2867 void
2868 vec4_visitor::visit(ir_emit_vertex *)
2869 {
2870 unreachable("not reached");
2871 }
2872
2873 void
2874 vec4_visitor::visit(ir_end_primitive *)
2875 {
2876 unreachable("not reached");
2877 }
2878
2879 void
2880 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2881 dst_reg dst, src_reg offset,
2882 src_reg src0, src_reg src1)
2883 {
2884 unsigned mlen = 0;
2885
2886 /* Set the atomic operation offset. */
2887 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2888 mlen++;
2889
2890 /* Set the atomic operation arguments. */
2891 if (src0.file != BAD_FILE) {
2892 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2893 mlen++;
2894 }
2895
2896 if (src1.file != BAD_FILE) {
2897 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2898 mlen++;
2899 }
2900
2901 /* Emit the instruction. Note that this maps to the normal SIMD8
2902 * untyped atomic message on Ivy Bridge, but that's OK because
2903 * unused channels will be masked out.
2904 */
2905 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2906 src_reg(atomic_op), src_reg(surf_index));
2907 inst->base_mrf = 0;
2908 inst->mlen = mlen;
2909 }
2910
2911 void
2912 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2913 src_reg offset)
2914 {
2915 /* Set the surface read offset. */
2916 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2917
2918 /* Emit the instruction. Note that this maps to the normal SIMD8
2919 * untyped surface read message, but that's OK because unused
2920 * channels will be masked out.
2921 */
2922 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2923 dst, src_reg(surf_index));
2924 inst->base_mrf = 0;
2925 inst->mlen = 1;
2926 }
2927
2928 void
2929 vec4_visitor::emit_ndc_computation()
2930 {
2931 /* Get the position */
2932 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2933
2934 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2935 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2936 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2937
2938 current_annotation = "NDC";
2939 dst_reg ndc_w = ndc;
2940 ndc_w.writemask = WRITEMASK_W;
2941 src_reg pos_w = pos;
2942 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2943 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2944
2945 dst_reg ndc_xyz = ndc;
2946 ndc_xyz.writemask = WRITEMASK_XYZ;
2947
2948 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2949 }
2950
2951 void
2952 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2953 {
2954 if (brw->gen < 6 &&
2955 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2956 key->userclip_active || brw->has_negative_rhw_bug)) {
2957 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2958 dst_reg header1_w = header1;
2959 header1_w.writemask = WRITEMASK_W;
2960
2961 emit(MOV(header1, 0u));
2962
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2965
2966 current_annotation = "Point size";
2967 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2968 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2969 }
2970
2971 if (key->userclip_active) {
2972 current_annotation = "Clipping flags";
2973 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2974 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2975
2976 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2977 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2978 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2979
2980 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2981 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2982 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2983 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2984 }
2985
2986 /* i965 clipping workaround:
2987 * 1) Test for -ve rhw
2988 * 2) If set,
2989 * set ndc = (0,0,0,0)
2990 * set ucp[6] = 1
2991 *
2992 * Later, clipping will detect ucp[6] and ensure the primitive is
2993 * clipped against all fixed planes.
2994 */
2995 if (brw->has_negative_rhw_bug) {
2996 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2997 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2998 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2999 vec4_instruction *inst;
3000 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3001 inst->predicate = BRW_PREDICATE_NORMAL;
3002 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3003 inst->predicate = BRW_PREDICATE_NORMAL;
3004 }
3005
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3007 } else if (brw->gen < 6) {
3008 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3009 } else {
3010 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3011 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3012 dst_reg reg_w = reg;
3013 reg_w.writemask = WRITEMASK_W;
3014 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3015 }
3016 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3017 dst_reg reg_y = reg;
3018 reg_y.writemask = WRITEMASK_Y;
3019 reg_y.type = BRW_REGISTER_TYPE_D;
3020 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3021 }
3022 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3023 dst_reg reg_z = reg;
3024 reg_z.writemask = WRITEMASK_Z;
3025 reg_z.type = BRW_REGISTER_TYPE_D;
3026 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3027 }
3028 }
3029 }
3030
3031 void
3032 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3033 {
3034 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3035 *
3036 * "If a linked set of shaders forming the vertex stage contains no
3037 * static write to gl_ClipVertex or gl_ClipDistance, but the
3038 * application has requested clipping against user clip planes through
3039 * the API, then the coordinate written to gl_Position is used for
3040 * comparison against the user clip planes."
3041 *
3042 * This function is only called if the shader didn't write to
3043 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3044 * if the user wrote to it; otherwise we use gl_Position.
3045 */
3046 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3047 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3048 clip_vertex = VARYING_SLOT_POS;
3049 }
3050
3051 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3052 ++i) {
3053 reg.writemask = 1 << i;
3054 emit(DP4(reg,
3055 src_reg(output_reg[clip_vertex]),
3056 src_reg(this->userplane[i + offset])));
3057 }
3058 }
3059
3060 vec4_instruction *
3061 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3062 {
3063 assert (varying < VARYING_SLOT_MAX);
3064 reg.type = output_reg[varying].type;
3065 current_annotation = output_reg_annotation[varying];
3066 /* Copy the register, saturating if necessary */
3067 return emit(MOV(reg, src_reg(output_reg[varying])));
3068 }
3069
3070 void
3071 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3072 {
3073 reg.type = BRW_REGISTER_TYPE_F;
3074
3075 switch (varying) {
3076 case VARYING_SLOT_PSIZ:
3077 {
3078 /* PSIZ is always in slot 0, and is coupled with other flags. */
3079 current_annotation = "indices, point width, clip flags";
3080 emit_psiz_and_flags(reg);
3081 break;
3082 }
3083 case BRW_VARYING_SLOT_NDC:
3084 current_annotation = "NDC";
3085 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3086 break;
3087 case VARYING_SLOT_POS:
3088 current_annotation = "gl_Position";
3089 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3090 break;
3091 case VARYING_SLOT_EDGE:
3092 /* This is present when doing unfilled polygons. We're supposed to copy
3093 * the edge flag from the user-provided vertex array
3094 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3095 * of that attribute (starts as 1.0f). This is then used in clipping to
3096 * determine which edges should be drawn as wireframe.
3097 */
3098 current_annotation = "edge flag";
3099 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3100 glsl_type::float_type, WRITEMASK_XYZW))));
3101 break;
3102 case BRW_VARYING_SLOT_PAD:
3103 /* No need to write to this slot */
3104 break;
3105 case VARYING_SLOT_COL0:
3106 case VARYING_SLOT_COL1:
3107 case VARYING_SLOT_BFC0:
3108 case VARYING_SLOT_BFC1: {
3109 /* These built-in varyings are only supported in compatibility mode,
3110 * and we only support GS in core profile. So, this must be a vertex
3111 * shader.
3112 */
3113 assert(stage == MESA_SHADER_VERTEX);
3114 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3115 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3116 inst->saturate = true;
3117 break;
3118 }
3119
3120 default:
3121 emit_generic_urb_slot(reg, varying);
3122 break;
3123 }
3124 }
3125
3126 static int
3127 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3128 {
3129 if (brw->gen >= 6) {
3130 /* URB data written (does not include the message header reg) must
3131 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3132 * section 5.4.3.2.2: URB_INTERLEAVED.
3133 *
3134 * URB entries are allocated on a multiple of 1024 bits, so an
3135 * extra 128 bits written here to make the end align to 256 is
3136 * no problem.
3137 */
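/* mlen includes the header register, so it must end up odd: e.g. a header
 * plus five data registers (mlen 6) is padded to mlen 7 so the data
 * portion stays a multiple of two registers.
 */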
3138 if ((mlen % 2) != 1)
3139 mlen++;
3140 }
3141
3142 return mlen;
3143 }
3144
3145
3146 /**
3147 * Generates the VUE payload plus the necessary URB write instructions to
3148 * output it.
3149 *
3150 * The VUE layout is documented in Volume 2a.
3151 */
3152 void
3153 vec4_visitor::emit_vertex()
3154 {
3155 /* MRF 0 is reserved for the debugger, so start with message header
3156 * in MRF 1.
3157 */
3158 int base_mrf = 1;
3159 int mrf = base_mrf;
3160 /* In the process of generating our URB write message contents, we
3161 * may need to unspill a register or load from an array. Those
3162 * reads would use MRFs 14-15.
3163 */
3164 int max_usable_mrf = 13;
3165
3166 /* The following assertion verifies that max_usable_mrf causes an
3167 * even number of URB write registers, which will meet gen6's
3168 * requirements for length alignment.
3169 */
3170 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3171
3172 /* First mrf is the g0-based message header containing URB handles and
3173 * such.
3174 */
3175 emit_urb_write_header(mrf++);
3176
3177 if (brw->gen < 6) {
3178 emit_ndc_computation();
3179 }
3180
3181 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3182 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3183 current_annotation = "user clip distances";
3184
3185 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3186 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3187
3188 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3189 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3190 }
3191
3192 /* We may need to split this up into several URB writes, so do them in a
3193 * loop.
3194 */
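/* For example (illustrative), a VUE map with 18 slots goes out as two
 * writes: slots 0-11 fill MRFs 2-13 in the first message, then slots
 * 12-17 are sent in a second message at URB row offset 12 / 2 = 6.
 */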
3195 int slot = 0;
3196 bool complete = false;
3197 do {
3198 /* URB offset is in URB row increments, and each of our MRFs is half of
3199 * one of those, since we're doing interleaved writes.
3200 */
3201 int offset = slot / 2;
3202
3203 mrf = base_mrf + 1;
3204 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3205 emit_urb_slot(dst_reg(MRF, mrf++),
3206 prog_data->vue_map.slot_to_varying[slot]);
3207
3208 /* If this was max_usable_mrf, we can't fit anything more into this
3209 * URB WRITE.
3210 */
3211 if (mrf > max_usable_mrf) {
3212 slot++;
3213 break;
3214 }
3215 }
3216
3217 complete = slot >= prog_data->vue_map.num_slots;
3218 current_annotation = "URB write";
3219 vec4_instruction *inst = emit_urb_write_opcode(complete);
3220 inst->base_mrf = base_mrf;
3221 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3222 inst->offset += offset;
3223 } while(!complete);
3224 }
3225
3226
3227 src_reg
3228 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3229 src_reg *reladdr, int reg_offset)
3230 {
3231 /* Because we store the values to scratch interleaved like our
3232 * vertex data, we need to scale the vec4 index by 2.
3233 */
3234 int message_header_scale = 2;
3235
3236 /* Pre-gen6, the message header uses byte offsets instead of vec4
3237 * (16-byte) offset units.
3238 */
3239 if (brw->gen < 6)
3240 message_header_scale *= 16;
3241
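/* Example (illustrative): reg_offset 3 becomes 6 on gen6+ (the vec4 index
 * doubled for the interleaved layout) and 3 * 2 * 16 = 96 bytes on older
 * parts.
 */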
3242 if (reladdr) {
3243 src_reg index = src_reg(this, glsl_type::int_type);
3244
3245 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3246 src_reg(reg_offset)));
3247 emit_before(block, inst, MUL(dst_reg(index), index,
3248 src_reg(message_header_scale)));
3249
3250 return index;
3251 } else {
3252 return src_reg(reg_offset * message_header_scale);
3253 }
3254 }
3255
3256 src_reg
3257 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3258 src_reg *reladdr, int reg_offset)
3259 {
3260 if (reladdr) {
3261 src_reg index = src_reg(this, glsl_type::int_type);
3262
3263 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3264 src_reg(reg_offset)));
3265
3266 /* Pre-gen6, the message header uses byte offsets instead of vec4
3267 * (16-byte) offset units.
3268 */
3269 if (brw->gen < 6) {
3270 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3271 }
3272
3273 return index;
3274 } else if (brw->gen >= 8) {
3275 /* Store the offset in a GRF so we can send-from-GRF. */
3276 src_reg offset = src_reg(this, glsl_type::int_type);
3277 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3278 return offset;
3279 } else {
3280 int message_header_scale = brw->gen < 6 ? 16 : 1;
3281 return src_reg(reg_offset * message_header_scale);
3282 }
3283 }
3284
3285 /**
3286 * Emits an instruction before @inst to load the value named by @orig_src
3287 * from scratch space at @base_offset to @temp.
3288 *
3289 * @base_offset is measured in 32-byte units (the size of a register).
3290 */
3291 void
3292 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3293 dst_reg temp, src_reg orig_src,
3294 int base_offset)
3295 {
3296 int reg_offset = base_offset + orig_src.reg_offset;
3297 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3298 reg_offset);
3299
3300 emit_before(block, inst, SCRATCH_READ(temp, index));
3301 }
3302
3303 /**
3304 * Emits an instruction after @inst to store the value to be written
3305 * to @orig_dst to scratch space at @base_offset, from @temp.
3306 *
3307 * @base_offset is measured in 32-byte units (the size of a register).
3308 */
3309 void
3310 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3311 int base_offset)
3312 {
3313 int reg_offset = base_offset + inst->dst.reg_offset;
3314 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3315 reg_offset);
3316
3317 /* Create a temporary register to store *inst's result in.
3318 *
3319 * We have to be careful in MOVing from our temporary result register in
3320 * the scratch write. If we swizzle from channels of the temporary that
3321 * weren't initialized, it will confuse live interval analysis, which will
3322 * make spilling fail to make progress.
3323 */
3324 src_reg temp = src_reg(this, glsl_type::vec4_type);
3325 temp.type = inst->dst.type;
3326 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3327 int swizzles[4];
3328 for (int i = 0; i < 4; i++)
3329 if (inst->dst.writemask & (1 << i))
3330 swizzles[i] = i;
3331 else
3332 swizzles[i] = first_writemask_chan;
3333 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3334 swizzles[2], swizzles[3]);
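/* Example (illustrative): a .xz write reads the temporary with swizzle
 * XXZX, so no channel left uninitialized by the rewritten instruction is
 * ever referenced.
 */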
3335
3336 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3337 inst->dst.writemask));
3338 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3339 write->predicate = inst->predicate;
3340 write->ir = inst->ir;
3341 write->annotation = inst->annotation;
3342 inst->insert_after(block, write);
3343
3344 inst->dst.file = temp.file;
3345 inst->dst.reg = temp.reg;
3346 inst->dst.reg_offset = temp.reg_offset;
3347 inst->dst.reladdr = NULL;
3348 }
3349
3350 /**
3351 * We can't generally support array access in GRF space, because a
3352 * single instruction's destination can only span 2 contiguous
3353 * registers. So, we send all GRF arrays that get variable index
3354 * access to scratch space.
3355 */
3356 void
3357 vec4_visitor::move_grf_array_access_to_scratch()
3358 {
3359 int scratch_loc[this->virtual_grf_count];
3360 memset(scratch_loc, -1, sizeof(scratch_loc));
3361
3362 /* First, calculate the set of virtual GRFs that need to be punted
3363 * to scratch due to having any array access on them, and where in
3364 * scratch.
3365 */
3366 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3367 if (inst->dst.file == GRF && inst->dst.reladdr &&
3368 scratch_loc[inst->dst.reg] == -1) {
3369 scratch_loc[inst->dst.reg] = c->last_scratch;
3370 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3371 }
3372
3373 for (int i = 0 ; i < 3; i++) {
3374 src_reg *src = &inst->src[i];
3375
3376 if (src->file == GRF && src->reladdr &&
3377 scratch_loc[src->reg] == -1) {
3378 scratch_loc[src->reg] = c->last_scratch;
3379 c->last_scratch += this->virtual_grf_sizes[src->reg];
3380 }
3381 }
3382 }
3383
3384 /* Now, for anything that will be accessed through scratch, rewrite
3385 * it to load/store. Note that this is a _safe list walk, because
3386 * we may generate a new scratch_write instruction after the one
3387 * we're processing.
3388 */
3389 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3390 /* Set up the annotation tracking for newly generated instructions. */
3391 base_ir = inst->ir;
3392 current_annotation = inst->annotation;
3393
3394 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3395 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3396 }
3397
3398 for (int i = 0 ; i < 3; i++) {
3399 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3400 continue;
3401
3402 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3403
3404 emit_scratch_read(block, inst, temp, inst->src[i],
3405 scratch_loc[inst->src[i].reg]);
3406
3407 inst->src[i].file = temp.file;
3408 inst->src[i].reg = temp.reg;
3409 inst->src[i].reg_offset = temp.reg_offset;
3410 inst->src[i].reladdr = NULL;
3411 }
3412 }
3413 }
3414
3415 /**
3416 * Emits an instruction before @inst to load the value named by @orig_src
3417 * from the pull constant buffer (surface) at @base_offset to @temp.
3418 */
3419 void
3420 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3421 dst_reg temp, src_reg orig_src,
3422 int base_offset)
3423 {
3424 int reg_offset = base_offset + orig_src.reg_offset;
3425 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3426 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3427 reg_offset);
3428 vec4_instruction *load;
3429
3430 if (brw->gen >= 7) {
3431 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3432 grf_offset.type = offset.type;
3433 emit_before(block, inst, MOV(grf_offset, offset));
3434
3435 load = new(mem_ctx) vec4_instruction(this,
3436 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3437 temp, index, src_reg(grf_offset));
3438 } else {
3439 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3440 temp, index, offset);
3441 load->base_mrf = 14;
3442 load->mlen = 1;
3443 }
3444 emit_before(block, inst, load);
3445 }
3446
3447 /**
3448 * Implements array access of uniforms by inserting a
3449 * PULL_CONSTANT_LOAD instruction.
3450 *
3451 * Unlike temporary GRF array access (which we don't support due to
3452 * the difficulty of doing relative addressing on instruction
3453 * destinations), we could potentially do array access of uniforms
3454 * that were loaded in GRF space as push constants. In real-world
3455 * usage we've seen, though, the arrays being used are always larger
3456 * than we could load as push constants, so just always move all
3457 * uniform array access out to a pull constant buffer.
3458 */
3459 void
3460 vec4_visitor::move_uniform_array_access_to_pull_constants()
3461 {
3462 int pull_constant_loc[this->uniforms];
3463 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3464 bool nested_reladdr;
3465
3466 /* Walk through and find array access of uniforms. Put a copy of that
3467 * uniform in the pull constant buffer.
3468 *
3469 * Note that we don't move constant-indexed accesses to arrays. No
3470 * testing has been done of the performance impact of this choice.
3471 */
3472 do {
3473 nested_reladdr = false;
3474
3475 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3476 for (int i = 0 ; i < 3; i++) {
3477 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3478 continue;
3479
3480 int uniform = inst->src[i].reg;
3481
3482 if (inst->src[i].reladdr->reladdr)
3483 nested_reladdr = true; /* will need another pass */
3484
3485 /* If this array isn't already present in the pull constant buffer,
3486 * add it.
3487 */
3488 if (pull_constant_loc[uniform] == -1) {
3489 const gl_constant_value **values =
3490 &stage_prog_data->param[uniform * 4];
3491
3492 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3493
3494 assert(uniform < uniform_array_size);
3495 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3496 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3497 = values[j];
3498 }
3499 }
3500
3501 /* Set up the annotation tracking for newly generated instructions. */
3502 base_ir = inst->ir;
3503 current_annotation = inst->annotation;
3504
3505 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3506
3507 emit_pull_constant_load(block, inst, temp, inst->src[i],
3508 pull_constant_loc[uniform]);
3509
3510 inst->src[i].file = temp.file;
3511 inst->src[i].reg = temp.reg;
3512 inst->src[i].reg_offset = temp.reg_offset;
3513 inst->src[i].reladdr = NULL;
3514 }
3515 }
3516 } while (nested_reladdr);
3517
3518 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3519 * no need to track them as larger-than-vec4 objects. This will be
3520 * relied on in cutting out unused uniform vectors from push
3521 * constants.
3522 */
3523 split_uniform_registers();
3524 }
3525
3526 void
3527 vec4_visitor::resolve_ud_negate(src_reg *reg)
3528 {
3529 if (reg->type != BRW_REGISTER_TYPE_UD ||
3530 !reg->negate)
3531 return;
3532
3533 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3534 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3535 *reg = temp;
3536 }
3537
3538 vec4_visitor::vec4_visitor(struct brw_context *brw,
3539 struct brw_vec4_compile *c,
3540 struct gl_program *prog,
3541 const struct brw_vec4_prog_key *key,
3542 struct brw_vec4_prog_data *prog_data,
3543 struct gl_shader_program *shader_prog,
3544 gl_shader_stage stage,
3545 void *mem_ctx,
3546 bool debug_flag,
3547 bool no_spills,
3548 shader_time_shader_type st_base,
3549 shader_time_shader_type st_written,
3550 shader_time_shader_type st_reset)
3551 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3552 c(c),
3553 key(key),
3554 prog_data(prog_data),
3555 sanity_param_count(0),
3556 fail_msg(NULL),
3557 first_non_payload_grf(0),
3558 need_all_constants_in_pull_buffer(false),
3559 debug_flag(debug_flag),
3560 no_spills(no_spills),
3561 st_base(st_base),
3562 st_written(st_written),
3563 st_reset(st_reset)
3564 {
3565 this->mem_ctx = mem_ctx;
3566 this->failed = false;
3567
3568 this->base_ir = NULL;
3569 this->current_annotation = NULL;
3570 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3571
3572 this->variable_ht = hash_table_ctor(0,
3573 hash_table_pointer_hash,
3574 hash_table_pointer_compare);
3575
3576 this->virtual_grf_start = NULL;
3577 this->virtual_grf_end = NULL;
3578 this->virtual_grf_sizes = NULL;
3579 this->virtual_grf_count = 0;
3580 this->virtual_grf_reg_map = NULL;
3581 this->virtual_grf_reg_count = 0;
3582 this->virtual_grf_array_size = 0;
3583 this->live_intervals = NULL;
3584
3585 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3586
3587 this->uniforms = 0;
3588
3589 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3590 * at least one. See setup_uniforms() in brw_vec4.cpp.
3591 */
3592 this->uniform_array_size = 1;
3593 if (prog_data) {
3594 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3595 }
3596
3597 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3598 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3599 }
3600
3601 vec4_visitor::~vec4_visitor()
3602 {
3603 hash_table_dtor(this->variable_ht);
3604 }
3605
3606
3607 void
3608 vec4_visitor::fail(const char *format, ...)
3609 {
3610 va_list va;
3611 char *msg;
3612
3613 if (failed)
3614 return;
3615
3616 failed = true;
3617
3618 va_start(va, format);
3619 msg = ralloc_vasprintf(mem_ctx, format, va);
3620 va_end(va);
3621 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3622
3623 this->fail_msg = msg;
3624
3625 if (debug_flag) {
3626 fprintf(stderr, "%s", msg);
3627 }
3628 }
3629
3630 } /* namespace brw */