i965/vec4: Print "VS" or "GS" when compiles fail, not "vec4".
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
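/* Illustrative example (hypothetical GLSL, not from this file): for
 * `fma(u_scale, pos, bias)` where `u_scale` is a uniform, the uniform operand
 * is first copied into a temporary GRF by the VEC4_OPCODE_UNPACK_UNIFORM above,
 * and the three-source MAD then reads the temporary, since the replicating
 * <0;4,1> region the uniform would otherwise need cannot be encoded in a
 * three-source instruction.
 */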
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
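/* Illustrative consequence (hypothetical GLSL): on Gen6, a partially masked
 * result such as `dst.xy = exp2(v)` is computed into a fresh vec4 temporary
 * and then MOV'd into dst with the .xy writemask, because the align1 MATH
 * instruction cannot honor a writemask itself.  On Gen4/5 math is a message,
 * hence the base_mrf/mlen setup instead.
 */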
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 *     w z          y          x w z          y          x
414 *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
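/* Worked example (illustrative): packing vec2(1.0, -2.0).  F32TO16 leaves
 * 0x00003c00 in the x channel and 0x0000c000 in the y channel (upper words
 * cleared by the undocumented behavior noted above).  The SHL of .y by 16
 * gives 0xc0000000, and the final OR with .x gives 0xc0003c00, matching
 * packHalf2x16(vec2(1.0, -2.0)).
 */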
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
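/* Worked example (illustrative): unpacking 0xc0003c00.  The AND with 0xffff
 * leaves 0x3c00 in x, the SHR by 16 leaves 0xc000 in y, and F16TO32 then
 * yields vec2(1.0, -2.0) -- the inverse of the packHalf2x16 example above.
 */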
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
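/* The immediates 0x00, 0x60, 0x70 and 0x78 below are the 8-bit restricted
 * float (VF) encodings of 0.0, 8.0, 16.0 and 24.0, so the type-converting
 * MOV materializes exactly the UD shift counts <0, 8, 16, 24>.
 */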
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
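/* Worked example (illustrative): packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)).
 * The saturating MOV clamps to (0.0, 0.5, 1.0, 1.0), MUL by 255 gives
 * (0.0, 127.5, 255.0, 255.0), RNDE rounds half-to-even to (0, 128, 255, 255),
 * and VEC4_OPCODE_PACK_BYTES packs the low byte of each channel with x in
 * the least significant byte, giving 0xffff8000.
 */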
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
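/* Illustrative sizes under this scheme: a float, vec2 or vec4 each occupy one
 * vec4 slot, a mat3 takes three (one per column), a float[4] array takes four,
 * and a struct { vec3 v; float f; } takes two.
 */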
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been setup by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 bool mul_negate = false, mul_abs = false;
1165 if (mul && mul->operation == ir_unop_abs) {
1166 mul = mul->operands[0]->as_expression();
1167 mul_abs = true;
1168 } else if (mul && mul->operation == ir_unop_neg) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_negate = true;
1171 }
1172
1173 if (!mul || mul->operation != ir_binop_mul) {
1174 nonmul = ir->operands[0];
1175 mul = ir->operands[1]->as_expression();
1176
1177 if (mul && mul->operation == ir_unop_abs) {
1178 mul = mul->operands[0]->as_expression();
1179 mul_abs = true;
1180 } else if (mul && mul->operation == ir_unop_neg) {
1181 mul = mul->operands[0]->as_expression();
1182 mul_negate = true;
1183 }
1184
1185 if (!mul || mul->operation != ir_binop_mul)
1186 return false;
1187 }
1188
1189 nonmul->accept(this);
1190 src_reg src0 = fix_3src_operand(this->result);
1191
1192 mul->operands[0]->accept(this);
1193 src_reg src1 = fix_3src_operand(this->result);
1194 src1.negate ^= mul_negate;
1195 src1.abs = mul_abs;
1196 if (mul_abs)
1197 src1.negate = false;
1198
1199 mul->operands[1]->accept(this);
1200 src_reg src2 = fix_3src_operand(this->result);
1201 src2.abs = mul_abs;
1202 if (mul_abs)
1203 src2.negate = false;
1204
1205 this->result = src_reg(this, ir->type);
1206 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1207
1208 return true;
1209 }
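/* Illustrative example (hypothetical GLSL): for `a * b + c` or `c + a * b`
 * this emits a single MAD dst, c, a, b -- the first source is the addend --
 * instead of a separate MUL and ADD.  An abs() or negate wrapped around the
 * multiply is folded into the MAD's source modifiers.
 */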
1210
1211 bool
1212 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1213 {
1214 /* This optimization relies on CMP setting the destination to 0 when
1215 * false. Early hardware only sets the least significant bit, and
1216 * leaves the other bits undefined. So we can't use it.
1217 */
1218 if (brw->gen < 6)
1219 return false;
1220
1221 ir_expression *const cmp = ir->operands[0]->as_expression();
1222
1223 if (cmp == NULL)
1224 return false;
1225
1226 switch (cmp->operation) {
1227 case ir_binop_less:
1228 case ir_binop_greater:
1229 case ir_binop_lequal:
1230 case ir_binop_gequal:
1231 case ir_binop_equal:
1232 case ir_binop_nequal:
1233 break;
1234
1235 default:
1236 return false;
1237 }
1238
1239 cmp->operands[0]->accept(this);
1240 const src_reg cmp_src0 = this->result;
1241
1242 cmp->operands[1]->accept(this);
1243 const src_reg cmp_src1 = this->result;
1244
1245 this->result = src_reg(this, ir->type);
1246
1247 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1248 brw_conditional_for_comparison(cmp->operation)));
1249
1250 /* If the comparison is false, this->result will just happen to be zero.
1251 */
1252 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1253 this->result, src_reg(1.0f));
1254 inst->predicate = BRW_PREDICATE_NORMAL;
1255 inst->predicate_inverse = true;
1256
1257 return true;
1258 }
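/* Illustrative example (hypothetical GLSL): for `float(a < b)` this emits a
 * CMP that writes all-ones or zero per channel, then an inverse-predicated
 * SEL that replaces the all-ones channels with 1.0f, so no extra AND or
 * conversion is needed for the b2f.
 */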
1259
1260 void
1261 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (brw->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
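/* For example, min(a, b) on Gen6+ is a single SEL with the L conditional mod
 * (which keeps the smaller source), and max() uses the G conditional mod; on
 * Gen4/5 the same result takes the explicit CMP plus predicated SEL above.
 */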
1276
1277 void
1278 vec4_visitor::emit_lrp(const dst_reg &dst,
1279 const src_reg &x, const src_reg &y, const src_reg &a)
1280 {
1281 if (brw->gen >= 6) {
1282 /* Note that the instruction's argument order is reversed from GLSL
1283 * and the IR.
1284 */
1285 emit(LRP(dst,
1286 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1287 } else {
1288 /* Earlier generations don't support three source operations, so we
1289 * need to emit x*(1-a) + y*a.
1290 */
1291 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1292 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1293 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1294 y_times_a.writemask = dst.writemask;
1295 one_minus_a.writemask = dst.writemask;
1296 x_times_one_minus_a.writemask = dst.writemask;
1297
1298 emit(MUL(y_times_a, y, a));
1299 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1300 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1301 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1302 }
1303 }
1304
1305 void
1306 vec4_visitor::visit(ir_expression *ir)
1307 {
1308 unsigned int operand;
1309 src_reg op[Elements(ir->operands)];
1310 vec4_instruction *inst;
1311
1312 if (ir->operation == ir_binop_add) {
1313 if (try_emit_mad(ir))
1314 return;
1315 }
1316
1317 if (ir->operation == ir_unop_b2f) {
1318 if (try_emit_b2f_of_compare(ir))
1319 return;
1320 }
1321
1322 /* Storage for our result. Ideally for an assignment we'd be using
1323 * the actual storage for the result here, instead.
1324 */
1325 dst_reg result_dst(this, ir->type);
1326 src_reg result_src(result_dst);
1327
1328 if (ir->operation == ir_triop_csel) {
1329 ir->operands[1]->accept(this);
1330 op[1] = this->result;
1331 ir->operands[2]->accept(this);
1332 op[2] = this->result;
1333
1334 enum brw_predicate predicate;
1335 emit_bool_to_cond_code(ir->operands[0], &predicate);
1336 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1337 inst->predicate = predicate;
1338 this->result = result_src;
1339 return;
1340 }
1341
1342 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1343 this->result.file = BAD_FILE;
1344 ir->operands[operand]->accept(this);
1345 if (this->result.file == BAD_FILE) {
1346 fprintf(stderr, "Failed to get tree for expression operand:\n");
1347 ir->operands[operand]->fprint(stderr);
1348 exit(1);
1349 }
1350 op[operand] = this->result;
1351
1352 /* Matrix expression operands should have been broken down to vector
1353 * operations already.
1354 */
1355 assert(!ir->operands[operand]->type->is_matrix());
1356 }
1357
1358 /* If nothing special happens, this is the result. */
1359 this->result = result_src;
1360
1361 switch (ir->operation) {
1362 case ir_unop_logic_not:
1363 emit(NOT(result_dst, op[0]));
1364 break;
1365 case ir_unop_neg:
1366 op[0].negate = !op[0].negate;
1367 emit(MOV(result_dst, op[0]));
1368 break;
1369 case ir_unop_abs:
1370 op[0].abs = true;
1371 op[0].negate = false;
1372 emit(MOV(result_dst, op[0]));
1373 break;
1374
1375 case ir_unop_sign:
1376 if (ir->type->is_float()) {
1377 /* AND(val, 0x80000000) gives the sign bit.
1378 *
1379 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1380 * zero.
1381 */
1382 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1383
1384 op[0].type = BRW_REGISTER_TYPE_UD;
1385 result_dst.type = BRW_REGISTER_TYPE_UD;
1386 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1387
1388 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1389 inst->predicate = BRW_PREDICATE_NORMAL;
1390
1391 this->result.type = BRW_REGISTER_TYPE_F;
1392 } else {
1393 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1394 * -> non-negative val generates 0x00000000.
1395 * Predicated OR sets 1 if val is positive.
1396 */
1397 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1398
1399 emit(ASR(result_dst, op[0], src_reg(31)));
1400
1401 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1402 inst->predicate = BRW_PREDICATE_NORMAL;
1403 }
1404 break;
1405
1406 case ir_unop_rcp:
1407 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1408 break;
1409
1410 case ir_unop_exp2:
1411 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1412 break;
1413 case ir_unop_log2:
1414 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1415 break;
1416 case ir_unop_exp:
1417 case ir_unop_log:
1418 unreachable("not reached: should be handled by ir_explog_to_explog2");
1419 case ir_unop_sin:
1420 case ir_unop_sin_reduced:
1421 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1422 break;
1423 case ir_unop_cos:
1424 case ir_unop_cos_reduced:
1425 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1426 break;
1427
1428 case ir_unop_dFdx:
1429 case ir_unop_dFdx_coarse:
1430 case ir_unop_dFdx_fine:
1431 case ir_unop_dFdy:
1432 case ir_unop_dFdy_coarse:
1433 case ir_unop_dFdy_fine:
1434 unreachable("derivatives not valid in vertex shader");
1435
1436 case ir_unop_bitfield_reverse:
1437 emit(BFREV(result_dst, op[0]));
1438 break;
1439 case ir_unop_bit_count:
1440 emit(CBIT(result_dst, op[0]));
1441 break;
1442 case ir_unop_find_msb: {
1443 src_reg temp = src_reg(this, glsl_type::uint_type);
1444
1445 inst = emit(FBH(dst_reg(temp), op[0]));
1446 inst->dst.writemask = WRITEMASK_XYZW;
1447
1448 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1449 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1450 * subtract the result from 31 to convert the MSB count into an LSB count.
1451 */
1452
1453 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1454 temp.swizzle = BRW_SWIZZLE_NOOP;
1455 emit(MOV(result_dst, temp));
1456
1457 src_reg src_tmp = src_reg(result_dst);
1458 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1459
1460 src_tmp.negate = true;
1461 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1462 inst->predicate = BRW_PREDICATE_NORMAL;
1463 break;
1464 }
1465 case ir_unop_find_lsb:
1466 emit(FBL(result_dst, op[0]));
1467 break;
1468 case ir_unop_saturate:
1469 inst = emit(MOV(result_dst, op[0]));
1470 inst->saturate = true;
1471 break;
1472
1473 case ir_unop_noise:
1474 unreachable("not reached: should be handled by lower_noise");
1475
1476 case ir_binop_add:
1477 emit(ADD(result_dst, op[0], op[1]));
1478 break;
1479 case ir_binop_sub:
1480 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1481
1482 case ir_binop_mul:
1483 if (brw->gen < 8 && ir->type->is_integer()) {
1484 /* For integer multiplication, the MUL uses the low 16 bits of one of
1485 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1486 * accumulates in the contribution of the upper 16 bits of that
1487 * operand. If we can determine that one of the args is in the low
1488 * 16 bits, though, we can just emit a single MUL.
1489 */
1490 if (ir->operands[0]->is_uint16_constant()) {
1491 if (brw->gen < 7)
1492 emit(MUL(result_dst, op[0], op[1]));
1493 else
1494 emit(MUL(result_dst, op[1], op[0]));
1495 } else if (ir->operands[1]->is_uint16_constant()) {
1496 if (brw->gen < 7)
1497 emit(MUL(result_dst, op[1], op[0]));
1498 else
1499 emit(MUL(result_dst, op[0], op[1]));
1500 } else {
1501 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1502
1503 emit(MUL(acc, op[0], op[1]));
1504 emit(MACH(dst_null_d(), op[0], op[1]));
1505 emit(MOV(result_dst, src_reg(acc)));
1506 }
1507 } else {
1508 emit(MUL(result_dst, op[0], op[1]));
1509 }
1510 break;
1511 case ir_binop_imul_high: {
1512 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1513
1514 emit(MUL(acc, op[0], op[1]));
1515 emit(MACH(result_dst, op[0], op[1]));
1516 break;
1517 }
1518 case ir_binop_div:
1519 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1520 assert(ir->type->is_integer());
1521 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1522 break;
1523 case ir_binop_carry: {
1524 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1525
1526 emit(ADDC(dst_null_ud(), op[0], op[1]));
1527 emit(MOV(result_dst, src_reg(acc)));
1528 break;
1529 }
1530 case ir_binop_borrow: {
1531 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1532
1533 emit(SUBB(dst_null_ud(), op[0], op[1]));
1534 emit(MOV(result_dst, src_reg(acc)));
1535 break;
1536 }
1537 case ir_binop_mod:
1538 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1539 assert(ir->type->is_integer());
1540 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1541 break;
1542
1543 case ir_binop_less:
1544 case ir_binop_greater:
1545 case ir_binop_lequal:
1546 case ir_binop_gequal:
1547 case ir_binop_equal:
1548 case ir_binop_nequal: {
1549 if (brw->gen <= 5) {
1550 resolve_bool_comparison(ir->operands[0], &op[0]);
1551 resolve_bool_comparison(ir->operands[1], &op[1]);
1552 }
1553 emit(CMP(result_dst, op[0], op[1],
1554 brw_conditional_for_comparison(ir->operation)));
1555 break;
1556 }
1557
1558 case ir_binop_all_equal:
1559 /* "==" operator producing a scalar boolean. */
1560 if (ir->operands[0]->type->is_vector() ||
1561 ir->operands[1]->type->is_vector()) {
1562 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1563 emit(MOV(result_dst, src_reg(0)));
1564 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1565 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1566 } else {
1567 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1568 }
1569 break;
1570 case ir_binop_any_nequal:
1571 /* "!=" operator producing a scalar boolean. */
1572 if (ir->operands[0]->type->is_vector() ||
1573 ir->operands[1]->type->is_vector()) {
1574 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1575
1576 emit(MOV(result_dst, src_reg(0)));
1577 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1578 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1579 } else {
1580 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1581 }
1582 break;
1583
1584 case ir_unop_any:
1585 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1586 emit(MOV(result_dst, src_reg(0)));
1587
1588 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1589 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1590 break;
1591
1592 case ir_binop_logic_xor:
1593 emit(XOR(result_dst, op[0], op[1]));
1594 break;
1595
1596 case ir_binop_logic_or:
1597 emit(OR(result_dst, op[0], op[1]));
1598 break;
1599
1600 case ir_binop_logic_and:
1601 emit(AND(result_dst, op[0], op[1]));
1602 break;
1603
1604 case ir_binop_dot:
1605 assert(ir->operands[0]->type->is_vector());
1606 assert(ir->operands[0]->type == ir->operands[1]->type);
1607 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1608 break;
1609
1610 case ir_unop_sqrt:
1611 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1612 break;
1613 case ir_unop_rsq:
1614 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1615 break;
1616
1617 case ir_unop_bitcast_i2f:
1618 case ir_unop_bitcast_u2f:
1619 this->result = op[0];
1620 this->result.type = BRW_REGISTER_TYPE_F;
1621 break;
1622
1623 case ir_unop_bitcast_f2i:
1624 this->result = op[0];
1625 this->result.type = BRW_REGISTER_TYPE_D;
1626 break;
1627
1628 case ir_unop_bitcast_f2u:
1629 this->result = op[0];
1630 this->result.type = BRW_REGISTER_TYPE_UD;
1631 break;
1632
1633 case ir_unop_i2f:
1634 case ir_unop_i2u:
1635 case ir_unop_u2i:
1636 case ir_unop_u2f:
1637 case ir_unop_f2i:
1638 case ir_unop_f2u:
1639 emit(MOV(result_dst, op[0]));
1640 break;
1641 case ir_unop_b2i:
1642 emit(AND(result_dst, op[0], src_reg(1)));
1643 break;
1644 case ir_unop_b2f:
1645 if (brw->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 }
1648 op[0].type = BRW_REGISTER_TYPE_D;
1649 result_dst.type = BRW_REGISTER_TYPE_D;
1650 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1651 result_dst.type = BRW_REGISTER_TYPE_F;
1652 break;
1653 case ir_unop_f2b:
1654 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1655 break;
1656 case ir_unop_i2b:
1657 emit(AND(result_dst, op[0], src_reg(1)));
1658 break;
1659
1660 case ir_unop_trunc:
1661 emit(RNDZ(result_dst, op[0]));
1662 break;
1663 case ir_unop_ceil: {
1664 src_reg tmp = src_reg(this, ir->type);
1665 op[0].negate = !op[0].negate;
1666 emit(RNDD(dst_reg(tmp), op[0]));
1667 tmp.negate = true;
1668 emit(MOV(result_dst, tmp));
1669 }
1670 break;
1671 case ir_unop_floor:
1672 inst = emit(RNDD(result_dst, op[0]));
1673 break;
1674 case ir_unop_fract:
1675 inst = emit(FRC(result_dst, op[0]));
1676 break;
1677 case ir_unop_round_even:
1678 emit(RNDE(result_dst, op[0]));
1679 break;
1680
1681 case ir_binop_min:
1682 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1683 break;
1684 case ir_binop_max:
1685 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1686 break;
1687
1688 case ir_binop_pow:
1689 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1690 break;
1691
1692 case ir_unop_bit_not:
1693 inst = emit(NOT(result_dst, op[0]));
1694 break;
1695 case ir_binop_bit_and:
1696 inst = emit(AND(result_dst, op[0], op[1]));
1697 break;
1698 case ir_binop_bit_xor:
1699 inst = emit(XOR(result_dst, op[0], op[1]));
1700 break;
1701 case ir_binop_bit_or:
1702 inst = emit(OR(result_dst, op[0], op[1]));
1703 break;
1704
1705 case ir_binop_lshift:
1706 inst = emit(SHL(result_dst, op[0], op[1]));
1707 break;
1708
1709 case ir_binop_rshift:
1710 if (ir->type->base_type == GLSL_TYPE_INT)
1711 inst = emit(ASR(result_dst, op[0], op[1]));
1712 else
1713 inst = emit(SHR(result_dst, op[0], op[1]));
1714 break;
1715
1716 case ir_binop_bfm:
1717 emit(BFI1(result_dst, op[0], op[1]));
1718 break;
1719
1720 case ir_binop_ubo_load: {
1721 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1722 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1723 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1724 src_reg offset;
1725
1726 /* Now, load the vector from that offset. */
1727 assert(ir->type->is_vector() || ir->type->is_scalar());
1728
1729 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1730 packed_consts.type = result.type;
1731 src_reg surf_index;
1732
1733 if (const_uniform_block) {
1734 /* The block index is a constant, so just emit the binding table entry
1735 * as an immediate.
1736 */
1737 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1738 const_uniform_block->value.u[0]);
1739 } else {
1740 /* The block index is not a constant. Evaluate the index expression
1741 * per-channel and add the base UBO index; the generator will select
1742 * a value from any live channel.
1743 */
1744 surf_index = src_reg(this, glsl_type::uint_type);
1745 emit(ADD(dst_reg(surf_index), op[0],
1746 src_reg(prog_data->base.binding_table.ubo_start)));
1747
1748 /* Assume this may touch any UBO. It would be nice to provide
1749 * a tighter bound, but the array information is already lowered away.
1750 */
1751 brw_mark_surface_used(&prog_data->base,
1752 prog_data->base.binding_table.ubo_start +
1753 shader_prog->NumUniformBlocks - 1);
1754 }
1755
1756 if (const_offset_ir) {
1757 if (brw->gen >= 8) {
1758 /* Store the offset in a GRF so we can send-from-GRF. */
1759 offset = src_reg(this, glsl_type::int_type);
1760 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1761 } else {
1762 /* Immediates are fine on older generations since they'll be moved
1763 * to a (potentially fake) MRF at the generator level.
1764 */
1765 offset = src_reg(const_offset / 16);
1766 }
1767 } else {
1768 offset = src_reg(this, glsl_type::uint_type);
1769 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1770 }
1771
1772 if (brw->gen >= 7) {
1773 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1774 grf_offset.type = offset.type;
1775
1776 emit(MOV(grf_offset, offset));
1777
1778 vec4_instruction *pull =
1779 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1780 dst_reg(packed_consts),
1781 surf_index,
1782 src_reg(grf_offset)));
1783 pull->mlen = 1;
1784 } else {
1785 vec4_instruction *pull =
1786 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1787 dst_reg(packed_consts),
1788 surf_index,
1789 offset));
1790 pull->base_mrf = 14;
1791 pull->mlen = 1;
1792 }
1793
1794 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1795 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1796 const_offset % 16 / 4,
1797 const_offset % 16 / 4,
1798 const_offset % 16 / 4);
1799
1800 /* UBO bools are any nonzero int. We need to convert them to use the
1801 * value of true stored in ctx->Const.UniformBooleanTrue.
1802 */
1803 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1804 emit(CMP(result_dst, packed_consts, src_reg(0u),
1805 BRW_CONDITIONAL_NZ));
1806 } else {
1807 emit(MOV(result_dst, packed_consts));
1808 }
1809 break;
1810 }
1811
1812 case ir_binop_vector_extract:
1813 unreachable("should have been lowered by vec_index_to_cond_assign");
1814
1815 case ir_triop_fma:
1816 op[0] = fix_3src_operand(op[0]);
1817 op[1] = fix_3src_operand(op[1]);
1818 op[2] = fix_3src_operand(op[2]);
1819 /* Note that the instruction's argument order is reversed from GLSL
1820 * and the IR.
1821 */
1822 emit(MAD(result_dst, op[2], op[1], op[0]));
1823 break;
1824
1825 case ir_triop_lrp:
1826 emit_lrp(result_dst, op[0], op[1], op[2]);
1827 break;
1828
1829 case ir_triop_csel:
1830 unreachable("already handled above");
1831 break;
1832
1833 case ir_triop_bfi:
1834 op[0] = fix_3src_operand(op[0]);
1835 op[1] = fix_3src_operand(op[1]);
1836 op[2] = fix_3src_operand(op[2]);
1837 emit(BFI2(result_dst, op[0], op[1], op[2]));
1838 break;
1839
1840 case ir_triop_bitfield_extract:
1841 op[0] = fix_3src_operand(op[0]);
1842 op[1] = fix_3src_operand(op[1]);
1843 op[2] = fix_3src_operand(op[2]);
1844 /* Note that the instruction's argument order is reversed from GLSL
1845 * and the IR.
1846 */
1847 emit(BFE(result_dst, op[2], op[1], op[0]));
1848 break;
1849
1850 case ir_triop_vector_insert:
1851 unreachable("should have been lowered by lower_vector_insert");
1852
1853 case ir_quadop_bitfield_insert:
1854 unreachable("not reached: should be handled by "
1855 "bitfield_insert_to_bfm_bfi\n");
1856
1857 case ir_quadop_vector:
1858 unreachable("not reached: should be handled by lower_quadop_vector");
1859
1860 case ir_unop_pack_half_2x16:
1861 emit_pack_half_2x16(result_dst, op[0]);
1862 break;
1863 case ir_unop_unpack_half_2x16:
1864 emit_unpack_half_2x16(result_dst, op[0]);
1865 break;
1866 case ir_unop_unpack_unorm_4x8:
1867 emit_unpack_unorm_4x8(result_dst, op[0]);
1868 break;
1869 case ir_unop_unpack_snorm_4x8:
1870 emit_unpack_snorm_4x8(result_dst, op[0]);
1871 break;
1872 case ir_unop_pack_unorm_4x8:
1873 emit_pack_unorm_4x8(result_dst, op[0]);
1874 break;
1875 case ir_unop_pack_snorm_4x8:
1876 emit_pack_snorm_4x8(result_dst, op[0]);
1877 break;
1878 case ir_unop_pack_snorm_2x16:
1879 case ir_unop_pack_unorm_2x16:
1880 case ir_unop_unpack_snorm_2x16:
1881 case ir_unop_unpack_unorm_2x16:
1882 unreachable("not reached: should be handled by lower_packing_builtins");
1883 case ir_unop_unpack_half_2x16_split_x:
1884 case ir_unop_unpack_half_2x16_split_y:
1885 case ir_binop_pack_half_2x16_split:
1886 case ir_unop_interpolate_at_centroid:
1887 case ir_binop_interpolate_at_sample:
1888 case ir_binop_interpolate_at_offset:
1889 unreachable("not reached: should not occur in vertex shader");
1890 case ir_binop_ldexp:
1891 unreachable("not reached: should be handled by ldexp_to_arith()");
1892 }
1893 }
1894
1895
1896 void
1897 vec4_visitor::visit(ir_swizzle *ir)
1898 {
1899 src_reg src;
1900 int i = 0;
1901 int swizzle[4];
1902
1903 /* Note that this is only swizzles in expressions, not those on the left
1904 * hand side of an assignment, which do write masking. See ir_assignment
1905 * for that.
1906 */
1907
1908 ir->val->accept(this);
1909 src = this->result;
1910 assert(src.file != BAD_FILE);
1911
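/* Compose the IR swizzle with whatever swizzle is already on the source:
 * result channel i reads the source channel that src.swizzle assigns to
 * the i-th component named by ir->mask.
 */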
1912 for (i = 0; i < ir->type->vector_elements; i++) {
1913 switch (i) {
1914 case 0:
1915 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1916 break;
1917 case 1:
1918 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1919 break;
1920 case 2:
1921 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1922 break;
1923 case 3:
1924 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1925 break;
1926 }
1927 }
1928 for (; i < 4; i++) {
1929 /* Replicate the last channel out. */
1930 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1931 }
1932
1933 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1934
1935 this->result = src;
1936 }
1937
1938 void
1939 vec4_visitor::visit(ir_dereference_variable *ir)
1940 {
1941 const struct glsl_type *type = ir->type;
1942 dst_reg *reg = variable_storage(ir->var);
1943
1944 if (!reg) {
1945 fail("Failed to find variable storage for %s\n", ir->var->name);
1946 this->result = src_reg(brw_null_reg());
1947 return;
1948 }
1949
1950 this->result = src_reg(*reg);
1951
1952 /* System values get their swizzle from the dst_reg writemask */
1953 if (ir->var->data.mode == ir_var_system_value)
1954 return;
1955
1956 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1957 this->result.swizzle = swizzle_for_size(type->vector_elements);
1958 }
1959
1960
1961 int
1962 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1963 {
1964 /* Under normal circumstances array elements are stored consecutively, so
1965 * the stride is equal to the size of the array element.
1966 */
1967 return type_size(ir->type);
1968 }
1969
1970
1971 void
1972 vec4_visitor::visit(ir_dereference_array *ir)
1973 {
1974 ir_constant *constant_index;
1975 src_reg src;
1976 int array_stride = compute_array_stride(ir);
1977
1978 constant_index = ir->array_index->constant_expression_value();
1979
1980 ir->array->accept(this);
1981 src = this->result;
1982
1983 if (constant_index) {
1984 src.reg_offset += constant_index->value.i[0] * array_stride;
1985 } else {
1986 /* Variable index array dereference. It eats the "vec4" of the
1987 * base of the array and an index that offsets the Mesa register
1988 * index.
1989 */
1990 ir->array_index->accept(this);
1991
1992 src_reg index_reg;
1993
1994 if (array_stride == 1) {
1995 index_reg = this->result;
1996 } else {
1997 index_reg = src_reg(this, glsl_type::int_type);
1998
1999 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2000 }
2001
2002 if (src.reladdr) {
2003 src_reg temp = src_reg(this, glsl_type::int_type);
2004
2005 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2006
2007 index_reg = temp;
2008 }
2009
2010 src.reladdr = ralloc(mem_ctx, src_reg);
2011 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2012 }
2013
2014 /* If the type is smaller than a vec4, replicate the last channel out. */
2015 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2016 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2017 else
2018 src.swizzle = BRW_SWIZZLE_NOOP;
2019 src.type = brw_type_for_base_type(ir->type);
2020
2021 this->result = src;
2022 }
2023
2024 void
2025 vec4_visitor::visit(ir_dereference_record *ir)
2026 {
2027 unsigned int i;
2028 const glsl_type *struct_type = ir->record->type;
2029 int offset = 0;
2030
2031 ir->record->accept(this);
2032
2033 for (i = 0; i < struct_type->length; i++) {
2034 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2035 break;
2036 offset += type_size(struct_type->fields.structure[i].type);
2037 }
2038
2039 /* If the type is smaller than a vec4, replicate the last channel out. */
2040 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2041 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2042 else
2043 this->result.swizzle = BRW_SWIZZLE_NOOP;
2044 this->result.type = brw_type_for_base_type(ir->type);
2045
2046 this->result.reg_offset += offset;
2047 }
2048
2049 /**
2050 * We want to be careful in assignment setup to hit the actual storage
2051 * instead of potentially using a temporary like we might with the
2052 * ir_dereference handler.
2053 */
2054 static dst_reg
2055 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2056 {
2057 /* The LHS must be a dereference. If the LHS is a variable indexed array
2058 * access of a vector, it must be separated into a series of conditional moves
2059 * before reaching this point (see ir_vec_index_to_cond_assign).
2060 */
2061 assert(ir->as_dereference());
2062 ir_dereference_array *deref_array = ir->as_dereference_array();
2063 if (deref_array) {
2064 assert(!deref_array->array->type->is_vector());
2065 }
2066
2067 /* Use the rvalue deref handler for the most part. We'll ignore
2068 * swizzles in it and write swizzles using writemask, though.
2069 */
2070 ir->accept(v);
2071 return dst_reg(v->result);
2072 }
2073
2074 void
2075 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2076 const struct glsl_type *type,
2077 enum brw_predicate predicate)
2078 {
2079 if (type->base_type == GLSL_TYPE_STRUCT) {
2080 for (unsigned int i = 0; i < type->length; i++) {
2081 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2082 }
2083 return;
2084 }
2085
2086 if (type->is_array()) {
2087 for (unsigned int i = 0; i < type->length; i++) {
2088 emit_block_move(dst, src, type->fields.array, predicate);
2089 }
2090 return;
2091 }
2092
2093 if (type->is_matrix()) {
2094 const struct glsl_type *vec_type;
2095
2096 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2097 type->vector_elements, 1);
2098
2099 for (int i = 0; i < type->matrix_columns; i++) {
2100 emit_block_move(dst, src, vec_type, predicate);
2101 }
2102 return;
2103 }
2104
2105 assert(type->is_scalar() || type->is_vector());
2106
2107 dst->type = brw_type_for_base_type(type);
2108 src->type = dst->type;
2109
2110 dst->writemask = (1 << type->vector_elements) - 1;
2111
2112 src->swizzle = swizzle_for_size(type->vector_elements);
2113
2114 vec4_instruction *inst = emit(MOV(*dst, *src));
2115 inst->predicate = predicate;
2116
2117 dst->reg_offset++;
2118 src->reg_offset++;
2119 }
2120
2121
2122 /* If the RHS processing resulted in an instruction generating a
2123 * temporary value, and it would be easy to rewrite the instruction to
2124 * generate its result right into the LHS instead, do so. This ends
2125 * up reliably removing instructions where it can be tricky to do so
2126 * later without real UD chain information.
2127 */
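/* For example, for "v = a + b;" the ADD first lands in a fresh temporary;
 * if the conditions below hold, the ADD's destination is rewritten to be
 * v's storage, and the copy MOV that visit(ir_assignment) would otherwise
 * emit is never needed.
 */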
2128 bool
2129 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2130 dst_reg dst,
2131 src_reg src,
2132 vec4_instruction *pre_rhs_inst,
2133 vec4_instruction *last_rhs_inst)
2134 {
2135 /* This could be supported, but it would take more smarts. */
2136 if (ir->condition)
2137 return false;
2138
2139 if (pre_rhs_inst == last_rhs_inst)
2140 return false; /* No instructions generated to work with. */
2141
2142 /* Make sure the last instruction generated our source reg. */
2143 if (src.file != GRF ||
2144 src.file != last_rhs_inst->dst.file ||
2145 src.reg != last_rhs_inst->dst.reg ||
2146 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2147 src.reladdr ||
2148 src.abs ||
2149 src.negate ||
2150 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2151 return false;
2152
2153 /* Check that the last instruction fully initialized the channels
2154 * we want to use, in the order we want to use them. We could
2155 * potentially reswizzle the operands of many instructions so that
2156 * we could handle out of order channels, but don't yet.
2157 */
2158
2159 for (unsigned i = 0; i < 4; i++) {
2160 if (dst.writemask & (1 << i)) {
2161 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2162 return false;
2163
2164 if (BRW_GET_SWZ(src.swizzle, i) != i)
2165 return false;
2166 }
2167 }
2168
2169 /* Success! Rewrite the instruction. */
2170 last_rhs_inst->dst.file = dst.file;
2171 last_rhs_inst->dst.reg = dst.reg;
2172 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2173 last_rhs_inst->dst.reladdr = dst.reladdr;
2174 last_rhs_inst->dst.writemask &= dst.writemask;
2175
2176 return true;
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_assignment *ir)
2181 {
2182 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2183 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2184
2185 if (!ir->lhs->type->is_scalar() &&
2186 !ir->lhs->type->is_vector()) {
2187 ir->rhs->accept(this);
2188 src_reg src = this->result;
2189
2190 if (ir->condition) {
2191 emit_bool_to_cond_code(ir->condition, &predicate);
2192 }
2193
2194 /* emit_block_move doesn't account for swizzles in the source register.
2195 * This should be ok, since the source register is a structure or an
2196 * array, and those can't be swizzled. But double-check to be sure.
2197 */
2198 assert(src.swizzle ==
2199 (ir->rhs->type->is_matrix()
2200 ? swizzle_for_size(ir->rhs->type->vector_elements)
2201 : BRW_SWIZZLE_NOOP));
2202
2203 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2204 return;
2205 }
2206
2207 /* Now we're down to just a scalar/vector with writemasks. */
2208 int i;
2209
2210 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2211 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2212
2213 ir->rhs->accept(this);
2214
2215 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2216
2217 src_reg src = this->result;
2218
2219 int swizzles[4];
2220 int first_enabled_chan = 0;
2221 int src_chan = 0;
2222
2223 assert(ir->lhs->type->is_vector() ||
2224 ir->lhs->type->is_scalar());
2225 dst.writemask = ir->write_mask;
2226
2227 for (int i = 0; i < 4; i++) {
2228 if (dst.writemask & (1 << i)) {
2229 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2230 break;
2231 }
2232 }
2233
2234 /* Swizzle a small RHS vector into the channels being written.
2235 *
2236 * GLSL IR treats write_mask as dictating how many channels are
2237 * present on the RHS, while in our instructions we need to make
2238 * those channels appear in the slots of the vec4 they're written to.
2239 */
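/* For example, a vec2 RHS assigned through a .yz write mask takes RHS
 * channels 0 and 1 into slots y and z; the unwritten slots just replicate
 * first_enabled_chan, since they are masked off anyway.
 */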
2240 for (int i = 0; i < 4; i++) {
2241 if (dst.writemask & (1 << i))
2242 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2243 else
2244 swizzles[i] = first_enabled_chan;
2245 }
2246 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2247 swizzles[2], swizzles[3]);
2248
2249 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2250 return;
2251 }
2252
2253 if (ir->condition) {
2254 emit_bool_to_cond_code(ir->condition, &predicate);
2255 }
2256
2257 for (i = 0; i < type_size(ir->lhs->type); i++) {
2258 vec4_instruction *inst = emit(MOV(dst, src));
2259 inst->predicate = predicate;
2260
2261 dst.reg_offset++;
2262 src.reg_offset++;
2263 }
2264 }
2265
2266 void
2267 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2268 {
2269 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2270 foreach_in_list(ir_constant, field_value, &ir->components) {
2271 emit_constant_values(dst, field_value);
2272 }
2273 return;
2274 }
2275
2276 if (ir->type->is_array()) {
2277 for (unsigned int i = 0; i < ir->type->length; i++) {
2278 emit_constant_values(dst, ir->array_elements[i]);
2279 }
2280 return;
2281 }
2282
2283 if (ir->type->is_matrix()) {
2284 for (int i = 0; i < ir->type->matrix_columns; i++) {
2285 float *vec = &ir->value.f[i * ir->type->vector_elements];
2286
2287 for (int j = 0; j < ir->type->vector_elements; j++) {
2288 dst->writemask = 1 << j;
2289 dst->type = BRW_REGISTER_TYPE_F;
2290
2291 emit(MOV(*dst, src_reg(vec[j])));
2292 }
2293 dst->reg_offset++;
2294 }
2295 return;
2296 }
2297
2298 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2299
2300 for (int i = 0; i < ir->type->vector_elements; i++) {
2301 if (!(remaining_writemask & (1 << i)))
2302 continue;
2303
2304 dst->writemask = 1 << i;
2305 dst->type = brw_type_for_base_type(ir->type);
2306
2307 /* Find other components that match the one we're about to
2308 * write. Emits fewer instructions for things like vec4(0.5,
2309 * 1.5, 1.5, 1.5).
2310 */
2311 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2312 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2313 if (ir->value.b[i] == ir->value.b[j])
2314 dst->writemask |= (1 << j);
2315 } else {
2316 /* u, i, and f storage all line up, so no need for a
2317 * switch case for comparing each type.
2318 */
2319 if (ir->value.u[i] == ir->value.u[j])
2320 dst->writemask |= (1 << j);
2321 }
2322 }
2323
2324 switch (ir->type->base_type) {
2325 case GLSL_TYPE_FLOAT:
2326 emit(MOV(*dst, src_reg(ir->value.f[i])));
2327 break;
2328 case GLSL_TYPE_INT:
2329 emit(MOV(*dst, src_reg(ir->value.i[i])));
2330 break;
2331 case GLSL_TYPE_UINT:
2332 emit(MOV(*dst, src_reg(ir->value.u[i])));
2333 break;
2334 case GLSL_TYPE_BOOL:
2335 emit(MOV(*dst,
2336 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2337 : 0)));
2338 break;
2339 default:
2340 unreachable("Non-float/uint/int/bool constant");
2341 }
2342
2343 remaining_writemask &= ~dst->writemask;
2344 }
2345 dst->reg_offset++;
2346 }
2347
2348 void
2349 vec4_visitor::visit(ir_constant *ir)
2350 {
2351 dst_reg dst = dst_reg(this, ir->type);
2352 this->result = src_reg(dst);
2353
2354 emit_constant_values(&dst, ir);
2355 }
2356
2357 void
2358 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2359 {
2360 ir_dereference *deref = static_cast<ir_dereference *>(
2361 ir->actual_parameters.get_head());
2362 ir_variable *location = deref->variable_referenced();
2363 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2364 location->data.binding);
2365
2366 /* Calculate the surface offset */
2367 src_reg offset(this, glsl_type::uint_type);
2368 ir_dereference_array *deref_array = deref->as_dereference_array();
2369 if (deref_array) {
2370 deref_array->array_index->accept(this);
2371
2372 src_reg tmp(this, glsl_type::uint_type);
2373 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2374 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2375 } else {
2376 offset = location->data.atomic.offset;
2377 }
2378
2379 /* Emit the appropriate machine instruction */
2380 const char *callee = ir->callee->function_name();
2381 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2382
2383 if (!strcmp("__intrinsic_atomic_read", callee)) {
2384 emit_untyped_surface_read(surf_index, dst, offset);
2385
2386 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2387 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2388 src_reg(), src_reg());
2389
2390 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2391 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2392 src_reg(), src_reg());
2393 }
2394 }
2395
2396 void
2397 vec4_visitor::visit(ir_call *ir)
2398 {
2399 const char *callee = ir->callee->function_name();
2400
2401 if (!strcmp("__intrinsic_atomic_read", callee) ||
2402 !strcmp("__intrinsic_atomic_increment", callee) ||
2403 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2404 visit_atomic_counter_intrinsic(ir);
2405 } else {
2406 unreachable("Unsupported intrinsic.");
2407 }
2408 }
2409
2410 src_reg
2411 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2412 {
2413 vec4_instruction *inst =
2414 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2415 dst_reg(this, glsl_type::uvec4_type));
2416 inst->base_mrf = 2;
2417 inst->mlen = 1;
2418 inst->src[1] = sampler;
2419
2420 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2421 int param_base = inst->base_mrf;
2422 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2423 int zero_mask = 0xf & ~coord_mask;
2424
2425 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2426 coordinate));
2427
2428 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2429 src_reg(0)));
2430
2431 emit(inst);
2432 return src_reg(inst->dst);
2433 }
2434
2435 static bool
2436 is_high_sampler(struct brw_context *brw, src_reg sampler)
2437 {
2438 if (brw->gen < 8 && !brw->is_haswell)
2439 return false;
2440
2441 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2442 }
2443
2444 void
2445 vec4_visitor::visit(ir_texture *ir)
2446 {
2447 uint32_t sampler =
2448 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2449
2450 ir_rvalue *nonconst_sampler_index =
2451 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2452
2453 /* Handle non-constant sampler array indexing */
2454 src_reg sampler_reg;
2455 if (nonconst_sampler_index) {
2456 /* The highest sampler which may be used by this operation is
2457 * the last element of the array. Mark it here, because the generator
2458 * doesn't have enough information to determine the bound.
2459 */
2460 uint32_t array_size = ir->sampler->as_dereference_array()
2461 ->array->type->array_size();
2462
2463 uint32_t max_used = sampler + array_size - 1;
2464 if (ir->op == ir_tg4 && brw->gen < 8) {
2465 max_used += prog_data->base.binding_table.gather_texture_start;
2466 } else {
2467 max_used += prog_data->base.binding_table.texture_start;
2468 }
2469
2470 brw_mark_surface_used(&prog_data->base, max_used);
2471
2472 /* Emit code to evaluate the actual indexing expression */
2473 nonconst_sampler_index->accept(this);
2474 dst_reg temp(this, glsl_type::uint_type);
2475 emit(ADD(temp, this->result, src_reg(sampler)))
2476 ->force_writemask_all = true;
2477 sampler_reg = src_reg(temp);
2478 } else {
2479 /* Single sampler, or constant array index; the indexing expression
2480 * is just an immediate.
2481 */
2482 sampler_reg = src_reg(sampler);
2483 }
2484
2485 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2486 * emitting anything other than setting up the constant result.
2487 */
2488 if (ir->op == ir_tg4) {
2489 ir_constant *chan = ir->lod_info.component->as_constant();
2490 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2491 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2492 dst_reg result(this, ir->type);
2493 this->result = src_reg(result);
2494 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2495 return;
2496 }
2497 }
2498
2499 /* Should be lowered by do_lower_texture_projection */
2500 assert(!ir->projector);
2501
2502 /* Should be lowered */
2503 assert(!ir->offset || !ir->offset->type->is_array());
2504
2505 /* Generate code to compute all the subexpression trees. This has to be
2506 * done before loading any values into MRFs for the sampler message since
2507 * generating these values may involve SEND messages that need the MRFs.
2508 */
2509 src_reg coordinate;
2510 if (ir->coordinate) {
2511 ir->coordinate->accept(this);
2512 coordinate = this->result;
2513 }
2514
2515 src_reg shadow_comparitor;
2516 if (ir->shadow_comparitor) {
2517 ir->shadow_comparitor->accept(this);
2518 shadow_comparitor = this->result;
2519 }
2520
2521 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2522 src_reg offset_value;
2523 if (has_nonconstant_offset) {
2524 ir->offset->accept(this);
2525 offset_value = src_reg(this->result);
2526 }
2527
2528 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2529 src_reg lod, dPdx, dPdy, sample_index, mcs;
2530 switch (ir->op) {
2531 case ir_tex:
2532 lod = src_reg(0.0f);
2533 lod_type = glsl_type::float_type;
2534 break;
2535 case ir_txf:
2536 case ir_txl:
2537 case ir_txs:
2538 ir->lod_info.lod->accept(this);
2539 lod = this->result;
2540 lod_type = ir->lod_info.lod->type;
2541 break;
2542 case ir_query_levels:
2543 lod = src_reg(0);
2544 lod_type = glsl_type::int_type;
2545 break;
2546 case ir_txf_ms:
2547 ir->lod_info.sample_index->accept(this);
2548 sample_index = this->result;
2549 sample_index_type = ir->lod_info.sample_index->type;
2550
2551 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2552 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2553 else
2554 mcs = src_reg(0u);
2555 break;
2556 case ir_txd:
2557 ir->lod_info.grad.dPdx->accept(this);
2558 dPdx = this->result;
2559
2560 ir->lod_info.grad.dPdy->accept(this);
2561 dPdy = this->result;
2562
2563 lod_type = ir->lod_info.grad.dPdx->type;
2564 break;
2565 case ir_txb:
2566 case ir_lod:
2567 case ir_tg4:
2568 break;
2569 }
2570
2571 enum opcode opcode;
2572 switch (ir->op) {
2573 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2574 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2575 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2576 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2577 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2578 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2579 case ir_tg4: opcode = has_nonconstant_offset
2580 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2581 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2582 case ir_txb:
2583 unreachable("TXB is not valid for vertex shaders.");
2584 case ir_lod:
2585 unreachable("LOD is not valid for vertex shaders.");
2586 default:
2587 unreachable("Unrecognized tex op");
2588 }
2589
2590 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2591 opcode, dst_reg(this, ir->type));
2592
2593 if (ir->offset != NULL && !has_nonconstant_offset) {
2594 inst->offset =
2595 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2596 ir->offset->type->vector_elements);
2597 }
2598
2599 /* Stuff the channel select bits in the top of the texture offset */
2600 if (ir->op == ir_tg4)
2601 inst->offset |= gather_channel(ir, sampler) << 16;
2602
2603 /* The message header is necessary for:
2604 * - Gen4 (always)
2605 * - Gen9+ for selecting SIMD4x2
2606 * - Texel offsets
2607 * - Gather channel selection
2608 * - Sampler indices too large to fit in a 4-bit value.
2609 */
2610 inst->header_present =
2611 brw->gen < 5 || brw->gen >= 9 ||
2612 inst->offset != 0 || ir->op == ir_tg4 ||
2613 is_high_sampler(brw, sampler_reg);
2614 inst->base_mrf = 2;
2615 inst->mlen = inst->header_present + 1; /* always at least one */
2616 inst->dst.writemask = WRITEMASK_XYZW;
2617 inst->shadow_compare = ir->shadow_comparitor != NULL;
2618
2619 inst->src[1] = sampler_reg;
2620
2621 /* MRF for the first parameter */
2622 int param_base = inst->base_mrf + inst->header_present;
2623
2624 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2625 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2626 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2627 } else {
2628 /* Load the coordinate */
2629 /* FINISHME: gl_clamp_mask and saturate */
2630 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2631 int zero_mask = 0xf & ~coord_mask;
2632
2633 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2634 coordinate));
2635
2636 if (zero_mask != 0) {
2637 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2638 src_reg(0)));
2639 }
2640 /* Load the shadow comparitor */
2641 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2642 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2643 WRITEMASK_X),
2644 shadow_comparitor));
2645 inst->mlen++;
2646 }
2647
2648 /* Load the LOD info */
2649 if (ir->op == ir_tex || ir->op == ir_txl) {
2650 int mrf, writemask;
2651 if (brw->gen >= 5) {
2652 mrf = param_base + 1;
2653 if (ir->shadow_comparitor) {
2654 writemask = WRITEMASK_Y;
2655 /* mlen already incremented */
2656 } else {
2657 writemask = WRITEMASK_X;
2658 inst->mlen++;
2659 }
2660 } else /* brw->gen == 4 */ {
2661 mrf = param_base;
2662 writemask = WRITEMASK_W;
2663 }
2664 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2665 } else if (ir->op == ir_txf) {
2666 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2667 } else if (ir->op == ir_txf_ms) {
2668 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2669 sample_index));
2670 if (brw->gen >= 7) {
2671 /* MCS data is in the first channel of `mcs`, but we need to get it into
2672 * the .y channel of the second vec4 of params, so replicate .x across
2673 * the whole vec4 and then mask off everything except .y
2674 */
2675 mcs.swizzle = BRW_SWIZZLE_XXXX;
2676 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2677 mcs));
2678 }
2679 inst->mlen++;
2680 } else if (ir->op == ir_txd) {
2681 const glsl_type *type = lod_type;
2682
2683 if (brw->gen >= 5) {
2684 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2685 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2686 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2687 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2688 inst->mlen++;
2689
2690 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2691 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2692 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2693 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2694 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2695 inst->mlen++;
2696
2697 if (ir->shadow_comparitor) {
2698 emit(MOV(dst_reg(MRF, param_base + 2,
2699 ir->shadow_comparitor->type, WRITEMASK_Z),
2700 shadow_comparitor));
2701 }
2702 }
2703 } else /* brw->gen == 4 */ {
2704 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2705 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2706 inst->mlen += 2;
2707 }
2708 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2709 if (ir->shadow_comparitor) {
2710 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2711 shadow_comparitor));
2712 }
2713
2714 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2715 offset_value));
2716 inst->mlen++;
2717 }
2718 }
2719
2720 emit(inst);
2721
2722 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2723 * spec requires layers.
2724 */
2725 if (ir->op == ir_txs) {
2726 glsl_type const *type = ir->sampler->type;
2727 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2728 type->sampler_array) {
2729 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2730 writemask(inst->dst, WRITEMASK_Z),
2731 src_reg(inst->dst), src_reg(6));
2732 }
2733 }
2734
2735 if (brw->gen == 6 && ir->op == ir_tg4) {
2736 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2737 }
2738
2739 swizzle_result(ir, src_reg(inst->dst), sampler);
2740 }
2741
2742 /**
2743 * Apply workarounds for Gen6 gather with UINT/SINT
2744 */
2745 void
2746 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2747 {
2748 if (!wa)
2749 return;
2750
2751 int width = (wa & WA_8BIT) ? 8 : 16;
2752 dst_reg dst_f = dst;
2753 dst_f.type = BRW_REGISTER_TYPE_F;
2754
2755 /* Convert from UNORM to UINT */
2756 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2757 emit(MOV(dst, src_reg(dst_f)));
2758
2759 if (wa & WA_SIGN) {
2760 /* Reinterpret the UINT value as a signed INT value by
2761 * shifting the sign bit into place, then shifting back
2762 * preserving sign.
2763 */
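/* e.g. for an 8-bit value: 0x000000ab << 24 = 0xab000000, and the
 * arithmetic shift right by 24 then copies bit 7 back down, yielding
 * 0xffffffab for a value that was negative.
 */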
2764 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2765 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2766 }
2767 }
2768
2769 /**
2770 * Set up the gather channel based on the swizzle, for gather4.
2771 */
2772 uint32_t
2773 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2774 {
2775 ir_constant *chan = ir->lod_info.component->as_constant();
2776 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2777 switch (swiz) {
2778 case SWIZZLE_X: return 0;
2779 case SWIZZLE_Y:
2780 /* gather4 sampler is broken for green channel on RG32F --
2781 * we must ask for blue instead.
2782 */
2783 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2784 return 2;
2785 return 1;
2786 case SWIZZLE_Z: return 2;
2787 case SWIZZLE_W: return 3;
2788 default:
2789 unreachable("Not reached"); /* zero, one swizzles handled already */
2790 }
2791 }
2792
2793 void
2794 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2795 {
2796 int s = key->tex.swizzles[sampler];
2797
2798 this->result = src_reg(this, ir->type);
2799 dst_reg swizzled_result(this->result);
2800
2801 if (ir->op == ir_query_levels) {
2802 /* # levels is in .w */
2803 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2804 emit(MOV(swizzled_result, orig_val));
2805 return;
2806 }
2807
2808 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2809 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2810 emit(MOV(swizzled_result, orig_val));
2811 return;
2812 }
2813
2814
2815 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2816 int swizzle[4] = {0};
2817
2818 for (int i = 0; i < 4; i++) {
2819 switch (GET_SWZ(s, i)) {
2820 case SWIZZLE_ZERO:
2821 zero_mask |= (1 << i);
2822 break;
2823 case SWIZZLE_ONE:
2824 one_mask |= (1 << i);
2825 break;
2826 default:
2827 copy_mask |= (1 << i);
2828 swizzle[i] = GET_SWZ(s, i);
2829 break;
2830 }
2831 }
2832
2833 if (copy_mask) {
2834 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2835 swizzled_result.writemask = copy_mask;
2836 emit(MOV(swizzled_result, orig_val));
2837 }
2838
2839 if (zero_mask) {
2840 swizzled_result.writemask = zero_mask;
2841 emit(MOV(swizzled_result, src_reg(0.0f)));
2842 }
2843
2844 if (one_mask) {
2845 swizzled_result.writemask = one_mask;
2846 emit(MOV(swizzled_result, src_reg(1.0f)));
2847 }
2848 }
2849
2850 void
2851 vec4_visitor::visit(ir_return *)
2852 {
2853 unreachable("not reached");
2854 }
2855
2856 void
2857 vec4_visitor::visit(ir_discard *)
2858 {
2859 unreachable("not reached");
2860 }
2861
2862 void
2863 vec4_visitor::visit(ir_if *ir)
2864 {
2865 /* Don't point the annotation at the if statement, because then it plus
2866 * the then and else blocks get printed.
2867 */
2868 this->base_ir = ir->condition;
2869
2870 if (brw->gen == 6) {
2871 emit_if_gen6(ir);
2872 } else {
2873 enum brw_predicate predicate;
2874 emit_bool_to_cond_code(ir->condition, &predicate);
2875 emit(IF(predicate));
2876 }
2877
2878 visit_instructions(&ir->then_instructions);
2879
2880 if (!ir->else_instructions.is_empty()) {
2881 this->base_ir = ir->condition;
2882 emit(BRW_OPCODE_ELSE);
2883
2884 visit_instructions(&ir->else_instructions);
2885 }
2886
2887 this->base_ir = ir->condition;
2888 emit(BRW_OPCODE_ENDIF);
2889 }
2890
2891 void
2892 vec4_visitor::visit(ir_emit_vertex *)
2893 {
2894 unreachable("not reached");
2895 }
2896
2897 void
2898 vec4_visitor::visit(ir_end_primitive *)
2899 {
2900 unreachable("not reached");
2901 }
2902
2903 void
2904 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2905 dst_reg dst, src_reg offset,
2906 src_reg src0, src_reg src1)
2907 {
2908 unsigned mlen = 0;
2909
2910 /* Set the atomic operation offset. */
2911 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2912 mlen++;
2913
2914 /* Set the atomic operation arguments. */
2915 if (src0.file != BAD_FILE) {
2916 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2917 mlen++;
2918 }
2919
2920 if (src1.file != BAD_FILE) {
2921 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2922 mlen++;
2923 }
2924
2925 /* Emit the instruction. Note that this maps to the normal SIMD8
2926 * untyped atomic message on Ivy Bridge, but that's OK because
2927 * unused channels will be masked out.
2928 */
2929 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2930 src_reg(atomic_op), src_reg(surf_index));
2931 inst->base_mrf = 0;
2932 inst->mlen = mlen;
2933 }
2934
2935 void
2936 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2937 src_reg offset)
2938 {
2939 /* Set the surface read offset. */
2940 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2941
2942 /* Emit the instruction. Note that this maps to the normal SIMD8
2943 * untyped surface read message, but that's OK because unused
2944 * channels will be masked out.
2945 */
2946 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2947 dst, src_reg(surf_index));
2948 inst->base_mrf = 0;
2949 inst->mlen = 1;
2950 }
2951
2952 void
2953 vec4_visitor::emit_ndc_computation()
2954 {
2955 /* Get the position */
2956 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2957
2958 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2959 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2960 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2961
2962 current_annotation = "NDC";
2963 dst_reg ndc_w = ndc;
2964 ndc_w.writemask = WRITEMASK_W;
2965 src_reg pos_w = pos;
2966 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2967 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2968
2969 dst_reg ndc_xyz = ndc;
2970 ndc_xyz.writemask = WRITEMASK_XYZ;
2971
2972 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2973 }
2974
2975 void
2976 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2977 {
2978 if (brw->gen < 6 &&
2979 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2980 key->userclip_active || brw->has_negative_rhw_bug)) {
2981 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2982 dst_reg header1_w = header1;
2983 header1_w.writemask = WRITEMASK_W;
2984
2985 emit(MOV(header1, 0u));
2986
2987 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2988 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2989
2990 current_annotation = "Point size";
2991 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2992 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2993 }
2994
2995 if (key->userclip_active) {
2996 current_annotation = "Clipping flags";
2997 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2998 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2999
3000 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3001 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3002 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3003
3004 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3005 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3006 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3007 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3008 }
3009
3010 /* i965 clipping workaround:
3011 * 1) Test for -ve rhw
3012 * 2) If set,
3013 * set ndc = (0,0,0,0)
3014 * set ucp[6] = 1
3015 *
3016 * Later, clipping will detect ucp[6] and ensure the primitive is
3017 * clipped against all fixed planes.
3018 */
3019 if (brw->has_negative_rhw_bug) {
3020 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3021 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3022 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3023 vec4_instruction *inst;
3024 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3025 inst->predicate = BRW_PREDICATE_NORMAL;
3026 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3027 inst->predicate = BRW_PREDICATE_NORMAL;
3028 }
3029
3030 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3031 } else if (brw->gen < 6) {
3032 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3033 } else {
3034 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3035 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3036 dst_reg reg_w = reg;
3037 reg_w.writemask = WRITEMASK_W;
3038 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3039 }
3040 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3041 dst_reg reg_y = reg;
3042 reg_y.writemask = WRITEMASK_Y;
3043 reg_y.type = BRW_REGISTER_TYPE_D;
3044 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3045 }
3046 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3047 dst_reg reg_z = reg;
3048 reg_z.writemask = WRITEMASK_Z;
3049 reg_z.type = BRW_REGISTER_TYPE_D;
3050 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3051 }
3052 }
3053 }
3054
3055 void
3056 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3057 {
3058 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3059 *
3060 * "If a linked set of shaders forming the vertex stage contains no
3061 * static write to gl_ClipVertex or gl_ClipDistance, but the
3062 * application has requested clipping against user clip planes through
3063 * the API, then the coordinate written to gl_Position is used for
3064 * comparison against the user clip planes."
3065 *
3066 * This function is only called if the shader didn't write to
3067 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3068 * if the user wrote to it; otherwise we use gl_Position.
3069 */
3070 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3071 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3072 clip_vertex = VARYING_SLOT_POS;
3073 }
3074
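/* Each iteration emits one DP4: component i of this clip-distance slot
 * gets the dot product of the clip vertex with user clip plane i + offset.
 */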
3075 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3076 ++i) {
3077 reg.writemask = 1 << i;
3078 emit(DP4(reg,
3079 src_reg(output_reg[clip_vertex]),
3080 src_reg(this->userplane[i + offset])));
3081 }
3082 }
3083
3084 vec4_instruction *
3085 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3086 {
3087 assert (varying < VARYING_SLOT_MAX);
3088 reg.type = output_reg[varying].type;
3089 current_annotation = output_reg_annotation[varying];
3090 /* Copy the register, saturating if necessary */
3091 return emit(MOV(reg, src_reg(output_reg[varying])));
3092 }
3093
3094 void
3095 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3096 {
3097 reg.type = BRW_REGISTER_TYPE_F;
3098
3099 switch (varying) {
3100 case VARYING_SLOT_PSIZ:
3101 {
3102 /* PSIZ is always in slot 0, and is coupled with other flags. */
3103 current_annotation = "indices, point width, clip flags";
3104 emit_psiz_and_flags(reg);
3105 break;
3106 }
3107 case BRW_VARYING_SLOT_NDC:
3108 current_annotation = "NDC";
3109 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3110 break;
3111 case VARYING_SLOT_POS:
3112 current_annotation = "gl_Position";
3113 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3114 break;
3115 case VARYING_SLOT_EDGE:
3116 /* This is present when doing unfilled polygons. We're supposed to copy
3117 * the edge flag from the user-provided vertex array
3118 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3119 * of that attribute (starts as 1.0f). This is then used in clipping to
3120 * determine which edges should be drawn as wireframe.
3121 */
3122 current_annotation = "edge flag";
3123 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3124 glsl_type::float_type, WRITEMASK_XYZW))));
3125 break;
3126 case BRW_VARYING_SLOT_PAD:
3127 /* No need to write to this slot */
3128 break;
3129 case VARYING_SLOT_COL0:
3130 case VARYING_SLOT_COL1:
3131 case VARYING_SLOT_BFC0:
3132 case VARYING_SLOT_BFC1: {
3133 /* These built-in varyings are only supported in compatibility mode,
3134 * and we only support GS in core profile. So, this must be a vertex
3135 * shader.
3136 */
3137 assert(stage == MESA_SHADER_VERTEX);
3138 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3139 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3140 inst->saturate = true;
3141 break;
3142 }
3143
3144 default:
3145 emit_generic_urb_slot(reg, varying);
3146 break;
3147 }
3148 }
3149
3150 static int
3151 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3152 {
3153 if (brw->gen >= 6) {
3154 /* URB data written (does not include the message header reg) must
3155 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3156 * section 5.4.3.2.2: URB_INTERLEAVED.
3157 *
3158 * URB entries are allocated on a multiple of 1024 bits, so an
3159 * extra 128 bits written here to make the end align to 256 is
3160 * no problem.
3161 */
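/* mlen includes the single message header register, so the data length
 * is mlen - 1; rounding mlen up to an odd value keeps the data length
 * even (a multiple of 2 registers / 256 bits).
 */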
3162 if ((mlen % 2) != 1)
3163 mlen++;
3164 }
3165
3166 return mlen;
3167 }
3168
3169
3170 /**
3171 * Generates the VUE payload plus the necessary URB write instructions to
3172 * output it.
3173 *
3174 * The VUE layout is documented in Volume 2a.
3175 */
3176 void
3177 vec4_visitor::emit_vertex()
3178 {
3179 /* MRF 0 is reserved for the debugger, so start with message header
3180 * in MRF 1.
3181 */
3182 int base_mrf = 1;
3183 int mrf = base_mrf;
3184 /* In the process of generating our URB write message contents, we
3185 * may need to unspill a register or load from an array. Those
3186 * reads would use MRFs 14-15.
3187 */
3188 int max_usable_mrf = 13;
3189
3190 /* The following assertion verifies that max_usable_mrf yields an even
3191 * number of URB write data registers, which meets gen6's length
3192 * alignment requirements.
3193 */
3194 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3195
3196 /* First mrf is the g0-based message header containing URB handles and
3197 * such.
3198 */
3199 emit_urb_write_header(mrf++);
3200
3201 if (brw->gen < 6) {
3202 emit_ndc_computation();
3203 }
3204
3205 /* Lower legacy ff and ClipVertex clipping to clip distances */
3206 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3207 current_annotation = "user clip distances";
3208
3209 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3210 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3211
3212 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3213 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3214 }
3215
3216 /* We may need to split this up into several URB writes, so do them in a
3217 * loop.
3218 */
3219 int slot = 0;
3220 bool complete = false;
3221 do {
3222 /* URB offset is in URB row increments, and each of our MRFs is half of
3223 * one of those, since we're doing interleaved writes.
3224 */
3225 int offset = slot / 2;
3226
3227 mrf = base_mrf + 1;
3228 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3229 emit_urb_slot(dst_reg(MRF, mrf++),
3230 prog_data->vue_map.slot_to_varying[slot]);
3231
3232 /* If this was max_usable_mrf, we can't fit anything more into this
3233 * URB WRITE.
3234 */
3235 if (mrf > max_usable_mrf) {
3236 slot++;
3237 break;
3238 }
3239 }
3240
3241 complete = slot >= prog_data->vue_map.num_slots;
3242 current_annotation = "URB write";
3243 vec4_instruction *inst = emit_urb_write_opcode(complete);
3244 inst->base_mrf = base_mrf;
3245 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3246 inst->offset += offset;
3247 } while(!complete);
3248 }
3249
3250
3251 src_reg
3252 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3253 src_reg *reladdr, int reg_offset)
3254 {
3255 /* Because we store the values to scratch interleaved like our
3256 * vertex data, we need to scale the vec4 index by 2.
3257 */
3258 int message_header_scale = 2;
3259
3260 /* Pre-gen6, the message header uses byte offsets instead of vec4
3261 * (16-byte) offset units.
3262 */
3263 if (brw->gen < 6)
3264 message_header_scale *= 16;
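/* Combined with the interleave factor above, the pre-gen6 scale is
 * 2 * 16 = 32, so the returned offset ends up in bytes rather than
 * vec4 units.
 */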
3265
3266 if (reladdr) {
3267 src_reg index = src_reg(this, glsl_type::int_type);
3268
3269 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3270 src_reg(reg_offset)));
3271 emit_before(block, inst, MUL(dst_reg(index), index,
3272 src_reg(message_header_scale)));
3273
3274 return index;
3275 } else {
3276 return src_reg(reg_offset * message_header_scale);
3277 }
3278 }
3279
3280 src_reg
3281 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3282 src_reg *reladdr, int reg_offset)
3283 {
3284 if (reladdr) {
3285 src_reg index = src_reg(this, glsl_type::int_type);
3286
3287 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3288 src_reg(reg_offset)));
3289
3290 /* Pre-gen6, the message header uses byte offsets instead of vec4
3291 * (16-byte) offset units.
3292 */
3293 if (brw->gen < 6) {
3294 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3295 }
3296
3297 return index;
3298 } else if (brw->gen >= 8) {
3299 /* Store the offset in a GRF so we can send-from-GRF. */
3300 src_reg offset = src_reg(this, glsl_type::int_type);
3301 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3302 return offset;
3303 } else {
3304 int message_header_scale = brw->gen < 6 ? 16 : 1;
3305 return src_reg(reg_offset * message_header_scale);
3306 }
3307 }
3308
3309 /**
3310 * Emits an instruction before @inst to load the value named by @orig_src
3311 * from scratch space at @base_offset to @temp.
3312 *
3313 * @base_offset is measured in 32-byte units (the size of a register).
3314 */
3315 void
3316 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3317 dst_reg temp, src_reg orig_src,
3318 int base_offset)
3319 {
3320 int reg_offset = base_offset + orig_src.reg_offset;
3321 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3322 reg_offset);
3323
3324 emit_before(block, inst, SCRATCH_READ(temp, index));
3325 }
3326
3327 /**
3328 * Emits an instruction after @inst to store the value to be written
3329 * to @orig_dst to scratch space at @base_offset, from @temp.
3330 *
3331 * @base_offset is measured in 32-byte units (the size of a register).
3332 */
3333 void
3334 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3335 int base_offset)
3336 {
3337 int reg_offset = base_offset + inst->dst.reg_offset;
3338 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3339 reg_offset);
3340
3341 /* Create a temporary register to store *inst's result in.
3342 *
3343 * We have to be careful in MOVing from our temporary result register in
3344 * the scratch write. If we swizzle from channels of the temporary that
3345 * weren't initialized, it will confuse live interval analysis, which will
3346 * make spilling fail to make progress.
3347 */
3348 src_reg temp = src_reg(this, glsl_type::vec4_type);
3349 temp.type = inst->dst.type;
3350 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3351 int swizzles[4];
3352 for (int i = 0; i < 4; i++)
3353 if (inst->dst.writemask & (1 << i))
3354 swizzles[i] = i;
3355 else
3356 swizzles[i] = first_writemask_chan;
3357 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3358 swizzles[2], swizzles[3]);
3359
3360 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3361 inst->dst.writemask));
3362 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3363 write->predicate = inst->predicate;
3364 write->ir = inst->ir;
3365 write->annotation = inst->annotation;
3366 inst->insert_after(block, write);
3367
3368 inst->dst.file = temp.file;
3369 inst->dst.reg = temp.reg;
3370 inst->dst.reg_offset = temp.reg_offset;
3371 inst->dst.reladdr = NULL;
3372 }
3373
3374 /**
3375 * We can't generally support array access in GRF space, because a
3376 * single instruction's destination can only span 2 contiguous
3377 * registers. So, we send all GRF arrays that get variable index
3378 * access to scratch space.
3379 */
3380 void
3381 vec4_visitor::move_grf_array_access_to_scratch()
3382 {
3383 int scratch_loc[this->alloc.count];
3384 memset(scratch_loc, -1, sizeof(scratch_loc));
3385
3386 /* First, calculate the set of virtual GRFs that need to be punted
3387 * to scratch due to having any array access on them, and where in
3388 * scratch.
3389 */
3390 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3391 if (inst->dst.file == GRF && inst->dst.reladdr &&
3392 scratch_loc[inst->dst.reg] == -1) {
3393 scratch_loc[inst->dst.reg] = c->last_scratch;
3394 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3395 }
3396
3397 for (int i = 0 ; i < 3; i++) {
3398 src_reg *src = &inst->src[i];
3399
3400 if (src->file == GRF && src->reladdr &&
3401 scratch_loc[src->reg] == -1) {
3402 scratch_loc[src->reg] = c->last_scratch;
3403 c->last_scratch += this->alloc.sizes[src->reg];
3404 }
3405 }
3406 }
3407
3408 /* Now, for anything that will be accessed through scratch, rewrite
3409 * it to load/store. Note that this is a _safe list walk, because
3410 * we may generate a new scratch_write instruction after the one
3411 * we're processing.
3412 */
3413 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3414 /* Set up the annotation tracking for new generated instructions. */
3415 base_ir = inst->ir;
3416 current_annotation = inst->annotation;
3417
3418 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3419 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3420 }
3421
3422 for (int i = 0 ; i < 3; i++) {
3423 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3424 continue;
3425
3426 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3427
3428 emit_scratch_read(block, inst, temp, inst->src[i],
3429 scratch_loc[inst->src[i].reg]);
3430
3431 inst->src[i].file = temp.file;
3432 inst->src[i].reg = temp.reg;
3433 inst->src[i].reg_offset = temp.reg_offset;
3434 inst->src[i].reladdr = NULL;
3435 }
3436 }
3437 }
3438
3439 /**
3440 * Emits an instruction before @inst to load the value named by @orig_src
3441 * from the pull constant buffer (surface) at @base_offset to @temp.
3442 */
3443 void
3444 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3445 dst_reg temp, src_reg orig_src,
3446 int base_offset)
3447 {
3448 int reg_offset = base_offset + orig_src.reg_offset;
3449 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3450 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3451 reg_offset);
3452 vec4_instruction *load;
3453
3454 if (brw->gen >= 7) {
3455 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3456 grf_offset.type = offset.type;
3457 emit_before(block, inst, MOV(grf_offset, offset));
3458
3459 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3460 temp, index, src_reg(grf_offset));
3461 load->mlen = 1;
3462 } else {
3463 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3464 temp, index, offset);
3465 load->base_mrf = 14;
3466 load->mlen = 1;
3467 }
3468 emit_before(block, inst, load);
3469 }
3470
3471 /**
3472 * Implements array access of uniforms by inserting a
3473 * PULL_CONSTANT_LOAD instruction.
3474 *
3475 * Unlike temporary GRF array access (where we don't support it due to
3476 * the difficulty of doing relative addressing on instruction
3477 * destinations), we could potentially do array access of uniforms
3478 * that were loaded in GRF space as push constants. In real-world
3479 * usage we've seen, though, the arrays being used are always larger
3480 * than we could load as push constants, so just always move all
3481 * uniform array access out to a pull constant buffer.
3482 */
3483 void
3484 vec4_visitor::move_uniform_array_access_to_pull_constants()
3485 {
3486 int pull_constant_loc[this->uniforms];
3487 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3488 bool nested_reladdr;
3489
3490 /* Walk through and find array access of uniforms. Put a copy of that
3491 * uniform in the pull constant buffer.
3492 *
3493 * Note that we don't move constant-indexed accesses to arrays. No
3494 * testing has been done of the performance impact of this choice.
3495 */
3496 do {
3497 nested_reladdr = false;
3498
3499 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3500 for (int i = 0 ; i < 3; i++) {
3501 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3502 continue;
3503
3504 int uniform = inst->src[i].reg;
3505
3506 if (inst->src[i].reladdr->reladdr)
3507 nested_reladdr = true; /* will need another pass */
3508
3509 /* If this array isn't already present in the pull constant buffer,
3510 * add it.
3511 */
3512 if (pull_constant_loc[uniform] == -1) {
3513 const gl_constant_value **values =
3514 &stage_prog_data->param[uniform * 4];
3515
3516 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3517
3518 assert(uniform < uniform_array_size);
3519 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3520 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3521 = values[j];
3522 }
3523 }
3524
3525 /* Set up the annotation tracking for new generated instructions. */
3526 base_ir = inst->ir;
3527 current_annotation = inst->annotation;
3528
3529 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3530
3531 emit_pull_constant_load(block, inst, temp, inst->src[i],
3532 pull_constant_loc[uniform]);
3533
3534 inst->src[i].file = temp.file;
3535 inst->src[i].reg = temp.reg;
3536 inst->src[i].reg_offset = temp.reg_offset;
3537 inst->src[i].reladdr = NULL;
3538 }
3539 }
3540 } while (nested_reladdr);
3541
3542 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3543 * no need to track them as larger-than-vec4 objects. This will be
3544 * relied on in cutting out unused uniform vectors from push
3545 * constants.
3546 */
3547 split_uniform_registers();
3548 }
3549
3550 void
3551 vec4_visitor::resolve_ud_negate(src_reg *reg)
3552 {
3553 if (reg->type != BRW_REGISTER_TYPE_UD ||
3554 !reg->negate)
3555 return;
3556
3557 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3558 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3559 *reg = temp;
3560 }
3561
3562 /**
3563 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3564 *
3565 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3566 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3567 */
3568 void
3569 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3570 {
3571 assert(brw->gen <= 5);
3572
3573 if (!rvalue->type->is_boolean())
3574 return;
3575
3576 src_reg and_result = src_reg(this, rvalue->type);
3577 src_reg neg_result = src_reg(this, rvalue->type);
3578 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3579 emit(MOV(dst_reg(neg_result), negate(and_result)));
3580 *reg = neg_result;
3581 }
3582
3583 vec4_visitor::vec4_visitor(struct brw_context *brw,
3584 struct brw_vec4_compile *c,
3585 struct gl_program *prog,
3586 const struct brw_vue_prog_key *key,
3587 struct brw_vue_prog_data *prog_data,
3588 struct gl_shader_program *shader_prog,
3589 gl_shader_stage stage,
3590 void *mem_ctx,
3591 bool no_spills,
3592 shader_time_shader_type st_base,
3593 shader_time_shader_type st_written,
3594 shader_time_shader_type st_reset)
3595 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3596 c(c),
3597 key(key),
3598 prog_data(prog_data),
3599 sanity_param_count(0),
3600 fail_msg(NULL),
3601 first_non_payload_grf(0),
3602 need_all_constants_in_pull_buffer(false),
3603 no_spills(no_spills),
3604 st_base(st_base),
3605 st_written(st_written),
3606 st_reset(st_reset)
3607 {
3608 this->mem_ctx = mem_ctx;
3609 this->failed = false;
3610
3611 this->base_ir = NULL;
3612 this->current_annotation = NULL;
3613 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3614
3615 this->variable_ht = hash_table_ctor(0,
3616 hash_table_pointer_hash,
3617 hash_table_pointer_compare);
3618
3619 this->virtual_grf_start = NULL;
3620 this->virtual_grf_end = NULL;
3621 this->live_intervals = NULL;
3622
3623 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3624
3625 this->uniforms = 0;
3626
3627 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3628 * at least one. See setup_uniforms() in brw_vec4.cpp.
3629 */
3630 this->uniform_array_size = 1;
3631 if (prog_data) {
3632 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3633 }
3634
3635 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3636 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3637 }
3638
3639 vec4_visitor::~vec4_visitor()
3640 {
3641 hash_table_dtor(this->variable_ht);
3642 }
3643
3644
3645 void
3646 vec4_visitor::fail(const char *format, ...)
3647 {
3648 va_list va;
3649 char *msg;
3650
3651 if (failed)
3652 return;
3653
3654 failed = true;
3655
3656 va_start(va, format);
3657 msg = ralloc_vasprintf(mem_ctx, format, va);
3658 va_end(va);
3659 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3660
3661 this->fail_msg = msg;
3662
3663 if (debug_enabled) {
3664 fprintf(stderr, "%s", msg);
3665 }
3666 }
3667
3668 } /* namespace brw */