i965: Handle scratch accesses where reladdr also points to scratch space
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
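/* The ALU1/ALU2/ALU3 macros below stamp out one builder method per opcode.
 * Each builder only allocates a vec4_instruction; it does not add it to the
 * instruction stream, so callers wrap the result in emit(), e.g.
 * emit(ADD(dst, a, b)). ALU2_ACC additionally marks the instruction as
 * writing the accumulator, and ALU3 asserts gen >= 6, where three-source
 * instructions first appear.
 */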
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
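/* Builders for gen4-style scratch space messages. They only record the
 * opcode, the base MRF and the message length; the actual payload (header,
 * offset and, for writes, the data) is assembled into those MRFs later by
 * the generator.
 */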
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
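/* Emit a dot product of the requested width: elements must be 2, 3 or 4,
 * selecting DP2, DP3 or DP4 respectively.
 */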
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
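/* Emit a math instruction, working around per-generation restrictions:
 * operands are legalized with fix_math_operand(), gen6 MATH cannot honor a
 * partial writemask (so we compute into a temporary and MOV the result),
 * and pre-gen6 math is a send that needs a base MRF and message length.
 */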
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * The upper word of each write-channel must be 0 for the following
417 * bit-shift and bit-or instructions to work. Note that this relies on
418 * the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
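/* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
 * scale by 255, round to nearest even, convert to unsigned int, then pack
 * the low byte of each component into the destination dword.
 */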
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
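/* Return the size of a GLSL type in units of vec4 registers. Every scalar
 * or vector occupies a full vec4 slot, matrices take one slot per column,
 * and opaque types (samplers, atomic counters) take no storage at all.
 */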
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
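/* Upload the active user clip planes as push constants (one vec4 each) and
 * record where they landed in userplane[] so the clip distance code can
 * reference them later.
 */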
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been set up by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
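/* Evaluate a boolean rvalue and leave its result in the flag register,
 * returning through *predicate the predicate the caller should put on the
 * instruction that consumes it. Comparisons and logic ops are folded
 * directly into conditional-mod instructions where possible; anything else
 * is ANDed with 1 and tested against zero.
 */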
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (brw->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (brw->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (brw->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (brw->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (brw->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (brw->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (brw->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (brw->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
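/* Allocate backing storage for a variable the first time it is visited and
 * record it in variable_ht. Shader inputs map to the ATTR file, outputs and
 * temporaries get fresh GRFs, and uniforms get UNIFORM-file registers whose
 * values are gathered into stage_prog_data->param[].
 */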
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
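/* Try to fuse an add with a multiply operand (possibly under a negate or
 * abs) into a single MAD. Only attempted for float expressions on gen6+,
 * where three-source instructions exist. Returns true if the MAD was
 * emitted, false if the caller should fall back to a plain ADD.
 */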
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (brw->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (brw->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
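/* Emit a min/max: emit_minmax(BRW_CONDITIONAL_L, dst, a, b) computes
 * min(a, b), and BRW_CONDITIONAL_GE computes max(a, b). On gen6+ this is a
 * single SEL with a conditional mod; earlier generations need an explicit
 * CMP followed by a predicated SEL.
 */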
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (brw->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
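/* Emit linear interpolation, x*(1-a) + y*a. Gen6+ has a native LRP
 * instruction (note its reversed argument order); earlier generations
 * expand it into a MUL/ADD/MUL/ADD sequence.
 */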
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (brw->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
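/* Main expression visitor: evaluate each operand into a src_reg, then
 * dispatch on the IR operation to emit the corresponding vec4
 * instruction(s), leaving the value in this->result.
 */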
1299 void
1300 vec4_visitor::visit(ir_expression *ir)
1301 {
1302 unsigned int operand;
1303 src_reg op[ARRAY_SIZE(ir->operands)];
1304 vec4_instruction *inst;
1305
1306 if (ir->operation == ir_binop_add) {
1307 if (try_emit_mad(ir))
1308 return;
1309 }
1310
1311 if (ir->operation == ir_unop_b2f) {
1312 if (try_emit_b2f_of_compare(ir))
1313 return;
1314 }
1315
1316 /* Storage for our result. Ideally for an assignment we'd be using
1317 * the actual storage for the result here, instead.
1318 */
1319 dst_reg result_dst(this, ir->type);
1320 src_reg result_src(result_dst);
1321
1322 if (ir->operation == ir_triop_csel) {
1323 ir->operands[1]->accept(this);
1324 op[1] = this->result;
1325 ir->operands[2]->accept(this);
1326 op[2] = this->result;
1327
1328 enum brw_predicate predicate;
1329 emit_bool_to_cond_code(ir->operands[0], &predicate);
1330 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1331 inst->predicate = predicate;
1332 this->result = result_src;
1333 return;
1334 }
1335
1336 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1337 this->result.file = BAD_FILE;
1338 ir->operands[operand]->accept(this);
1339 if (this->result.file == BAD_FILE) {
1340 fprintf(stderr, "Failed to get tree for expression operand:\n");
1341 ir->operands[operand]->fprint(stderr);
1342 exit(1);
1343 }
1344 op[operand] = this->result;
1345
1346 /* Matrix expression operands should have been broken down to vector
1347 * operations already.
1348 */
1349 assert(!ir->operands[operand]->type->is_matrix());
1350 }
1351
1352 /* If nothing special happens, this is the result. */
1353 this->result = result_src;
1354
1355 switch (ir->operation) {
1356 case ir_unop_logic_not:
1357 emit(NOT(result_dst, op[0]));
1358 break;
1359 case ir_unop_neg:
1360 op[0].negate = !op[0].negate;
1361 emit(MOV(result_dst, op[0]));
1362 break;
1363 case ir_unop_abs:
1364 op[0].abs = true;
1365 op[0].negate = false;
1366 emit(MOV(result_dst, op[0]));
1367 break;
1368
1369 case ir_unop_sign:
1370 if (ir->type->is_float()) {
1371 /* AND(val, 0x80000000) gives the sign bit.
1372 *
1373 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1374 * zero.
1375 */
1376 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1377
1378 op[0].type = BRW_REGISTER_TYPE_UD;
1379 result_dst.type = BRW_REGISTER_TYPE_UD;
1380 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1381
1382 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1383 inst->predicate = BRW_PREDICATE_NORMAL;
1384
1385 this->result.type = BRW_REGISTER_TYPE_F;
1386 } else {
1387 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1388 * -> non-negative val generates 0x00000000.
1389 * Predicated OR sets 1 if val is positive.
1390 */
1391 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1392
1393 emit(ASR(result_dst, op[0], src_reg(31)));
1394
1395 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 }
1398 break;
1399
1400 case ir_unop_rcp:
1401 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1402 break;
1403
1404 case ir_unop_exp2:
1405 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1406 break;
1407 case ir_unop_log2:
1408 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1409 break;
1410 case ir_unop_exp:
1411 case ir_unop_log:
1412 unreachable("not reached: should be handled by ir_explog_to_explog2");
1413 case ir_unop_sin:
1414 case ir_unop_sin_reduced:
1415 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1416 break;
1417 case ir_unop_cos:
1418 case ir_unop_cos_reduced:
1419 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1420 break;
1421
1422 case ir_unop_dFdx:
1423 case ir_unop_dFdx_coarse:
1424 case ir_unop_dFdx_fine:
1425 case ir_unop_dFdy:
1426 case ir_unop_dFdy_coarse:
1427 case ir_unop_dFdy_fine:
1428 unreachable("derivatives not valid in vertex shader");
1429
1430 case ir_unop_bitfield_reverse:
1431 emit(BFREV(result_dst, op[0]));
1432 break;
1433 case ir_unop_bit_count:
1434 emit(CBIT(result_dst, op[0]));
1435 break;
1436 case ir_unop_find_msb: {
1437 src_reg temp = src_reg(this, glsl_type::uint_type);
1438
1439 inst = emit(FBH(dst_reg(temp), op[0]));
1440 inst->dst.writemask = WRITEMASK_XYZW;
1441
1442 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1443 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1444 * subtract the result from 31 to convert the MSB count into an LSB count.
1445 */
1446
1447 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1448 temp.swizzle = BRW_SWIZZLE_NOOP;
1449 emit(MOV(result_dst, temp));
1450
1451 src_reg src_tmp = src_reg(result_dst);
1452 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1453
1454 src_tmp.negate = true;
1455 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1456 inst->predicate = BRW_PREDICATE_NORMAL;
1457 break;
1458 }
1459 case ir_unop_find_lsb:
1460 emit(FBL(result_dst, op[0]));
1461 break;
1462 case ir_unop_saturate:
1463 inst = emit(MOV(result_dst, op[0]));
1464 inst->saturate = true;
1465 break;
1466
1467 case ir_unop_noise:
1468 unreachable("not reached: should be handled by lower_noise");
1469
1470 case ir_binop_add:
1471 emit(ADD(result_dst, op[0], op[1]));
1472 break;
1473 case ir_binop_sub:
1474 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1475
1476 case ir_binop_mul:
1477 if (brw->gen < 8 && ir->type->is_integer()) {
1478 /* For integer multiplication, the MUL uses the low 16 bits of one of
1479 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1480 * then accumulates the contribution of the upper 16 bits of that
1481 * operand. If we can determine that one of the args is in the low
1482 * 16 bits, though, we can just emit a single MUL.
1483 */
1484 if (ir->operands[0]->is_uint16_constant()) {
1485 if (brw->gen < 7)
1486 emit(MUL(result_dst, op[0], op[1]));
1487 else
1488 emit(MUL(result_dst, op[1], op[0]));
1489 } else if (ir->operands[1]->is_uint16_constant()) {
1490 if (brw->gen < 7)
1491 emit(MUL(result_dst, op[1], op[0]));
1492 else
1493 emit(MUL(result_dst, op[0], op[1]));
1494 } else {
1495 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1496
1497 emit(MUL(acc, op[0], op[1]));
1498 emit(MACH(dst_null_d(), op[0], op[1]));
1499 emit(MOV(result_dst, src_reg(acc)));
1500 }
1501 } else {
1502 emit(MUL(result_dst, op[0], op[1]));
1503 }
1504 break;
1505 case ir_binop_imul_high: {
1506 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1507
1508 emit(MUL(acc, op[0], op[1]));
1509 emit(MACH(result_dst, op[0], op[1]));
1510 break;
1511 }
1512 case ir_binop_div:
1513 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1514 assert(ir->type->is_integer());
1515 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1516 break;
1517 case ir_binop_carry: {
1518 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1519
1520 emit(ADDC(dst_null_ud(), op[0], op[1]));
1521 emit(MOV(result_dst, src_reg(acc)));
1522 break;
1523 }
1524 case ir_binop_borrow: {
1525 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1526
1527 emit(SUBB(dst_null_ud(), op[0], op[1]));
1528 emit(MOV(result_dst, src_reg(acc)));
1529 break;
1530 }
1531 case ir_binop_mod:
1532 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1533 assert(ir->type->is_integer());
1534 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1535 break;
1536
1537 case ir_binop_less:
1538 case ir_binop_greater:
1539 case ir_binop_lequal:
1540 case ir_binop_gequal:
1541 case ir_binop_equal:
1542 case ir_binop_nequal: {
1543 if (brw->gen <= 5) {
1544 resolve_bool_comparison(ir->operands[0], &op[0]);
1545 resolve_bool_comparison(ir->operands[1], &op[1]);
1546 }
1547 emit(CMP(result_dst, op[0], op[1],
1548 brw_conditional_for_comparison(ir->operation)));
1549 break;
1550 }
1551
1552 case ir_binop_all_equal:
1553 if (brw->gen <= 5) {
1554 resolve_bool_comparison(ir->operands[0], &op[0]);
1555 resolve_bool_comparison(ir->operands[1], &op[1]);
1556 }
1557
1558 /* "==" operator producing a scalar boolean. */
1559 if (ir->operands[0]->type->is_vector() ||
1560 ir->operands[1]->type->is_vector()) {
1561 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1562 emit(MOV(result_dst, src_reg(0)));
1563 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1564 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1565 } else {
1566 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1567 }
1568 break;
1569 case ir_binop_any_nequal:
1570 if (brw->gen <= 5) {
1571 resolve_bool_comparison(ir->operands[0], &op[0]);
1572 resolve_bool_comparison(ir->operands[1], &op[1]);
1573 }
1574
1575 /* "!=" operator producing a scalar boolean. */
1576 if (ir->operands[0]->type->is_vector() ||
1577 ir->operands[1]->type->is_vector()) {
1578 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1579
1580 emit(MOV(result_dst, src_reg(0)));
1581 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1582 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1583 } else {
1584 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1585 }
1586 break;
1587
1588 case ir_unop_any:
1589 if (brw->gen <= 5) {
1590 resolve_bool_comparison(ir->operands[0], &op[0]);
1591 }
1592 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1593 emit(MOV(result_dst, src_reg(0)));
1594
1595 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1596 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1597 break;
1598
1599 case ir_binop_logic_xor:
1600 emit(XOR(result_dst, op[0], op[1]));
1601 break;
1602
1603 case ir_binop_logic_or:
1604 emit(OR(result_dst, op[0], op[1]));
1605 break;
1606
1607 case ir_binop_logic_and:
1608 emit(AND(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_dot:
1612 assert(ir->operands[0]->type->is_vector());
1613 assert(ir->operands[0]->type == ir->operands[1]->type);
1614 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1615 break;
1616
1617 case ir_unop_sqrt:
1618 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1619 break;
1620 case ir_unop_rsq:
1621 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1622 break;
1623
1624 case ir_unop_bitcast_i2f:
1625 case ir_unop_bitcast_u2f:
1626 this->result = op[0];
1627 this->result.type = BRW_REGISTER_TYPE_F;
1628 break;
1629
1630 case ir_unop_bitcast_f2i:
1631 this->result = op[0];
1632 this->result.type = BRW_REGISTER_TYPE_D;
1633 break;
1634
1635 case ir_unop_bitcast_f2u:
1636 this->result = op[0];
1637 this->result.type = BRW_REGISTER_TYPE_UD;
1638 break;
1639
1640 case ir_unop_i2f:
1641 case ir_unop_i2u:
1642 case ir_unop_u2i:
1643 case ir_unop_u2f:
1644 case ir_unop_f2i:
1645 case ir_unop_f2u:
1646 emit(MOV(result_dst, op[0]));
1647 break;
1648 case ir_unop_b2i:
1649 emit(AND(result_dst, op[0], src_reg(1)));
1650 break;
1651 case ir_unop_b2f:
1652 if (brw->gen <= 5) {
1653 resolve_bool_comparison(ir->operands[0], &op[0]);
1654 }
1655 op[0].type = BRW_REGISTER_TYPE_D;
1656 result_dst.type = BRW_REGISTER_TYPE_D;
1657 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1658 result_dst.type = BRW_REGISTER_TYPE_F;
1659 break;
1660 case ir_unop_f2b:
1661 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1662 break;
1663 case ir_unop_i2b:
1664 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1665 break;
1666
1667 case ir_unop_trunc:
1668 emit(RNDZ(result_dst, op[0]));
1669 break;
1670 case ir_unop_ceil: {
1671 src_reg tmp = src_reg(this, ir->type);
1672 op[0].negate = !op[0].negate;
1673 emit(RNDD(dst_reg(tmp), op[0]));
1674 tmp.negate = true;
1675 emit(MOV(result_dst, tmp));
1676 }
1677 break;
1678 case ir_unop_floor:
1679 inst = emit(RNDD(result_dst, op[0]));
1680 break;
1681 case ir_unop_fract:
1682 inst = emit(FRC(result_dst, op[0]));
1683 break;
1684 case ir_unop_round_even:
1685 emit(RNDE(result_dst, op[0]));
1686 break;
1687
1688 case ir_binop_min:
1689 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1690 break;
1691 case ir_binop_max:
1692 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1693 break;
1694
1695 case ir_binop_pow:
1696 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1697 break;
1698
1699 case ir_unop_bit_not:
1700 inst = emit(NOT(result_dst, op[0]));
1701 break;
1702 case ir_binop_bit_and:
1703 inst = emit(AND(result_dst, op[0], op[1]));
1704 break;
1705 case ir_binop_bit_xor:
1706 inst = emit(XOR(result_dst, op[0], op[1]));
1707 break;
1708 case ir_binop_bit_or:
1709 inst = emit(OR(result_dst, op[0], op[1]));
1710 break;
1711
1712 case ir_binop_lshift:
1713 inst = emit(SHL(result_dst, op[0], op[1]));
1714 break;
1715
1716 case ir_binop_rshift:
1717 if (ir->type->base_type == GLSL_TYPE_INT)
1718 inst = emit(ASR(result_dst, op[0], op[1]));
1719 else
1720 inst = emit(SHR(result_dst, op[0], op[1]));
1721 break;
1722
1723 case ir_binop_bfm:
1724 emit(BFI1(result_dst, op[0], op[1]));
1725 break;
1726
1727 case ir_binop_ubo_load: {
1728 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1729 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1730 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1731 src_reg offset;
1732
1733 /* Now, load the vector from that offset. */
1734 assert(ir->type->is_vector() || ir->type->is_scalar());
1735
1736 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1737 packed_consts.type = result.type;
1738 src_reg surf_index;
1739
1740 if (const_uniform_block) {
1741 /* The block index is a constant, so just emit the binding table entry
1742 * as an immediate.
1743 */
1744 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1745 const_uniform_block->value.u[0]);
1746 } else {
1747 /* The block index is not a constant. Evaluate the index expression
1748 * per-channel and add the base UBO index; the generator will select
1749 * a value from any live channel.
1750 */
1751 surf_index = src_reg(this, glsl_type::uint_type);
1752 emit(ADD(dst_reg(surf_index), op[0],
1753 src_reg(prog_data->base.binding_table.ubo_start)));
1754
1755 /* Assume this may touch any UBO. It would be nice to provide
1756 * a tighter bound, but the array information is already lowered away.
1757 */
1758 brw_mark_surface_used(&prog_data->base,
1759 prog_data->base.binding_table.ubo_start +
1760 shader_prog->NumUniformBlocks - 1);
1761 }
1762
1763 if (const_offset_ir) {
1764 if (brw->gen >= 8) {
1765 /* Store the offset in a GRF so we can send-from-GRF. */
1766 offset = src_reg(this, glsl_type::int_type);
1767 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1768 } else {
1769 /* Immediates are fine on older generations since they'll be moved
1770 * to a (potentially fake) MRF at the generator level.
1771 */
1772 offset = src_reg(const_offset / 16);
1773 }
1774 } else {
1775 offset = src_reg(this, glsl_type::uint_type);
1776 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1777 }
1778
1779 if (brw->gen >= 7) {
1780 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1781
1782 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1783 * Reserve space for the register.
1784 */
1785 if (brw->gen >= 9) {
1786 grf_offset.reg_offset++;
1787 alloc.sizes[grf_offset.reg] = 2;
1788 }
1789
1790 grf_offset.type = offset.type;
1791
1792 emit(MOV(grf_offset, offset));
1793
1794 vec4_instruction *pull =
1795 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1796 dst_reg(packed_consts),
1797 surf_index,
1798 src_reg(grf_offset)));
1799 pull->mlen = 1;
1800 } else {
1801 vec4_instruction *pull =
1802 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1803 dst_reg(packed_consts),
1804 surf_index,
1805 offset));
1806 pull->base_mrf = 14;
1807 pull->mlen = 1;
1808 }
1809
1810 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1811 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1812 const_offset % 16 / 4,
1813 const_offset % 16 / 4,
1814 const_offset % 16 / 4);
1815
1816 /* UBO bools are any nonzero int. We need to convert them to use the
1817 * value of true stored in ctx->Const.UniformBooleanTrue.
1818 */
1819 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1820 emit(CMP(result_dst, packed_consts, src_reg(0u),
1821 BRW_CONDITIONAL_NZ));
1822 } else {
1823 emit(MOV(result_dst, packed_consts));
1824 }
1825 break;
1826 }
1827
1828 case ir_binop_vector_extract:
1829 unreachable("should have been lowered by vec_index_to_cond_assign");
1830
1831 case ir_triop_fma:
1832 op[0] = fix_3src_operand(op[0]);
1833 op[1] = fix_3src_operand(op[1]);
1834 op[2] = fix_3src_operand(op[2]);
1835 /* Note that the instruction's argument order is reversed from GLSL
1836 * and the IR.
1837 */
1838 emit(MAD(result_dst, op[2], op[1], op[0]));
1839 break;
1840
1841 case ir_triop_lrp:
1842 emit_lrp(result_dst, op[0], op[1], op[2]);
1843 break;
1844
1845 case ir_triop_csel:
1846 unreachable("already handled above");
1847 break;
1848
1849 case ir_triop_bfi:
1850 op[0] = fix_3src_operand(op[0]);
1851 op[1] = fix_3src_operand(op[1]);
1852 op[2] = fix_3src_operand(op[2]);
1853 emit(BFI2(result_dst, op[0], op[1], op[2]));
1854 break;
1855
1856 case ir_triop_bitfield_extract:
1857 op[0] = fix_3src_operand(op[0]);
1858 op[1] = fix_3src_operand(op[1]);
1859 op[2] = fix_3src_operand(op[2]);
1860 /* Note that the instruction's argument order is reversed from GLSL
1861 * and the IR.
1862 */
1863 emit(BFE(result_dst, op[2], op[1], op[0]));
1864 break;
1865
1866 case ir_triop_vector_insert:
1867 unreachable("should have been lowered by lower_vector_insert");
1868
1869 case ir_quadop_bitfield_insert:
1870 unreachable("not reached: should be handled by "
1871 "bitfield_insert_to_bfm_bfi\n");
1872
1873 case ir_quadop_vector:
1874 unreachable("not reached: should be handled by lower_quadop_vector");
1875
1876 case ir_unop_pack_half_2x16:
1877 emit_pack_half_2x16(result_dst, op[0]);
1878 break;
1879 case ir_unop_unpack_half_2x16:
1880 emit_unpack_half_2x16(result_dst, op[0]);
1881 break;
1882 case ir_unop_unpack_unorm_4x8:
1883 emit_unpack_unorm_4x8(result_dst, op[0]);
1884 break;
1885 case ir_unop_unpack_snorm_4x8:
1886 emit_unpack_snorm_4x8(result_dst, op[0]);
1887 break;
1888 case ir_unop_pack_unorm_4x8:
1889 emit_pack_unorm_4x8(result_dst, op[0]);
1890 break;
1891 case ir_unop_pack_snorm_4x8:
1892 emit_pack_snorm_4x8(result_dst, op[0]);
1893 break;
1894 case ir_unop_pack_snorm_2x16:
1895 case ir_unop_pack_unorm_2x16:
1896 case ir_unop_unpack_snorm_2x16:
1897 case ir_unop_unpack_unorm_2x16:
1898 unreachable("not reached: should be handled by lower_packing_builtins");
1899 case ir_unop_unpack_half_2x16_split_x:
1900 case ir_unop_unpack_half_2x16_split_y:
1901 case ir_binop_pack_half_2x16_split:
1902 case ir_unop_interpolate_at_centroid:
1903 case ir_binop_interpolate_at_sample:
1904 case ir_binop_interpolate_at_offset:
1905 unreachable("not reached: should not occur in vertex shader");
1906 case ir_binop_ldexp:
1907 unreachable("not reached: should be handled by ldexp_to_arith()");
1908 case ir_unop_d2f:
1909 case ir_unop_f2d:
1910 case ir_unop_d2i:
1911 case ir_unop_i2d:
1912 case ir_unop_d2u:
1913 case ir_unop_u2d:
1914 case ir_unop_d2b:
1915 case ir_unop_pack_double_2x32:
1916 case ir_unop_unpack_double_2x32:
1917 case ir_unop_frexp_sig:
1918 case ir_unop_frexp_exp:
1919 unreachable("fp64 todo");
1920 }
1921 }
1922
1923
1924 void
1925 vec4_visitor::visit(ir_swizzle *ir)
1926 {
1927 /* Note that this is only swizzles in expressions, not those on the left
1928 * hand side of an assignment, which do write masking. See ir_assignment
1929 * for that.
1930 */
1931 const unsigned swz = brw_compose_swizzle(
1932 brw_swizzle_for_size(ir->type->vector_elements),
1933 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1934
1935 ir->val->accept(this);
1936 this->result = swizzle(this->result, swz);
1937 }
1938
1939 void
1940 vec4_visitor::visit(ir_dereference_variable *ir)
1941 {
1942 const struct glsl_type *type = ir->type;
1943 dst_reg *reg = variable_storage(ir->var);
1944
1945 if (!reg) {
1946 fail("Failed to find variable storage for %s\n", ir->var->name);
1947 this->result = src_reg(brw_null_reg());
1948 return;
1949 }
1950
1951 this->result = src_reg(*reg);
1952
1953 /* System values get their swizzle from the dst_reg writemask */
1954 if (ir->var->data.mode == ir_var_system_value)
1955 return;
1956
1957 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1958 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
1959 }
1960
1961
1962 int
1963 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1964 {
1965 /* Under normal circumstances array elements are stored consecutively, so
1966 * the stride is equal to the size of the array element.
1967 */
1968 return type_size(ir->type);
1969 }
1970
1971
1972 void
1973 vec4_visitor::visit(ir_dereference_array *ir)
1974 {
1975 ir_constant *constant_index;
1976 src_reg src;
1977 int array_stride = compute_array_stride(ir);
1978
1979 constant_index = ir->array_index->constant_expression_value();
1980
1981 ir->array->accept(this);
1982 src = this->result;
1983
1984 if (constant_index) {
1985 src.reg_offset += constant_index->value.i[0] * array_stride;
1986 } else {
1987 /* Variable index array dereference. It eats the "vec4" of the
1988 * base of the array and an index that offsets the Mesa register
1989 * index.
1990 */
1991 ir->array_index->accept(this);
1992
1993 src_reg index_reg;
1994
1995 if (array_stride == 1) {
1996 index_reg = this->result;
1997 } else {
1998 index_reg = src_reg(this, glsl_type::int_type);
1999
2000 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2001 }
2002
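/* A sketch of the nested case handled below: if the base of the array was
 * itself reached through a variable index, fold the existing reladdr into
 * the new index so that a single reladdr remains on the source.
 */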
2003 if (src.reladdr) {
2004 src_reg temp = src_reg(this, glsl_type::int_type);
2005
2006 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2007
2008 index_reg = temp;
2009 }
2010
2011 src.reladdr = ralloc(mem_ctx, src_reg);
2012 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2013 }
2014
2015 /* If the type is smaller than a vec4, replicate the last channel out. */
2016 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2017 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2018 else
2019 src.swizzle = BRW_SWIZZLE_NOOP;
2020 src.type = brw_type_for_base_type(ir->type);
2021
2022 this->result = src;
2023 }
2024
2025 void
2026 vec4_visitor::visit(ir_dereference_record *ir)
2027 {
2028 unsigned int i;
2029 const glsl_type *struct_type = ir->record->type;
2030 int offset = 0;
2031
2032 ir->record->accept(this);
2033
2034 for (i = 0; i < struct_type->length; i++) {
2035 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2036 break;
2037 offset += type_size(struct_type->fields.structure[i].type);
2038 }
2039
2040 /* If the type is smaller than a vec4, replicate the last channel out. */
2041 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2042 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2043 else
2044 this->result.swizzle = BRW_SWIZZLE_NOOP;
2045 this->result.type = brw_type_for_base_type(ir->type);
2046
2047 this->result.reg_offset += offset;
2048 }
2049
2050 /**
2051 * We want to be careful in assignment setup to hit the actual storage
2052 * instead of potentially using a temporary like we might with the
2053 * ir_dereference handler.
2054 */
2055 static dst_reg
2056 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2057 {
2058 /* The LHS must be a dereference. If the LHS is a variable indexed array
2059 * access of a vector, it must be separated into a series of conditional moves
2060 * before reaching this point (see ir_vec_index_to_cond_assign).
2061 */
2062 assert(ir->as_dereference());
2063 ir_dereference_array *deref_array = ir->as_dereference_array();
2064 if (deref_array) {
2065 assert(!deref_array->array->type->is_vector());
2066 }
2067
2068 /* Use the rvalue deref handler for the most part. We'll ignore
2069 * swizzles in it and write swizzles using writemask, though.
2070 */
2071 ir->accept(v);
2072 return dst_reg(v->result);
2073 }
2074
2075 void
2076 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2077 const struct glsl_type *type,
2078 enum brw_predicate predicate)
2079 {
2080 if (type->base_type == GLSL_TYPE_STRUCT) {
2081 for (unsigned int i = 0; i < type->length; i++) {
2082 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2083 }
2084 return;
2085 }
2086
2087 if (type->is_array()) {
2088 for (unsigned int i = 0; i < type->length; i++) {
2089 emit_block_move(dst, src, type->fields.array, predicate);
2090 }
2091 return;
2092 }
2093
2094 if (type->is_matrix()) {
2095 const struct glsl_type *vec_type;
2096
2097 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2098 type->vector_elements, 1);
2099
2100 for (int i = 0; i < type->matrix_columns; i++) {
2101 emit_block_move(dst, src, vec_type, predicate);
2102 }
2103 return;
2104 }
2105
2106 assert(type->is_scalar() || type->is_vector());
2107
2108 dst->type = brw_type_for_base_type(type);
2109 src->type = dst->type;
2110
2111 dst->writemask = (1 << type->vector_elements) - 1;
2112
2113 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2114
2115 vec4_instruction *inst = emit(MOV(*dst, *src));
2116 inst->predicate = predicate;
2117
2118 dst->reg_offset++;
2119 src->reg_offset++;
2120 }
2121
2122
2123 /* If the RHS processing resulted in an instruction generating a
2124 * temporary value, and it would be easy to rewrite the instruction to
2125 * generate its result right into the LHS instead, do so. This ends
2126 * up reliably removing instructions where it can be tricky to do so
2127 * later without real UD chain information.
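*
* For example, when the RHS of an assignment is a single ADD into a fresh
* temporary, that ADD can be retargeted to write the LHS storage directly,
* so the copy MOV never has to be emitted.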
2128 */
2129 bool
2130 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2131 dst_reg dst,
2132 src_reg src,
2133 vec4_instruction *pre_rhs_inst,
2134 vec4_instruction *last_rhs_inst)
2135 {
2136 /* This could be supported, but it would take more smarts. */
2137 if (ir->condition)
2138 return false;
2139
2140 if (pre_rhs_inst == last_rhs_inst)
2141 return false; /* No instructions generated to work with. */
2142
2143 /* Make sure the last instruction generated our source reg. */
2144 if (src.file != GRF ||
2145 src.file != last_rhs_inst->dst.file ||
2146 src.reg != last_rhs_inst->dst.reg ||
2147 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2148 src.reladdr ||
2149 src.abs ||
2150 src.negate ||
2151 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2152 return false;
2153
2154 * Check that the last instruction fully initialized the channels
2155 * we want to use, in the order we want to use them. We could
2156 * potentially reswizzle the operands of many instructions so that
2157 * we could handle out of order channels, but don't yet.
2158 */
2159
2160 for (unsigned i = 0; i < 4; i++) {
2161 if (dst.writemask & (1 << i)) {
2162 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2163 return false;
2164
2165 if (BRW_GET_SWZ(src.swizzle, i) != i)
2166 return false;
2167 }
2168 }
2169
2170 /* Success! Rewrite the instruction. */
2171 last_rhs_inst->dst.file = dst.file;
2172 last_rhs_inst->dst.reg = dst.reg;
2173 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2174 last_rhs_inst->dst.reladdr = dst.reladdr;
2175 last_rhs_inst->dst.writemask &= dst.writemask;
2176
2177 return true;
2178 }
2179
2180 void
2181 vec4_visitor::visit(ir_assignment *ir)
2182 {
2183 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2184 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2185
2186 if (!ir->lhs->type->is_scalar() &&
2187 !ir->lhs->type->is_vector()) {
2188 ir->rhs->accept(this);
2189 src_reg src = this->result;
2190
2191 if (ir->condition) {
2192 emit_bool_to_cond_code(ir->condition, &predicate);
2193 }
2194
2195 /* emit_block_move doesn't account for swizzles in the source register.
2196 * This should be ok, since the source register is a structure or an
2197 * array, and those can't be swizzled. But double-check to be sure.
2198 */
2199 assert(src.swizzle ==
2200 (ir->rhs->type->is_matrix()
2201 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2202 : BRW_SWIZZLE_NOOP));
2203
2204 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2205 return;
2206 }
2207
2208 /* Now we're down to just a scalar/vector with writemasks. */
2209 int i;
2210
2211 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2212 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2213
2214 ir->rhs->accept(this);
2215
2216 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2217
2218 int swizzles[4];
2219 int src_chan = 0;
2220
2221 assert(ir->lhs->type->is_vector() ||
2222 ir->lhs->type->is_scalar());
2223 dst.writemask = ir->write_mask;
2224
2225 /* Swizzle a small RHS vector into the channels being written.
2226 *
2227 * GLSL IR treats write_mask as dictating how many channels are
2228 * present on the RHS, while in our instructions we need to make
2229 * those channels appear in the slots of the vec4 they're written to.
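*
* For example, with a .zw write mask the loop below produces the swizzle
* (x, x, x, y), so the two RHS channels in .xy land in the .z and .w
* slots of the destination.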
2230 */
2231 for (int i = 0; i < 4; i++)
2232 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2233
2234 src_reg src = swizzle(this->result,
2235 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2236 swizzles[2], swizzles[3]));
2237
2238 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2239 return;
2240 }
2241
2242 if (ir->condition) {
2243 emit_bool_to_cond_code(ir->condition, &predicate);
2244 }
2245
2246 for (i = 0; i < type_size(ir->lhs->type); i++) {
2247 vec4_instruction *inst = emit(MOV(dst, src));
2248 inst->predicate = predicate;
2249
2250 dst.reg_offset++;
2251 src.reg_offset++;
2252 }
2253 }
2254
2255 void
2256 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2257 {
2258 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2259 foreach_in_list(ir_constant, field_value, &ir->components) {
2260 emit_constant_values(dst, field_value);
2261 }
2262 return;
2263 }
2264
2265 if (ir->type->is_array()) {
2266 for (unsigned int i = 0; i < ir->type->length; i++) {
2267 emit_constant_values(dst, ir->array_elements[i]);
2268 }
2269 return;
2270 }
2271
2272 if (ir->type->is_matrix()) {
2273 for (int i = 0; i < ir->type->matrix_columns; i++) {
2274 float *vec = &ir->value.f[i * ir->type->vector_elements];
2275
2276 for (int j = 0; j < ir->type->vector_elements; j++) {
2277 dst->writemask = 1 << j;
2278 dst->type = BRW_REGISTER_TYPE_F;
2279
2280 emit(MOV(*dst, src_reg(vec[j])));
2281 }
2282 dst->reg_offset++;
2283 }
2284 return;
2285 }
2286
2287 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2288
2289 for (int i = 0; i < ir->type->vector_elements; i++) {
2290 if (!(remaining_writemask & (1 << i)))
2291 continue;
2292
2293 dst->writemask = 1 << i;
2294 dst->type = brw_type_for_base_type(ir->type);
2295
2296 /* Find other components that match the one we're about to
2297 * write. Emits fewer instructions for things like vec4(0.5,
2298 * 1.5, 1.5, 1.5).
2299 */
2300 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2301 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2302 if (ir->value.b[i] == ir->value.b[j])
2303 dst->writemask |= (1 << j);
2304 } else {
2305 /* u, i, and f storage all line up, so no need for a
2306 * switch case for comparing each type.
2307 */
2308 if (ir->value.u[i] == ir->value.u[j])
2309 dst->writemask |= (1 << j);
2310 }
2311 }
2312
2313 switch (ir->type->base_type) {
2314 case GLSL_TYPE_FLOAT:
2315 emit(MOV(*dst, src_reg(ir->value.f[i])));
2316 break;
2317 case GLSL_TYPE_INT:
2318 emit(MOV(*dst, src_reg(ir->value.i[i])));
2319 break;
2320 case GLSL_TYPE_UINT:
2321 emit(MOV(*dst, src_reg(ir->value.u[i])));
2322 break;
2323 case GLSL_TYPE_BOOL:
2324 emit(MOV(*dst,
2325 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2326 : 0)));
2327 break;
2328 default:
2329 unreachable("Non-float/uint/int/bool constant");
2330 }
2331
2332 remaining_writemask &= ~dst->writemask;
2333 }
2334 dst->reg_offset++;
2335 }
2336
2337 void
2338 vec4_visitor::visit(ir_constant *ir)
2339 {
2340 dst_reg dst = dst_reg(this, ir->type);
2341 this->result = src_reg(dst);
2342
2343 emit_constant_values(&dst, ir);
2344 }
2345
2346 void
2347 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2348 {
2349 ir_dereference *deref = static_cast<ir_dereference *>(
2350 ir->actual_parameters.get_head());
2351 ir_variable *location = deref->variable_referenced();
2352 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2353 location->data.binding);
2354
2355 /* Calculate the surface offset */
2356 src_reg offset(this, glsl_type::uint_type);
2357 ir_dereference_array *deref_array = deref->as_dereference_array();
2358 if (deref_array) {
2359 deref_array->array_index->accept(this);
2360
2361 src_reg tmp(this, glsl_type::uint_type);
2362 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2363 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2364 } else {
2365 offset = location->data.atomic.offset;
2366 }
2367
2368 /* Emit the appropriate machine instruction */
2369 const char *callee = ir->callee->function_name();
2370 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2371
2372 if (!strcmp("__intrinsic_atomic_read", callee)) {
2373 emit_untyped_surface_read(surf_index, dst, offset);
2374
2375 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2376 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2377 src_reg(), src_reg());
2378
2379 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2380 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2381 src_reg(), src_reg());
2382 }
2383 }
2384
2385 void
2386 vec4_visitor::visit(ir_call *ir)
2387 {
2388 const char *callee = ir->callee->function_name();
2389
2390 if (!strcmp("__intrinsic_atomic_read", callee) ||
2391 !strcmp("__intrinsic_atomic_increment", callee) ||
2392 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2393 visit_atomic_counter_intrinsic(ir);
2394 } else {
2395 unreachable("Unsupported intrinsic.");
2396 }
2397 }
2398
2399 src_reg
2400 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2401 {
2402 vec4_instruction *inst =
2403 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2404 dst_reg(this, glsl_type::uvec4_type));
2405 inst->base_mrf = 2;
2406 inst->mlen = 1;
2407 inst->src[1] = sampler;
2408
2409 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2410 int param_base = inst->base_mrf;
2411 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2412 int zero_mask = 0xf & ~coord_mask;
2413
2414 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2415 coordinate));
2416
2417 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2418 src_reg(0)));
2419
2420 emit(inst);
2421 return src_reg(inst->dst);
2422 }
2423
2424 static bool
2425 is_high_sampler(struct brw_context *brw, src_reg sampler)
2426 {
2427 if (brw->gen < 8 && !brw->is_haswell)
2428 return false;
2429
2430 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2431 }
2432
2433 void
2434 vec4_visitor::visit(ir_texture *ir)
2435 {
2436 uint32_t sampler =
2437 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2438
2439 ir_rvalue *nonconst_sampler_index =
2440 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2441
2442 /* Handle non-constant sampler array indexing */
2443 src_reg sampler_reg;
2444 if (nonconst_sampler_index) {
2445 /* The highest sampler which may be used by this operation is
2446 * the last element of the array. Mark it here, because the generator
2447 * doesn't have enough information to determine the bound.
2448 */
2449 uint32_t array_size = ir->sampler->as_dereference_array()
2450 ->array->type->array_size();
2451
2452 uint32_t max_used = sampler + array_size - 1;
2453 if (ir->op == ir_tg4 && brw->gen < 8) {
2454 max_used += prog_data->base.binding_table.gather_texture_start;
2455 } else {
2456 max_used += prog_data->base.binding_table.texture_start;
2457 }
2458
2459 brw_mark_surface_used(&prog_data->base, max_used);
2460
2461 /* Emit code to evaluate the actual indexing expression */
2462 nonconst_sampler_index->accept(this);
2463 dst_reg temp(this, glsl_type::uint_type);
2464 emit(ADD(temp, this->result, src_reg(sampler)))
2465 ->force_writemask_all = true;
2466 sampler_reg = src_reg(temp);
2467 } else {
2468 /* Single sampler, or constant array index; the indexing expression
2469 * is just an immediate.
2470 */
2471 sampler_reg = src_reg(sampler);
2472 }
2473
2474 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2475 * emitting anything other than setting up the constant result.
2476 */
2477 if (ir->op == ir_tg4) {
2478 ir_constant *chan = ir->lod_info.component->as_constant();
2479 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2480 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2481 dst_reg result(this, ir->type);
2482 this->result = src_reg(result);
2483 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2484 return;
2485 }
2486 }
2487
2488 /* Should be lowered by do_lower_texture_projection */
2489 assert(!ir->projector);
2490
2491 /* Should be lowered */
2492 assert(!ir->offset || !ir->offset->type->is_array());
2493
2494 /* Generate code to compute all the subexpression trees. This has to be
2495 * done before loading any values into MRFs for the sampler message since
2496 * generating these values may involve SEND messages that need the MRFs.
2497 */
2498 src_reg coordinate;
2499 if (ir->coordinate) {
2500 ir->coordinate->accept(this);
2501 coordinate = this->result;
2502 }
2503
2504 src_reg shadow_comparitor;
2505 if (ir->shadow_comparitor) {
2506 ir->shadow_comparitor->accept(this);
2507 shadow_comparitor = this->result;
2508 }
2509
2510 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2511 src_reg offset_value;
2512 if (has_nonconstant_offset) {
2513 ir->offset->accept(this);
2514 offset_value = src_reg(this->result);
2515 }
2516
2517 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2518 src_reg lod, dPdx, dPdy, sample_index, mcs;
2519 switch (ir->op) {
2520 case ir_tex:
2521 lod = src_reg(0.0f);
2522 lod_type = glsl_type::float_type;
2523 break;
2524 case ir_txf:
2525 case ir_txl:
2526 case ir_txs:
2527 ir->lod_info.lod->accept(this);
2528 lod = this->result;
2529 lod_type = ir->lod_info.lod->type;
2530 break;
2531 case ir_query_levels:
2532 lod = src_reg(0);
2533 lod_type = glsl_type::int_type;
2534 break;
2535 case ir_txf_ms:
2536 ir->lod_info.sample_index->accept(this);
2537 sample_index = this->result;
2538 sample_index_type = ir->lod_info.sample_index->type;
2539
2540 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2541 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2542 else
2543 mcs = src_reg(0u);
2544 break;
2545 case ir_txd:
2546 ir->lod_info.grad.dPdx->accept(this);
2547 dPdx = this->result;
2548
2549 ir->lod_info.grad.dPdy->accept(this);
2550 dPdy = this->result;
2551
2552 lod_type = ir->lod_info.grad.dPdx->type;
2553 break;
2554 case ir_txb:
2555 case ir_lod:
2556 case ir_tg4:
2557 break;
2558 }
2559
2560 enum opcode opcode;
2561 switch (ir->op) {
2562 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2563 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2564 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2565 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2566 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2567 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2568 case ir_tg4: opcode = has_nonconstant_offset
2569 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2570 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2571 case ir_txb:
2572 unreachable("TXB is not valid for vertex shaders.");
2573 case ir_lod:
2574 unreachable("LOD is not valid for vertex shaders.");
2575 default:
2576 unreachable("Unrecognized tex op");
2577 }
2578
2579 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2580 opcode, dst_reg(this, ir->type));
2581
2582 if (ir->offset != NULL && !has_nonconstant_offset) {
2583 inst->offset =
2584 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2585 ir->offset->type->vector_elements);
2586 }
2587
2588 /* Stuff the channel select bits in the top of the texture offset */
2589 if (ir->op == ir_tg4)
2590 inst->offset |= gather_channel(ir, sampler) << 16;
2591
2592 /* The message header is necessary for:
2593 * - Gen4 (always)
2594 * - Gen9+ for selecting SIMD4x2
2595 * - Texel offsets
2596 * - Gather channel selection
2597 * - Sampler indices too large to fit in a 4-bit value.
2598 */
2599 inst->header_present =
2600 brw->gen < 5 || brw->gen >= 9 ||
2601 inst->offset != 0 || ir->op == ir_tg4 ||
2602 is_high_sampler(brw, sampler_reg);
2603 inst->base_mrf = 2;
2604 inst->mlen = inst->header_present + 1; /* always at least one */
2605 inst->dst.writemask = WRITEMASK_XYZW;
2606 inst->shadow_compare = ir->shadow_comparitor != NULL;
2607
2608 inst->src[1] = sampler_reg;
2609
2610 /* MRF for the first parameter */
2611 int param_base = inst->base_mrf + inst->header_present;
2612
2613 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2614 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2615 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2616 } else {
2617 /* Load the coordinate */
2618 /* FINISHME: gl_clamp_mask and saturate */
2619 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2620 int zero_mask = 0xf & ~coord_mask;
2621
2622 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2623 coordinate));
2624
2625 if (zero_mask != 0) {
2626 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2627 src_reg(0)));
2628 }
2629 /* Load the shadow comparitor */
2630 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2631 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2632 WRITEMASK_X),
2633 shadow_comparitor));
2634 inst->mlen++;
2635 }
2636
2637 /* Load the LOD info */
2638 if (ir->op == ir_tex || ir->op == ir_txl) {
2639 int mrf, writemask;
2640 if (brw->gen >= 5) {
2641 mrf = param_base + 1;
2642 if (ir->shadow_comparitor) {
2643 writemask = WRITEMASK_Y;
2644 /* mlen already incremented */
2645 } else {
2646 writemask = WRITEMASK_X;
2647 inst->mlen++;
2648 }
2649 } else /* brw->gen == 4 */ {
2650 mrf = param_base;
2651 writemask = WRITEMASK_W;
2652 }
2653 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2654 } else if (ir->op == ir_txf) {
2655 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2656 } else if (ir->op == ir_txf_ms) {
2657 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2658 sample_index));
2659 if (brw->gen >= 7) {
2660 /* MCS data is in the first channel of `mcs`, but we need to get it into
2661 * the .y channel of the second vec4 of params, so replicate .x across
2662 * the whole vec4 and then mask off everything except .y
2663 */
2664 mcs.swizzle = BRW_SWIZZLE_XXXX;
2665 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2666 mcs));
2667 }
2668 inst->mlen++;
2669 } else if (ir->op == ir_txd) {
2670 const glsl_type *type = lod_type;
2671
2672 if (brw->gen >= 5) {
2673 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2674 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2675 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2677 inst->mlen++;
2678
2679 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2680 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2681 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2682 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2683 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2684 inst->mlen++;
2685
2686 if (ir->shadow_comparitor) {
2687 emit(MOV(dst_reg(MRF, param_base + 2,
2688 ir->shadow_comparitor->type, WRITEMASK_Z),
2689 shadow_comparitor));
2690 }
2691 }
2692 } else /* brw->gen == 4 */ {
2693 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2694 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2695 inst->mlen += 2;
2696 }
2697 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2698 if (ir->shadow_comparitor) {
2699 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2700 shadow_comparitor));
2701 }
2702
2703 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2704 offset_value));
2705 inst->mlen++;
2706 }
2707 }
2708
2709 emit(inst);
2710
2711 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2712 * spec requires layers.
2713 */
2714 if (ir->op == ir_txs) {
2715 glsl_type const *type = ir->sampler->type;
2716 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2717 type->sampler_array) {
2718 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2719 writemask(inst->dst, WRITEMASK_Z),
2720 src_reg(inst->dst), src_reg(6));
2721 }
2722 }
2723
2724 if (brw->gen == 6 && ir->op == ir_tg4) {
2725 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2726 }
2727
2728 swizzle_result(ir, src_reg(inst->dst), sampler);
2729 }
2730
2731 /**
2732 * Apply workarounds for Gen6 gather with UINT/SINT
2733 */
2734 void
2735 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2736 {
2737 if (!wa)
2738 return;
2739
2740 int width = (wa & WA_8BIT) ? 8 : 16;
2741 dst_reg dst_f = dst;
2742 dst_f.type = BRW_REGISTER_TYPE_F;
2743
2744 /* Convert from UNORM to UINT */
2745 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2746 emit(MOV(dst, src_reg(dst_f)));
2747
2748 if (wa & WA_SIGN) {
2749 /* Reinterpret the UINT value as a signed INT value by
2750 * shifting the sign bit into place, then shifting back
2751 * preserving sign.
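*
* For 8-bit data this is a left shift by 24 followed by an arithmetic
* right shift by 24, which sign-extends bit 7 across the upper bits.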
2752 */
2753 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2754 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2755 }
2756 }
2757
2758 /**
2759 * Set up the gather channel based on the swizzle, for gather4.
2760 */
2761 uint32_t
2762 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2763 {
2764 ir_constant *chan = ir->lod_info.component->as_constant();
2765 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2766 switch (swiz) {
2767 case SWIZZLE_X: return 0;
2768 case SWIZZLE_Y:
2769 /* gather4 sampler is broken for green channel on RG32F --
2770 * we must ask for blue instead.
2771 */
2772 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2773 return 2;
2774 return 1;
2775 case SWIZZLE_Z: return 2;
2776 case SWIZZLE_W: return 3;
2777 default:
2778 unreachable("Not reached"); /* zero, one swizzles handled already */
2779 }
2780 }
2781
2782 void
2783 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2784 {
2785 int s = key->tex.swizzles[sampler];
2786
2787 this->result = src_reg(this, ir->type);
2788 dst_reg swizzled_result(this->result);
2789
2790 if (ir->op == ir_query_levels) {
2791 /* # levels is in .w */
2792 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2793 emit(MOV(swizzled_result, orig_val));
2794 return;
2795 }
2796
2797 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2798 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2799 emit(MOV(swizzled_result, orig_val));
2800 return;
2801 }
2802
2803
2804 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2805 int swizzle[4] = {0};
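/* Split the swizzle into channels copied from the texture result and
 * channels that are constant; e.g. an "RG01" swizzle copies .xy and
 * writes 0.0 to .z and 1.0 to .w below.
 */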
2806
2807 for (int i = 0; i < 4; i++) {
2808 switch (GET_SWZ(s, i)) {
2809 case SWIZZLE_ZERO:
2810 zero_mask |= (1 << i);
2811 break;
2812 case SWIZZLE_ONE:
2813 one_mask |= (1 << i);
2814 break;
2815 default:
2816 copy_mask |= (1 << i);
2817 swizzle[i] = GET_SWZ(s, i);
2818 break;
2819 }
2820 }
2821
2822 if (copy_mask) {
2823 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2824 swizzled_result.writemask = copy_mask;
2825 emit(MOV(swizzled_result, orig_val));
2826 }
2827
2828 if (zero_mask) {
2829 swizzled_result.writemask = zero_mask;
2830 emit(MOV(swizzled_result, src_reg(0.0f)));
2831 }
2832
2833 if (one_mask) {
2834 swizzled_result.writemask = one_mask;
2835 emit(MOV(swizzled_result, src_reg(1.0f)));
2836 }
2837 }
2838
2839 void
2840 vec4_visitor::visit(ir_return *)
2841 {
2842 unreachable("not reached");
2843 }
2844
2845 void
2846 vec4_visitor::visit(ir_discard *)
2847 {
2848 unreachable("not reached");
2849 }
2850
2851 void
2852 vec4_visitor::visit(ir_if *ir)
2853 {
2854 /* Don't point the annotation at the if statement, because then it plus
2855 * the then and else blocks get printed.
2856 */
2857 this->base_ir = ir->condition;
2858
2859 if (brw->gen == 6) {
2860 emit_if_gen6(ir);
2861 } else {
2862 enum brw_predicate predicate;
2863 emit_bool_to_cond_code(ir->condition, &predicate);
2864 emit(IF(predicate));
2865 }
2866
2867 visit_instructions(&ir->then_instructions);
2868
2869 if (!ir->else_instructions.is_empty()) {
2870 this->base_ir = ir->condition;
2871 emit(BRW_OPCODE_ELSE);
2872
2873 visit_instructions(&ir->else_instructions);
2874 }
2875
2876 this->base_ir = ir->condition;
2877 emit(BRW_OPCODE_ENDIF);
2878 }
2879
2880 void
2881 vec4_visitor::visit(ir_emit_vertex *)
2882 {
2883 unreachable("not reached");
2884 }
2885
2886 void
2887 vec4_visitor::visit(ir_end_primitive *)
2888 {
2889 unreachable("not reached");
2890 }
2891
2892 void
2893 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2894 dst_reg dst, src_reg offset,
2895 src_reg src0, src_reg src1)
2896 {
2897 unsigned mlen = 0;
2898
2899 /* Set the atomic operation offset. */
2900 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2901 mlen++;
2902
2903 /* Set the atomic operation arguments. */
2904 if (src0.file != BAD_FILE) {
2905 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2906 mlen++;
2907 }
2908
2909 if (src1.file != BAD_FILE) {
2910 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2911 mlen++;
2912 }
2913
2914 /* Emit the instruction. Note that this maps to the normal SIMD8
2915 * untyped atomic message on Ivy Bridge, but that's OK because
2916 * unused channels will be masked out.
2917 */
2918 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2919 src_reg(atomic_op), src_reg(surf_index));
2920 inst->base_mrf = 0;
2921 inst->mlen = mlen;
2922 }
2923
2924 void
2925 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2926 src_reg offset)
2927 {
2928 /* Set the surface read offset. */
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2930
2931 /* Emit the instruction. Note that this maps to the normal SIMD8
2932 * untyped surface read message, but that's OK because unused
2933 * channels will be masked out.
2934 */
2935 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2936 dst, src_reg(surf_index));
2937 inst->base_mrf = 0;
2938 inst->mlen = 1;
2939 }
2940
2941 void
2942 vec4_visitor::emit_ndc_computation()
2943 {
2944 /* Get the position */
2945 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2946
2947 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2948 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2949 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2950
2951 current_annotation = "NDC";
2952 dst_reg ndc_w = ndc;
2953 ndc_w.writemask = WRITEMASK_W;
2954 src_reg pos_w = pos;
2955 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2956 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2957
2958 dst_reg ndc_xyz = ndc;
2959 ndc_xyz.writemask = WRITEMASK_XYZ;
2960
2961 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2962 }
2963
2964 void
2965 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2966 {
2967 if (brw->gen < 6 &&
2968 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2969 key->userclip_active || brw->has_negative_rhw_bug)) {
2970 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2971 dst_reg header1_w = header1;
2972 header1_w.writemask = WRITEMASK_W;
2973
2974 emit(MOV(header1, 0u));
2975
2976 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2977 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2978
2979 current_annotation = "Point size";
2980 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2981 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2982 }
2983
2984 if (key->userclip_active) {
2985 current_annotation = "Clipping flags";
2986 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2987 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2988
2989 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2990 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2991 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2992
2993 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2994 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2995 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2996 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2997 }
2998
2999 /* i965 clipping workaround:
3000 * 1) Test for -ve rhw
3001 * 2) If set,
3002 * set ndc = (0,0,0,0)
3003 * set ucp[6] = 1
3004 *
3005 * Later, clipping will detect ucp[6] and ensure the primitive is
3006 * clipped against all fixed planes.
3007 */
3008 if (brw->has_negative_rhw_bug) {
3009 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3010 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3011 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3012 vec4_instruction *inst;
3013 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3014 inst->predicate = BRW_PREDICATE_NORMAL;
3015 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3016 inst->predicate = BRW_PREDICATE_NORMAL;
3017 }
3018
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3020 } else if (brw->gen < 6) {
3021 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3022 } else {
3023 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3024 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3025 dst_reg reg_w = reg;
3026 reg_w.writemask = WRITEMASK_W;
3027 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3028 }
3029 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3030 dst_reg reg_y = reg;
3031 reg_y.writemask = WRITEMASK_Y;
3032 reg_y.type = BRW_REGISTER_TYPE_D;
3033 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3034 }
3035 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3036 dst_reg reg_z = reg;
3037 reg_z.writemask = WRITEMASK_Z;
3038 reg_z.type = BRW_REGISTER_TYPE_D;
3039 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3040 }
3041 }
3042 }
3043
3044 void
3045 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3046 {
3047 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3048 *
3049 * "If a linked set of shaders forming the vertex stage contains no
3050 * static write to gl_ClipVertex or gl_ClipDistance, but the
3051 * application has requested clipping against user clip planes through
3052 * the API, then the coordinate written to gl_Position is used for
3053 * comparison against the user clip planes."
3054 *
3055 * This function is only called if the shader didn't write to
3056 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3057 * if the user wrote to it; otherwise we use gl_Position.
3058 */
3059 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3060 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3061 clip_vertex = VARYING_SLOT_POS;
3062 }
3063
3064 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3065 ++i) {
3066 reg.writemask = 1 << i;
3067 emit(DP4(reg,
3068 src_reg(output_reg[clip_vertex]),
3069 src_reg(this->userplane[i + offset])));
3070 }
3071 }
3072
3073 vec4_instruction *
3074 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3075 {
3076 assert (varying < VARYING_SLOT_MAX);
3077 reg.type = output_reg[varying].type;
3078 current_annotation = output_reg_annotation[varying];
3079 /* Copy the register, saturating if necessary */
3080 return emit(MOV(reg, src_reg(output_reg[varying])));
3081 }
3082
3083 void
3084 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3085 {
3086 reg.type = BRW_REGISTER_TYPE_F;
3087
3088 switch (varying) {
3089 case VARYING_SLOT_PSIZ:
3090 {
3091 /* PSIZ is always in slot 0, and is coupled with other flags. */
3092 current_annotation = "indices, point width, clip flags";
3093 emit_psiz_and_flags(reg);
3094 break;
3095 }
3096 case BRW_VARYING_SLOT_NDC:
3097 current_annotation = "NDC";
3098 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3099 break;
3100 case VARYING_SLOT_POS:
3101 current_annotation = "gl_Position";
3102 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3103 break;
3104 case VARYING_SLOT_EDGE:
3105 /* This is present when doing unfilled polygons. We're supposed to copy
3106 * the edge flag from the user-provided vertex array
3107 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3108 * of that attribute (starts as 1.0f). This is then used in clipping to
3109 * determine which edges should be drawn as wireframe.
3110 */
3111 current_annotation = "edge flag";
3112 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3113 glsl_type::float_type, WRITEMASK_XYZW))));
3114 break;
3115 case BRW_VARYING_SLOT_PAD:
3116 /* No need to write to this slot */
3117 break;
3118 case VARYING_SLOT_COL0:
3119 case VARYING_SLOT_COL1:
3120 case VARYING_SLOT_BFC0:
3121 case VARYING_SLOT_BFC1: {
3122 /* These built-in varyings are only supported in compatibility mode,
3123 * and we only support GS in core profile. So, this must be a vertex
3124 * shader.
3125 */
3126 assert(stage == MESA_SHADER_VERTEX);
3127 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3128 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3129 inst->saturate = true;
3130 break;
3131 }
3132
3133 default:
3134 emit_generic_urb_slot(reg, varying);
3135 break;
3136 }
3137 }
3138
3139 static int
3140 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3141 {
3142 if (brw->gen >= 6) {
3143 /* URB data written (does not include the message header reg) must
3144 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3145 * section 5.4.3.2.2: URB_INTERLEAVED.
3146 *
3147 * URB entries are allocated on a multiple of 1024 bits, so an
3148 * extra 128 bits written here to make the end align to 256 is
3149 * no problem.
3150 */
3151 if ((mlen % 2) != 1)
3152 mlen++;
3153 }
3154
3155 return mlen;
3156 }
3157
3158
3159 /**
3160 * Generates the VUE payload plus the necessary URB write instructions to
3161 * output it.
3162 *
3163 * The VUE layout is documented in Volume 2a.
3164 */
3165 void
3166 vec4_visitor::emit_vertex()
3167 {
3168 /* MRF 0 is reserved for the debugger, so start with message header
3169 * in MRF 1.
3170 */
3171 int base_mrf = 1;
3172 int mrf = base_mrf;
3173 /* In the process of generating our URB write message contents, we
3174 * may need to unspill a register or load from an array. Those
3175 * reads would use MRFs 14-15.
3176 */
3177 int max_usable_mrf = 13;
3178
3179 /* The following assertion verifies that max_usable_mrf causes an
3180 * even-numbered amount of URB write data, which will meet gen6's
3181 * requirements for length alignment.
3182 */
3183 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3184
3185 /* First mrf is the g0-based message header containing URB handles and
3186 * such.
3187 */
3188 emit_urb_write_header(mrf++);
3189
3190 if (brw->gen < 6) {
3191 emit_ndc_computation();
3192 }
3193
3194 /* Lower legacy ff and ClipVertex clipping to clip distances */
3195 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3196 current_annotation = "user clip distances";
3197
3198 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3199 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3200
3201 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3202 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3203 }
3204
3205 /* We may need to split this up into several URB writes, so do them in a
3206 * loop.
3207 */
3208 int slot = 0;
3209 bool complete = false;
3210 do {
3211 /* URB offset is in URB row increments, and each of our MRFs is half of
3212 * one of those, since we're doing interleaved writes.
3213 */
3214 int offset = slot / 2;
3215
3216 mrf = base_mrf + 1;
3217 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3218 emit_urb_slot(dst_reg(MRF, mrf++),
3219 prog_data->vue_map.slot_to_varying[slot]);
3220
3221 /* If this was max_usable_mrf, we can't fit anything more into this
3222 * URB WRITE.
3223 */
3224 if (mrf > max_usable_mrf) {
3225 slot++;
3226 break;
3227 }
3228 }
3229
3230 complete = slot >= prog_data->vue_map.num_slots;
3231 current_annotation = "URB write";
3232 vec4_instruction *inst = emit_urb_write_opcode(complete);
3233 inst->base_mrf = base_mrf;
3234 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3235 inst->offset += offset;
3236 } while(!complete);
3237 }
3238
3239
3240 src_reg
3241 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3242 src_reg *reladdr, int reg_offset)
3243 {
3244 /* Because we store the values to scratch interleaved like our
3245 * vertex data, we need to scale the vec4 index by 2.
3246 */
3247 int message_header_scale = 2;
3248
3249 /* Pre-gen6, the message header uses byte offsets instead of vec4
3250 * (16-byte) offset units.
3251 */
3252 if (brw->gen < 6)
3253 message_header_scale *= 16;
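/* For instance, a reg_offset of 3 becomes an offset of 6 on gen6+, or a
 * byte offset of 96 (3 * 2 * 16) on older generations.
 */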
3254
3255 if (reladdr) {
3256 src_reg index = src_reg(this, glsl_type::int_type);
3257
3258 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3259 src_reg(reg_offset)));
3260 emit_before(block, inst, MUL(dst_reg(index), index,
3261 src_reg(message_header_scale)));
3262
3263 return index;
3264 } else {
3265 return src_reg(reg_offset * message_header_scale);
3266 }
3267 }
3268
3269 src_reg
3270 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3271 src_reg *reladdr, int reg_offset)
3272 {
3273 if (reladdr) {
3274 src_reg index = src_reg(this, glsl_type::int_type);
3275
3276 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3277 src_reg(reg_offset)));
3278
3279 /* Pre-gen6, the message header uses byte offsets instead of vec4
3280 * (16-byte) offset units.
3281 */
3282 if (brw->gen < 6) {
3283 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3284 }
3285
3286 return index;
3287 } else if (brw->gen >= 8) {
3288 /* Store the offset in a GRF so we can send-from-GRF. */
3289 src_reg offset = src_reg(this, glsl_type::int_type);
3290 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3291 return offset;
3292 } else {
3293 int message_header_scale = brw->gen < 6 ? 16 : 1;
3294 return src_reg(reg_offset * message_header_scale);
3295 }
3296 }
3297
3298 /**
3299 * Emits an instruction before @inst to load the value named by @orig_src
3300 * from scratch space at @base_offset to @temp.
3301 *
3302 * @base_offset is measured in 32-byte units (the size of a register).
3303 */
3304 void
3305 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3306 dst_reg temp, src_reg orig_src,
3307 int base_offset)
3308 {
3309 int reg_offset = base_offset + orig_src.reg_offset;
3310 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3311 reg_offset);
3312
3313 emit_before(block, inst, SCRATCH_READ(temp, index));
3314 }
3315
3316 /**
3317 * Emits an instruction after @inst to store the value to be written
3318 * to @orig_dst to scratch space at @base_offset, from @temp.
3319 *
3320 * @base_offset is measured in 32-byte units (the size of a register).
3321 */
3322 void
3323 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3324 int base_offset)
3325 {
3326 int reg_offset = base_offset + inst->dst.reg_offset;
3327 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3328 reg_offset);
3329
3330 /* Create a temporary register to store *inst's result in.
3331 *
3332 * We have to be careful in MOVing from our temporary result register in
3333 * the scratch write. If we swizzle from channels of the temporary that
3334 * weren't initialized, it will confuse live interval analysis, which will
3335 * make spilling fail to make progress.
3336 */
3337 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3338 inst->dst.type),
3339 brw_swizzle_for_mask(inst->dst.writemask));
3340 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3341 inst->dst.writemask));
3342 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3343 write->predicate = inst->predicate;
3344 write->ir = inst->ir;
3345 write->annotation = inst->annotation;
3346 inst->insert_after(block, write);
3347
3348 inst->dst.file = temp.file;
3349 inst->dst.reg = temp.reg;
3350 inst->dst.reg_offset = temp.reg_offset;
3351 inst->dst.reladdr = NULL;
3352 }
3353
3354 /**
3355 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3356 * adds the scratch read(s) before \p inst. The function also checks for
3357 * recursive reladdr scratch accesses, issuing the corresponding scratch
3358 * loads and rewriting reladdr references accordingly.
3359 *
3360 * \return \p src if it did not require a scratch load, otherwise, the
3361 * register holding the result of the scratch load that the caller should
3362 * use to rewrite src.
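*
* A nested case arises, for example, when the variable index itself comes
* from an array that has been moved to scratch (roughly "a[b[i]]" in GLSL
* terms): \p src.reladdr then names a GRF with a scratch location, and it
* must be loaded back before \p src itself can be resolved.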
3363 */
3364 src_reg
3365 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3366 vec4_instruction *inst, src_reg src)
3367 {
3368 /* Resolve recursive reladdr scratch access by calling ourselves
3369 * with src.reladdr
3370 */
3371 if (src.reladdr)
3372 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3373 *src.reladdr);
3374
3375 /* Now handle scratch access on src */
3376 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3379 src.reg = temp.reg;
3380 src.reg_offset = temp.reg_offset;
3381 src.reladdr = NULL;
3382 }
3383
3384 return src;
3385 }
3386
3387 /**
3388 * We can't generally support array access in GRF space, because a
3389 * single instruction's destination can only span 2 contiguous
3390 * registers. So, we send all GRF arrays that get variable index
3391 * access to scratch space.
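*
* For example, a local array such as "vec4 tmp[8]" indexed with a loop
* counter would need an 8-register destination span, so its accesses are
* rewritten as scratch reads and writes instead.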
3392 */
3393 void
3394 vec4_visitor::move_grf_array_access_to_scratch()
3395 {
3396 int scratch_loc[this->alloc.count];
3397 memset(scratch_loc, -1, sizeof(scratch_loc));
3398
3399 /* First, calculate the set of virtual GRFs that need to be punted
3400 * to scratch due to having any array access on them, and where in
3401 * scratch.
3402 */
3403 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3404 if (inst->dst.file == GRF && inst->dst.reladdr) {
3405 if (scratch_loc[inst->dst.reg] == -1) {
3406 scratch_loc[inst->dst.reg] = c->last_scratch;
3407 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3408 }
3409
3410 for (src_reg *iter = inst->dst.reladdr;
3411 iter->reladdr;
3412 iter = iter->reladdr) {
3413 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3414 scratch_loc[iter->reg] = c->last_scratch;
3415 c->last_scratch += this->alloc.sizes[iter->reg];
3416 }
3417 }
3418 }
3419
3420 for (int i = 0 ; i < 3; i++) {
3421 for (src_reg *iter = &inst->src[i];
3422 iter->reladdr;
3423 iter = iter->reladdr) {
3424 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3425 scratch_loc[iter->reg] = c->last_scratch;
3426 c->last_scratch += this->alloc.sizes[iter->reg];
3427 }
3428 }
3429 }
3430 }
3431
3432 /* Now, for anything that will be accessed through scratch, rewrite
3433 * it to load/store. Note that this is a _safe list walk, because
3434 * we may generate a new scratch_write instruction after the one
3435 * we're processing.
3436 */
3437 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3438 /* Set up the annotation tracking for new generated instructions. */
3439 base_ir = inst->ir;
3440 current_annotation = inst->annotation;
3441
3442 /* First handle scratch access on the dst. Notice we have to handle
3443 * the case where the dst's reladdr also points to scratch space.
3444 */
3445 if (inst->dst.reladdr)
3446 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3447 *inst->dst.reladdr);
3448
3449 /* Now that we have handled any (possibly recursive) reladdr scratch
3450 * accesses for dst we can safely do the scratch write for dst itself
3451 */
3452 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3453 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3454
3455 /* Now handle scratch access on any src. In this case, since inst->src[i]
3456 * already is a src_reg, we can just call emit_resolve_reladdr with
3457 * inst->src[i] and it will take care of handling scratch loads for
3458 * both src and src.reladdr (recursively).
3459 */
3460 for (int i = 0 ; i < 3; i++) {
3461 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3462 inst->src[i]);
3463 }
3464 }
3465 }
3466
3467 /**
3468 * Emits an instruction before @inst to load the value named by @orig_src
3469 * from the pull constant buffer (surface) at @base_offset to @temp.
3470 */
3471 void
3472 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3473 dst_reg temp, src_reg orig_src,
3474 int base_offset)
3475 {
3476 int reg_offset = base_offset + orig_src.reg_offset;
3477 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3478 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3479 reg_offset);
3480 vec4_instruction *load;
3481
3482 if (brw->gen >= 7) {
3483 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3484
3485 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3486 * Reserve space for the register.
3487 */
3488 if (brw->gen >= 9) {
3489 grf_offset.reg_offset++;
3490 alloc.sizes[grf_offset.reg] = 2;
3491 }
3492
3493 grf_offset.type = offset.type;
3494 emit_before(block, inst, MOV(grf_offset, offset));
3495
3496 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3497 temp, index, src_reg(grf_offset));
3498 load->mlen = 1;
3499 } else {
3500 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3501 temp, index, offset);
3502 load->base_mrf = 14;
3503 load->mlen = 1;
3504 }
3505 emit_before(block, inst, load);
3506 }
3507
3508 /**
3509 * Implements array access of uniforms by inserting a
3510 * PULL_CONSTANT_LOAD instruction.
3511 *
3512 * Unlike temporary GRF array access (where we don't support it due to
3513 * the difficulty of doing relative addressing on instruction
3514 * destinations), we could potentially do array access of uniforms
3515 * that were loaded in GRF space as push constants. In real-world
3516 * usage we've seen, though, the arrays being used are always larger
3517 * than we could load as push constants, so just always move all
3518 * uniform array access out to a pull constant buffer.
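*
* A typical example would be a skinning shader that indexes a large
* uniform matrix array with a per-vertex bone index.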
3519 */
3520 void
3521 vec4_visitor::move_uniform_array_access_to_pull_constants()
3522 {
3523 int pull_constant_loc[this->uniforms];
3524 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3525 bool nested_reladdr;
3526
3527 /* Walk through and find array access of uniforms. Put a copy of that
3528 * uniform in the pull constant buffer.
3529 *
3530 * Note that we don't move constant-indexed accesses to arrays. No
3531 * testing has been done of the performance impact of this choice.
3532 */
3533 do {
3534 nested_reladdr = false;
3535
3536 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3537 for (int i = 0 ; i < 3; i++) {
3538 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3539 continue;
3540
3541 int uniform = inst->src[i].reg;
3542
3543 if (inst->src[i].reladdr->reladdr)
3544 nested_reladdr = true; /* will need another pass */
3545
3546 /* If this array isn't already present in the pull constant buffer,
3547 * add it.
3548 */
3549 if (pull_constant_loc[uniform] == -1) {
3550 const gl_constant_value **values =
3551 &stage_prog_data->param[uniform * 4];
3552
3553 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3554
3555 assert(uniform < uniform_array_size);
3556 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3557 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3558 = values[j];
3559 }
3560 }
3561
3562 /* Set up the annotation tracking for new generated instructions. */
3563 base_ir = inst->ir;
3564 current_annotation = inst->annotation;
3565
3566 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3567
3568 emit_pull_constant_load(block, inst, temp, inst->src[i],
3569 pull_constant_loc[uniform]);
3570
3571 inst->src[i].file = temp.file;
3572 inst->src[i].reg = temp.reg;
3573 inst->src[i].reg_offset = temp.reg_offset;
3574 inst->src[i].reladdr = NULL;
3575 }
3576 }
3577 } while (nested_reladdr);
3578
3579 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3580 * no need to track them as larger-than-vec4 objects. This will be
3581 * relied on in cutting out unused uniform vectors from push
3582 * constants.
3583 */
3584 split_uniform_registers();
3585 }
3586
3587 void
3588 vec4_visitor::resolve_ud_negate(src_reg *reg)
3589 {
3590 if (reg->type != BRW_REGISTER_TYPE_UD ||
3591 !reg->negate)
3592 return;
3593
3594 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3595 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3596 *reg = temp;
3597 }
3598
3599 /**
3600 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3601 *
3602 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3603 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
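*
* The fixup below ANDs the value with 1 and negates the result, so an LSB
* of 1 becomes the integer -1 (all bits set) and an LSB of 0 stays 0.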
3604 */
3605 void
3606 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3607 {
3608 assert(brw->gen <= 5);
3609
3610 if (!rvalue->type->is_boolean())
3611 return;
3612
3613 src_reg and_result = src_reg(this, rvalue->type);
3614 src_reg neg_result = src_reg(this, rvalue->type);
3615 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3616 emit(MOV(dst_reg(neg_result), negate(and_result)));
3617 *reg = neg_result;
3618 }
3619
3620 vec4_visitor::vec4_visitor(struct brw_context *brw,
3621 struct brw_vec4_compile *c,
3622 struct gl_program *prog,
3623 const struct brw_vue_prog_key *key,
3624 struct brw_vue_prog_data *prog_data,
3625 struct gl_shader_program *shader_prog,
3626 gl_shader_stage stage,
3627 void *mem_ctx,
3628 bool no_spills,
3629 shader_time_shader_type st_base,
3630 shader_time_shader_type st_written,
3631 shader_time_shader_type st_reset)
3632 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3633 c(c),
3634 key(key),
3635 prog_data(prog_data),
3636 sanity_param_count(0),
3637 fail_msg(NULL),
3638 first_non_payload_grf(0),
3639 need_all_constants_in_pull_buffer(false),
3640 no_spills(no_spills),
3641 st_base(st_base),
3642 st_written(st_written),
3643 st_reset(st_reset)
3644 {
3645 this->mem_ctx = mem_ctx;
3646 this->failed = false;
3647
3648 this->base_ir = NULL;
3649 this->current_annotation = NULL;
3650 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3651
3652 this->variable_ht = hash_table_ctor(0,
3653 hash_table_pointer_hash,
3654 hash_table_pointer_compare);
3655
3656 this->virtual_grf_start = NULL;
3657 this->virtual_grf_end = NULL;
3658 this->live_intervals = NULL;
3659
3660 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3661
3662 this->uniforms = 0;
3663
3664 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3665 * at least one. See setup_uniforms() in brw_vec4.cpp.
3666 */
3667 this->uniform_array_size = 1;
3668 if (prog_data) {
3669 this->uniform_array_size =
3670 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3671 }
3672
3673 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3674 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3675 }
3676
3677 vec4_visitor::~vec4_visitor()
3678 {
3679 hash_table_dtor(this->variable_ht);
3680 }
3681
3682
3683 void
3684 vec4_visitor::fail(const char *format, ...)
3685 {
3686 va_list va;
3687 char *msg;
3688
3689 if (failed)
3690 return;
3691
3692 failed = true;
3693
3694 va_start(va, format);
3695 msg = ralloc_vasprintf(mem_ctx, format, va);
3696 va_end(va);
3697 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3698
3699 this->fail_msg = msg;
3700
3701 if (debug_enabled) {
3702 fprintf(stderr, "%s", msg);
3703 }
3704 }
3705
3706 } /* namespace brw */