i965/vec4: Add a helper function to emit VS_OPCODE_PULL_CONSTANT_LOAD
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
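/**
 * Append an already-constructed instruction to the end of the instruction
 * list, tagging it with the IR node and annotation currently being visited
 * so that debug dumps stay associated with the right source construct.
 */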
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
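/**
 * Insert new_inst immediately before inst in the given basic block,
 * inheriting inst's IR pointer and annotation for debug output.
 */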
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
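/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out one builder method per
 * hardware opcode.  Note that they only construct the vec4_instruction; the
 * caller is still responsible for passing the result to emit().
 */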
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
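/* Builders for the gen4-style scratch read/write messages (used for register
 * spills and indirectly addressed temporaries).  base_mrf and mlen describe
 * the MRF payload that the generator will assemble for the send.
 */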
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
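/* Emit a dot product of the first 'elements' components (DP2, DP3 or DP4). */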
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
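/**
 * Emit a math instruction, working around per-generation restrictions:
 * operands are expanded as needed (see fix_math_operand), gen6 MATH can't
 * honor a partial writemask so it goes through a temporary, and gen4/5 math
 * is a send that needs an MRF payload.
 */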
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416     * The upper word of each write-channel must be 0 for the following
417     * bit-shift and bit-or instructions to work. Note that this relies on
418     * the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
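   /* The vector-float immediate bytes 0x00, 0x60, 0x70 and 0x78 encode
    * <0.0, 8.0, 16.0, 24.0>; the type-converting MOV below turns them into
    * the integer shift counts <0, 8, 16, 24> mentioned above.
    */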
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
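/* Returns the size of a glsl type in units of vec4 slots, which is the
 * granularity the vec4 backend uses for register and uniform allocation.
 */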
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590          /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
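/* Upload the active user clip planes as push constants, one vec4 uniform per
 * enabled plane, and remember them in userplane[] for use when the clip
 * distances are emitted.
 */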
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
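/**
 * Evaluate a boolean rvalue and load its value into the flag register,
 * setting *predicate to the predicate (normal, or one of the ALIGN16
 * ALL4H/ANY4H forms for vector comparisons) that the caller should put on
 * the instruction it wants to predicate.
 */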
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (brw->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (brw->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (brw->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (brw->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (brw->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (brw->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (brw->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (brw->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
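/**
 * Try to fuse an add of a multiply into a single MAD.  Only possible on
 * gen6+ (three-source instructions) and for floating-point data.  Returns
 * false if neither operand is a multiply, in which case the caller falls
 * back to a plain ADD.
 */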
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (brw->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
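/**
 * Try to emit ir_unop_b2f of a comparison as the comparison itself: a CMP
 * that writes the result register directly, followed by a predicated SEL
 * that produces 1.0f where the comparison held and leaves 0.0f where it did
 * not.  Returns false if the operand isn't a supported comparison or the
 * hardware can't guarantee a zero result on failure.
 */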
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210    * leaves the other bits undefined, so this optimization can't be used there.
1211 */
1212 if (brw->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
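/* Emit a MIN or MAX (selected by conditionalmod) as a conditional-mod SEL on
 * gen6+, or as a CMP followed by a predicated SEL on older hardware, which
 * has no conditional SEL.
 */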
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (brw->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (brw->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301  * and before_inst can be NULL, in which case the instructions will be
1302  * appended to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (brw->gen >= 7) {
1317 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1318
1319 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1320 * Reserve space for the register.
1321 */
1322 if (brw->gen >= 9) {
1323 grf_offset.reg_offset++;
1324 alloc.sizes[grf_offset.reg] = 2;
1325 }
1326
1327 grf_offset.type = offset_reg.type;
1328
1329 pull = MOV(grf_offset, offset_reg);
1330
1331 if (before_inst)
1332 emit_before(before_block, before_inst, pull);
1333 else
1334 emit(pull);
1335
1336 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1337 dst,
1338 surf_index,
1339 src_reg(grf_offset));
1340 pull->mlen = 1;
1341 } else {
1342 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1343 dst,
1344 surf_index,
1345 offset_reg);
1346 pull->base_mrf = 14;
1347 pull->mlen = 1;
1348 }
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354 }
1355
1356 void
1357 vec4_visitor::visit(ir_expression *ir)
1358 {
1359 unsigned int operand;
1360 src_reg op[ARRAY_SIZE(ir->operands)];
1361 vec4_instruction *inst;
1362
1363 if (ir->operation == ir_binop_add) {
1364 if (try_emit_mad(ir))
1365 return;
1366 }
1367
1368 if (ir->operation == ir_unop_b2f) {
1369 if (try_emit_b2f_of_compare(ir))
1370 return;
1371 }
1372
1373 /* Storage for our result. Ideally for an assignment we'd be using
1374 * the actual storage for the result here, instead.
1375 */
1376 dst_reg result_dst(this, ir->type);
1377 src_reg result_src(result_dst);
1378
1379 if (ir->operation == ir_triop_csel) {
1380 ir->operands[1]->accept(this);
1381 op[1] = this->result;
1382 ir->operands[2]->accept(this);
1383 op[2] = this->result;
1384
1385 enum brw_predicate predicate;
1386 emit_bool_to_cond_code(ir->operands[0], &predicate);
1387 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1388 inst->predicate = predicate;
1389 this->result = result_src;
1390 return;
1391 }
1392
1393 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1394 this->result.file = BAD_FILE;
1395 ir->operands[operand]->accept(this);
1396 if (this->result.file == BAD_FILE) {
1397 fprintf(stderr, "Failed to get tree for expression operand:\n");
1398 ir->operands[operand]->fprint(stderr);
1399 exit(1);
1400 }
1401 op[operand] = this->result;
1402
1403 /* Matrix expression operands should have been broken down to vector
1404 * operations already.
1405 */
1406 assert(!ir->operands[operand]->type->is_matrix());
1407 }
1408
1409 /* If nothing special happens, this is the result. */
1410 this->result = result_src;
1411
1412 switch (ir->operation) {
1413 case ir_unop_logic_not:
1414 emit(NOT(result_dst, op[0]));
1415 break;
1416 case ir_unop_neg:
1417 op[0].negate = !op[0].negate;
1418 emit(MOV(result_dst, op[0]));
1419 break;
1420 case ir_unop_abs:
1421 op[0].abs = true;
1422 op[0].negate = false;
1423 emit(MOV(result_dst, op[0]));
1424 break;
1425
1426 case ir_unop_sign:
1427 if (ir->type->is_float()) {
1428 /* AND(val, 0x80000000) gives the sign bit.
1429 *
1430 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1431 * zero.
1432 */
1433 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1434
1435 op[0].type = BRW_REGISTER_TYPE_UD;
1436 result_dst.type = BRW_REGISTER_TYPE_UD;
1437 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1438
1439 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1440 inst->predicate = BRW_PREDICATE_NORMAL;
1441
1442 this->result.type = BRW_REGISTER_TYPE_F;
1443 } else {
1444 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1445 * -> non-negative val generates 0x00000000.
1446 * Predicated OR sets 1 if val is positive.
1447 */
1448 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1449
1450 emit(ASR(result_dst, op[0], src_reg(31)));
1451
1452 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1453 inst->predicate = BRW_PREDICATE_NORMAL;
1454 }
1455 break;
1456
1457 case ir_unop_rcp:
1458 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1459 break;
1460
1461 case ir_unop_exp2:
1462 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1463 break;
1464 case ir_unop_log2:
1465 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1466 break;
1467 case ir_unop_exp:
1468 case ir_unop_log:
1469 unreachable("not reached: should be handled by ir_explog_to_explog2");
1470 case ir_unop_sin:
1471 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1472 break;
1473 case ir_unop_cos:
1474 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1475 break;
1476
1477 case ir_unop_dFdx:
1478 case ir_unop_dFdx_coarse:
1479 case ir_unop_dFdx_fine:
1480 case ir_unop_dFdy:
1481 case ir_unop_dFdy_coarse:
1482 case ir_unop_dFdy_fine:
1483 unreachable("derivatives not valid in vertex shader");
1484
1485 case ir_unop_bitfield_reverse:
1486 emit(BFREV(result_dst, op[0]));
1487 break;
1488 case ir_unop_bit_count:
1489 emit(CBIT(result_dst, op[0]));
1490 break;
1491 case ir_unop_find_msb: {
1492 src_reg temp = src_reg(this, glsl_type::uint_type);
1493
1494 inst = emit(FBH(dst_reg(temp), op[0]));
1495 inst->dst.writemask = WRITEMASK_XYZW;
1496
1497 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1498 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1499 * subtract the result from 31 to convert the MSB count into an LSB count.
1500 */
1501
1502 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1503 temp.swizzle = BRW_SWIZZLE_NOOP;
1504 emit(MOV(result_dst, temp));
1505
1506 src_reg src_tmp = src_reg(result_dst);
1507 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1508
1509 src_tmp.negate = true;
1510 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1511 inst->predicate = BRW_PREDICATE_NORMAL;
1512 break;
1513 }
1514 case ir_unop_find_lsb:
1515 emit(FBL(result_dst, op[0]));
1516 break;
1517 case ir_unop_saturate:
1518 inst = emit(MOV(result_dst, op[0]));
1519 inst->saturate = true;
1520 break;
1521
1522 case ir_unop_noise:
1523 unreachable("not reached: should be handled by lower_noise");
1524
1525 case ir_binop_add:
1526 emit(ADD(result_dst, op[0], op[1]));
1527 break;
1528 case ir_binop_sub:
1529 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1530
1531 case ir_binop_mul:
1532 if (brw->gen < 8 && ir->type->is_integer()) {
1533 /* For integer multiplication, the MUL uses the low 16 bits of one of
1534 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1535          * accumulates the contribution of the upper 16 bits of that
1536 * operand. If we can determine that one of the args is in the low
1537 * 16 bits, though, we can just emit a single MUL.
1538 */
1539 if (ir->operands[0]->is_uint16_constant()) {
1540 if (brw->gen < 7)
1541 emit(MUL(result_dst, op[0], op[1]));
1542 else
1543 emit(MUL(result_dst, op[1], op[0]));
1544 } else if (ir->operands[1]->is_uint16_constant()) {
1545 if (brw->gen < 7)
1546 emit(MUL(result_dst, op[1], op[0]));
1547 else
1548 emit(MUL(result_dst, op[0], op[1]));
1549 } else {
1550 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1551
1552 emit(MUL(acc, op[0], op[1]));
1553 emit(MACH(dst_null_d(), op[0], op[1]));
1554 emit(MOV(result_dst, src_reg(acc)));
1555 }
1556 } else {
1557 emit(MUL(result_dst, op[0], op[1]));
1558 }
1559 break;
1560 case ir_binop_imul_high: {
1561 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1562
1563 emit(MUL(acc, op[0], op[1]));
1564 emit(MACH(result_dst, op[0], op[1]));
1565 break;
1566 }
1567 case ir_binop_div:
1568 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1569 assert(ir->type->is_integer());
1570 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1571 break;
1572 case ir_binop_carry: {
1573 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1574
1575 emit(ADDC(dst_null_ud(), op[0], op[1]));
1576 emit(MOV(result_dst, src_reg(acc)));
1577 break;
1578 }
1579 case ir_binop_borrow: {
1580 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1581
1582 emit(SUBB(dst_null_ud(), op[0], op[1]));
1583 emit(MOV(result_dst, src_reg(acc)));
1584 break;
1585 }
1586 case ir_binop_mod:
1587 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1588 assert(ir->type->is_integer());
1589 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1590 break;
1591
1592 case ir_binop_less:
1593 case ir_binop_greater:
1594 case ir_binop_lequal:
1595 case ir_binop_gequal:
1596 case ir_binop_equal:
1597 case ir_binop_nequal: {
1598 if (brw->gen <= 5) {
1599 resolve_bool_comparison(ir->operands[0], &op[0]);
1600 resolve_bool_comparison(ir->operands[1], &op[1]);
1601 }
1602 emit(CMP(result_dst, op[0], op[1],
1603 brw_conditional_for_comparison(ir->operation)));
1604 break;
1605 }
1606
1607 case ir_binop_all_equal:
1608 if (brw->gen <= 5) {
1609 resolve_bool_comparison(ir->operands[0], &op[0]);
1610 resolve_bool_comparison(ir->operands[1], &op[1]);
1611 }
1612
1613 /* "==" operator producing a scalar boolean. */
1614 if (ir->operands[0]->type->is_vector() ||
1615 ir->operands[1]->type->is_vector()) {
1616 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1617 emit(MOV(result_dst, src_reg(0)));
1618 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1619 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1620 } else {
1621 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1622 }
1623 break;
1624 case ir_binop_any_nequal:
1625 if (brw->gen <= 5) {
1626 resolve_bool_comparison(ir->operands[0], &op[0]);
1627 resolve_bool_comparison(ir->operands[1], &op[1]);
1628 }
1629
1630 /* "!=" operator producing a scalar boolean. */
1631 if (ir->operands[0]->type->is_vector() ||
1632 ir->operands[1]->type->is_vector()) {
1633 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1634
1635 emit(MOV(result_dst, src_reg(0)));
1636 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1637 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1638 } else {
1639 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1640 }
1641 break;
1642
1643 case ir_unop_any:
1644 if (brw->gen <= 5) {
1645 resolve_bool_comparison(ir->operands[0], &op[0]);
1646 }
1647 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1648 emit(MOV(result_dst, src_reg(0)));
1649
1650 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1651 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1652 break;
1653
1654 case ir_binop_logic_xor:
1655 emit(XOR(result_dst, op[0], op[1]));
1656 break;
1657
1658 case ir_binop_logic_or:
1659 emit(OR(result_dst, op[0], op[1]));
1660 break;
1661
1662 case ir_binop_logic_and:
1663 emit(AND(result_dst, op[0], op[1]));
1664 break;
1665
1666 case ir_binop_dot:
1667 assert(ir->operands[0]->type->is_vector());
1668 assert(ir->operands[0]->type == ir->operands[1]->type);
1669 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1670 break;
1671
1672 case ir_unop_sqrt:
1673 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1674 break;
1675 case ir_unop_rsq:
1676 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1677 break;
1678
1679 case ir_unop_bitcast_i2f:
1680 case ir_unop_bitcast_u2f:
1681 this->result = op[0];
1682 this->result.type = BRW_REGISTER_TYPE_F;
1683 break;
1684
1685 case ir_unop_bitcast_f2i:
1686 this->result = op[0];
1687 this->result.type = BRW_REGISTER_TYPE_D;
1688 break;
1689
1690 case ir_unop_bitcast_f2u:
1691 this->result = op[0];
1692 this->result.type = BRW_REGISTER_TYPE_UD;
1693 break;
1694
1695 case ir_unop_i2f:
1696 case ir_unop_i2u:
1697 case ir_unop_u2i:
1698 case ir_unop_u2f:
1699 case ir_unop_f2i:
1700 case ir_unop_f2u:
1701 emit(MOV(result_dst, op[0]));
1702 break;
1703 case ir_unop_b2i:
1704 emit(AND(result_dst, op[0], src_reg(1)));
1705 break;
1706 case ir_unop_b2f:
1707 if (brw->gen <= 5) {
1708 resolve_bool_comparison(ir->operands[0], &op[0]);
1709 }
1710 op[0].type = BRW_REGISTER_TYPE_D;
1711 result_dst.type = BRW_REGISTER_TYPE_D;
1712 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1713 result_dst.type = BRW_REGISTER_TYPE_F;
1714 break;
1715 case ir_unop_f2b:
1716 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1717 break;
1718 case ir_unop_i2b:
1719 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1720 break;
1721
1722 case ir_unop_trunc:
1723 emit(RNDZ(result_dst, op[0]));
1724 break;
1725 case ir_unop_ceil: {
1726 src_reg tmp = src_reg(this, ir->type);
1727 op[0].negate = !op[0].negate;
1728 emit(RNDD(dst_reg(tmp), op[0]));
1729 tmp.negate = true;
1730 emit(MOV(result_dst, tmp));
1731 }
1732 break;
1733 case ir_unop_floor:
1734 inst = emit(RNDD(result_dst, op[0]));
1735 break;
1736 case ir_unop_fract:
1737 inst = emit(FRC(result_dst, op[0]));
1738 break;
1739 case ir_unop_round_even:
1740 emit(RNDE(result_dst, op[0]));
1741 break;
1742
1743 case ir_binop_min:
1744 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1745 break;
1746 case ir_binop_max:
1747 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1748 break;
1749
1750 case ir_binop_pow:
1751 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1752 break;
1753
1754 case ir_unop_bit_not:
1755 inst = emit(NOT(result_dst, op[0]));
1756 break;
1757 case ir_binop_bit_and:
1758 inst = emit(AND(result_dst, op[0], op[1]));
1759 break;
1760 case ir_binop_bit_xor:
1761 inst = emit(XOR(result_dst, op[0], op[1]));
1762 break;
1763 case ir_binop_bit_or:
1764 inst = emit(OR(result_dst, op[0], op[1]));
1765 break;
1766
1767 case ir_binop_lshift:
1768 inst = emit(SHL(result_dst, op[0], op[1]));
1769 break;
1770
1771 case ir_binop_rshift:
1772 if (ir->type->base_type == GLSL_TYPE_INT)
1773 inst = emit(ASR(result_dst, op[0], op[1]));
1774 else
1775 inst = emit(SHR(result_dst, op[0], op[1]));
1776 break;
1777
1778 case ir_binop_bfm:
1779 emit(BFI1(result_dst, op[0], op[1]));
1780 break;
1781
1782 case ir_binop_ubo_load: {
1783 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1784 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1785 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1786 src_reg offset;
1787
1788 /* Now, load the vector from that offset. */
1789 assert(ir->type->is_vector() || ir->type->is_scalar());
1790
1791 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1792 packed_consts.type = result.type;
1793 src_reg surf_index;
1794
1795 if (const_uniform_block) {
1796 /* The block index is a constant, so just emit the binding table entry
1797 * as an immediate.
1798 */
1799 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1800 const_uniform_block->value.u[0]);
1801 } else {
1802 /* The block index is not a constant. Evaluate the index expression
1803 * per-channel and add the base UBO index; the generator will select
1804 * a value from any live channel.
1805 */
1806 surf_index = src_reg(this, glsl_type::uint_type);
1807 emit(ADD(dst_reg(surf_index), op[0],
1808 src_reg(prog_data->base.binding_table.ubo_start)));
1809
1810 /* Assume this may touch any UBO. It would be nice to provide
1811 * a tighter bound, but the array information is already lowered away.
1812 */
1813 brw_mark_surface_used(&prog_data->base,
1814 prog_data->base.binding_table.ubo_start +
1815 shader_prog->NumUniformBlocks - 1);
1816 }
1817
1818 if (const_offset_ir) {
1819 if (brw->gen >= 8) {
1820 /* Store the offset in a GRF so we can send-from-GRF. */
1821 offset = src_reg(this, glsl_type::int_type);
1822 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1823 } else {
1824 /* Immediates are fine on older generations since they'll be moved
1825 * to a (potentially fake) MRF at the generator level.
1826 */
1827 offset = src_reg(const_offset / 16);
1828 }
1829 } else {
1830 offset = src_reg(this, glsl_type::uint_type);
1831 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1832 }
1833
1834 emit_pull_constant_load_reg(dst_reg(packed_consts),
1835 surf_index,
1836 offset,
1837 NULL, NULL /* before_block/inst */);
1838
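      /* The pull constant load always fetches an aligned 16-byte vec4;
       * adjust the swizzle so the value is read starting at the requested
       * component within that vec4.
       */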
1839 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1840 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1841 const_offset % 16 / 4,
1842 const_offset % 16 / 4,
1843 const_offset % 16 / 4);
1844
1845 /* UBO bools are any nonzero int. We need to convert them to use the
1846 * value of true stored in ctx->Const.UniformBooleanTrue.
1847 */
1848 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1849 emit(CMP(result_dst, packed_consts, src_reg(0u),
1850 BRW_CONDITIONAL_NZ));
1851 } else {
1852 emit(MOV(result_dst, packed_consts));
1853 }
1854 break;
1855 }
1856
1857 case ir_binop_vector_extract:
1858 unreachable("should have been lowered by vec_index_to_cond_assign");
1859
1860 case ir_triop_fma:
1861 op[0] = fix_3src_operand(op[0]);
1862 op[1] = fix_3src_operand(op[1]);
1863 op[2] = fix_3src_operand(op[2]);
1864 /* Note that the instruction's argument order is reversed from GLSL
1865 * and the IR.
1866 */
1867 emit(MAD(result_dst, op[2], op[1], op[0]));
1868 break;
1869
1870 case ir_triop_lrp:
1871 emit_lrp(result_dst, op[0], op[1], op[2]);
1872 break;
1873
1874 case ir_triop_csel:
1875 unreachable("already handled above");
1876 break;
1877
1878 case ir_triop_bfi:
1879 op[0] = fix_3src_operand(op[0]);
1880 op[1] = fix_3src_operand(op[1]);
1881 op[2] = fix_3src_operand(op[2]);
1882 emit(BFI2(result_dst, op[0], op[1], op[2]));
1883 break;
1884
1885 case ir_triop_bitfield_extract:
1886 op[0] = fix_3src_operand(op[0]);
1887 op[1] = fix_3src_operand(op[1]);
1888 op[2] = fix_3src_operand(op[2]);
1889 /* Note that the instruction's argument order is reversed from GLSL
1890 * and the IR.
1891 */
1892 emit(BFE(result_dst, op[2], op[1], op[0]));
1893 break;
1894
1895 case ir_triop_vector_insert:
1896 unreachable("should have been lowered by lower_vector_insert");
1897
1898 case ir_quadop_bitfield_insert:
1899 unreachable("not reached: should be handled by "
1900 "bitfield_insert_to_bfm_bfi\n");
1901
1902 case ir_quadop_vector:
1903 unreachable("not reached: should be handled by lower_quadop_vector");
1904
1905 case ir_unop_pack_half_2x16:
1906 emit_pack_half_2x16(result_dst, op[0]);
1907 break;
1908 case ir_unop_unpack_half_2x16:
1909 emit_unpack_half_2x16(result_dst, op[0]);
1910 break;
1911 case ir_unop_unpack_unorm_4x8:
1912 emit_unpack_unorm_4x8(result_dst, op[0]);
1913 break;
1914 case ir_unop_unpack_snorm_4x8:
1915 emit_unpack_snorm_4x8(result_dst, op[0]);
1916 break;
1917 case ir_unop_pack_unorm_4x8:
1918 emit_pack_unorm_4x8(result_dst, op[0]);
1919 break;
1920 case ir_unop_pack_snorm_4x8:
1921 emit_pack_snorm_4x8(result_dst, op[0]);
1922 break;
1923 case ir_unop_pack_snorm_2x16:
1924 case ir_unop_pack_unorm_2x16:
1925 case ir_unop_unpack_snorm_2x16:
1926 case ir_unop_unpack_unorm_2x16:
1927 unreachable("not reached: should be handled by lower_packing_builtins");
1928 case ir_unop_unpack_half_2x16_split_x:
1929 case ir_unop_unpack_half_2x16_split_y:
1930 case ir_binop_pack_half_2x16_split:
1931 case ir_unop_interpolate_at_centroid:
1932 case ir_binop_interpolate_at_sample:
1933 case ir_binop_interpolate_at_offset:
1934 unreachable("not reached: should not occur in vertex shader");
1935 case ir_binop_ldexp:
1936 unreachable("not reached: should be handled by ldexp_to_arith()");
1937 case ir_unop_d2f:
1938 case ir_unop_f2d:
1939 case ir_unop_d2i:
1940 case ir_unop_i2d:
1941 case ir_unop_d2u:
1942 case ir_unop_u2d:
1943 case ir_unop_d2b:
1944 case ir_unop_pack_double_2x32:
1945 case ir_unop_unpack_double_2x32:
1946 case ir_unop_frexp_sig:
1947 case ir_unop_frexp_exp:
1948 unreachable("fp64 todo");
1949 }
1950 }
1951
1952
1953 void
1954 vec4_visitor::visit(ir_swizzle *ir)
1955 {
1956 /* Note that this is only swizzles in expressions, not those on the left
1957 * hand side of an assignment, which do write masking. See ir_assignment
1958 * for that.
1959 */
1960 const unsigned swz = brw_compose_swizzle(
1961 brw_swizzle_for_size(ir->type->vector_elements),
1962 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1963
1964 ir->val->accept(this);
1965 this->result = swizzle(this->result, swz);
1966 }
1967
1968 void
1969 vec4_visitor::visit(ir_dereference_variable *ir)
1970 {
1971 const struct glsl_type *type = ir->type;
1972 dst_reg *reg = variable_storage(ir->var);
1973
1974 if (!reg) {
1975 fail("Failed to find variable storage for %s\n", ir->var->name);
1976 this->result = src_reg(brw_null_reg());
1977 return;
1978 }
1979
1980 this->result = src_reg(*reg);
1981
1982 /* System values get their swizzle from the dst_reg writemask */
1983 if (ir->var->data.mode == ir_var_system_value)
1984 return;
1985
1986 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1987 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
1988 }
1989
1990
1991 int
1992 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1993 {
1994 /* Under normal circumstances array elements are stored consecutively, so
1995 * the stride is equal to the size of the array element.
1996 */
1997 return type_size(ir->type);
1998 }
1999
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_array *ir)
2003 {
2004 ir_constant *constant_index;
2005 src_reg src;
2006 int array_stride = compute_array_stride(ir);
2007
2008 constant_index = ir->array_index->constant_expression_value();
2009
2010 ir->array->accept(this);
2011 src = this->result;
2012
2013 if (constant_index) {
2014 src.reg_offset += constant_index->value.i[0] * array_stride;
2015 } else {
2016 /* Variable index array dereference. It consumes the "vec4" of the
2017 * base of the array plus an index that offsets the Mesa register
2018 * index.
2019 */
2020 ir->array_index->accept(this);
2021
2022 src_reg index_reg;
2023
2024 if (array_stride == 1) {
2025 index_reg = this->result;
2026 } else {
2027 index_reg = src_reg(this, glsl_type::int_type);
2028
2029 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2030 }
2031
2032 if (src.reladdr) {
2033 src_reg temp = src_reg(this, glsl_type::int_type);
2034
2035 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2036
2037 index_reg = temp;
2038 }
2039
2040 src.reladdr = ralloc(mem_ctx, src_reg);
2041 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2042 }
2043
2044 /* If the type is smaller than a vec4, replicate the last channel out. */
2045 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2046 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2047 else
2048 src.swizzle = BRW_SWIZZLE_NOOP;
2049 src.type = brw_type_for_base_type(ir->type);
2050
2051 this->result = src;
2052 }
2053
2054 void
2055 vec4_visitor::visit(ir_dereference_record *ir)
2056 {
2057 unsigned int i;
2058 const glsl_type *struct_type = ir->record->type;
2059 int offset = 0;
2060
2061 ir->record->accept(this);
2062
2063 for (i = 0; i < struct_type->length; i++) {
2064 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2065 break;
2066 offset += type_size(struct_type->fields.structure[i].type);
2067 }
2068
2069 /* If the type is smaller than a vec4, replicate the last channel out. */
2070 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2071 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2072 else
2073 this->result.swizzle = BRW_SWIZZLE_NOOP;
2074 this->result.type = brw_type_for_base_type(ir->type);
2075
2076 this->result.reg_offset += offset;
2077 }
2078
2079 /**
2080 * We want to be careful in assignment setup to hit the actual storage
2081 * instead of potentially using a temporary like we might with the
2082 * ir_dereference handler.
2083 */
2084 static dst_reg
2085 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2086 {
2087 /* The LHS must be a dereference. If the LHS is a variable indexed array
2088 * access of a vector, it must be separated into a series of conditional moves
2089 * before reaching this point (see ir_vec_index_to_cond_assign).
2090 */
2091 assert(ir->as_dereference());
2092 ir_dereference_array *deref_array = ir->as_dereference_array();
2093 if (deref_array) {
2094 assert(!deref_array->array->type->is_vector());
2095 }
2096
2097 /* Use the rvalue deref handler for the most part. We'll ignore
2098 * swizzles in it and write swizzles using writemask, though.
2099 */
2100 ir->accept(v);
2101 return dst_reg(v->result);
2102 }
2103
2104 void
2105 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2106 const struct glsl_type *type,
2107 enum brw_predicate predicate)
2108 {
2109 if (type->base_type == GLSL_TYPE_STRUCT) {
2110 for (unsigned int i = 0; i < type->length; i++) {
2111 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2112 }
2113 return;
2114 }
2115
2116 if (type->is_array()) {
2117 for (unsigned int i = 0; i < type->length; i++) {
2118 emit_block_move(dst, src, type->fields.array, predicate);
2119 }
2120 return;
2121 }
2122
2123 if (type->is_matrix()) {
2124 const struct glsl_type *vec_type;
2125
2126 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2127 type->vector_elements, 1);
2128
2129 for (int i = 0; i < type->matrix_columns; i++) {
2130 emit_block_move(dst, src, vec_type, predicate);
2131 }
2132 return;
2133 }
2134
2135 assert(type->is_scalar() || type->is_vector());
2136
2137 dst->type = brw_type_for_base_type(type);
2138 src->type = dst->type;
2139
2140 dst->writemask = (1 << type->vector_elements) - 1;
2141
2142 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2143
2144 vec4_instruction *inst = emit(MOV(*dst, *src));
2145 inst->predicate = predicate;
2146
2147 dst->reg_offset++;
2148 src->reg_offset++;
2149 }
2150
2151
2152 /* If the RHS processing resulted in an instruction generating a
2153 * temporary value, and it would be easy to rewrite the instruction to
2154 * generate its result right into the LHS instead, do so. This ends
2155 * up reliably removing instructions where it can be tricky to do so
2156 * later without real UD chain information.
2157 */
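/* A sketch of the intent (hypothetical IR, for illustration): for an
 * assignment like "v.xy = a + b", the RHS visit emits "ADD tmp, a, b" and
 * the assignment would then add "MOV v.xy, tmp"; rewriting the ADD's
 * destination to v.xy lets us drop the MOV entirely.
 */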
2158 bool
2159 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2160 dst_reg dst,
2161 src_reg src,
2162 vec4_instruction *pre_rhs_inst,
2163 vec4_instruction *last_rhs_inst)
2164 {
2165 /* This could be supported, but it would take more smarts. */
2166 if (ir->condition)
2167 return false;
2168
2169 if (pre_rhs_inst == last_rhs_inst)
2170 return false; /* No instructions generated to work with. */
2171
2172 /* Make sure the last instruction generated our source reg. */
2173 if (src.file != GRF ||
2174 src.file != last_rhs_inst->dst.file ||
2175 src.reg != last_rhs_inst->dst.reg ||
2176 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2177 src.reladdr ||
2178 src.abs ||
2179 src.negate ||
2180 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2181 return false;
2182
2183 /* Check that the last instruction fully initialized the channels
2184 * we want to use, in the order we want to use them. We could
2185 * potentially reswizzle the operands of many instructions so that
2186 * we could handle out of order channels, but don't yet.
2187 */
2188
2189 for (unsigned i = 0; i < 4; i++) {
2190 if (dst.writemask & (1 << i)) {
2191 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2192 return false;
2193
2194 if (BRW_GET_SWZ(src.swizzle, i) != i)
2195 return false;
2196 }
2197 }
2198
2199 /* Success! Rewrite the instruction. */
2200 last_rhs_inst->dst.file = dst.file;
2201 last_rhs_inst->dst.reg = dst.reg;
2202 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2203 last_rhs_inst->dst.reladdr = dst.reladdr;
2204 last_rhs_inst->dst.writemask &= dst.writemask;
2205
2206 return true;
2207 }
2208
2209 void
2210 vec4_visitor::visit(ir_assignment *ir)
2211 {
2212 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2213 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2214
2215 if (!ir->lhs->type->is_scalar() &&
2216 !ir->lhs->type->is_vector()) {
2217 ir->rhs->accept(this);
2218 src_reg src = this->result;
2219
2220 if (ir->condition) {
2221 emit_bool_to_cond_code(ir->condition, &predicate);
2222 }
2223
2224 /* emit_block_move doesn't account for swizzles in the source register.
2225 * This should be ok, since the source register is a structure or an
2226 * array, and those can't be swizzled. But double-check to be sure.
2227 */
2228 assert(src.swizzle ==
2229 (ir->rhs->type->is_matrix()
2230 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2231 : BRW_SWIZZLE_NOOP));
2232
2233 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2234 return;
2235 }
2236
2237 /* Now we're down to just a scalar/vector with writemasks. */
2238 int i;
2239
2240 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2241 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2242
2243 ir->rhs->accept(this);
2244
2245 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2246
2247 int swizzles[4];
2248 int src_chan = 0;
2249
2250 assert(ir->lhs->type->is_vector() ||
2251 ir->lhs->type->is_scalar());
2252 dst.writemask = ir->write_mask;
2253
2254 /* Swizzle a small RHS vector into the channels being written.
2255 *
2256 * glsl ir treats write_mask as dictating how many channels are
2257 * present on the RHS while in our instructions we need to make
2258 * those channels appear in the slots of the vec4 they're written to.
2259 */
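/* For example (a sketch): with write_mask = xz, the loop below produces
 * swizzles[] = {0, 0, 1, 0}, so RHS channel x feeds dst.x and RHS channel y
 * feeds dst.z; the unwritten channel fields default to x and are ignored.
 */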
2260 for (int i = 0; i < 4; i++)
2261 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2262
2263 src_reg src = swizzle(this->result,
2264 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2265 swizzles[2], swizzles[3]));
2266
2267 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2268 return;
2269 }
2270
2271 if (ir->condition) {
2272 emit_bool_to_cond_code(ir->condition, &predicate);
2273 }
2274
2275 for (i = 0; i < type_size(ir->lhs->type); i++) {
2276 vec4_instruction *inst = emit(MOV(dst, src));
2277 inst->predicate = predicate;
2278
2279 dst.reg_offset++;
2280 src.reg_offset++;
2281 }
2282 }
2283
2284 void
2285 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2286 {
2287 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2288 foreach_in_list(ir_constant, field_value, &ir->components) {
2289 emit_constant_values(dst, field_value);
2290 }
2291 return;
2292 }
2293
2294 if (ir->type->is_array()) {
2295 for (unsigned int i = 0; i < ir->type->length; i++) {
2296 emit_constant_values(dst, ir->array_elements[i]);
2297 }
2298 return;
2299 }
2300
2301 if (ir->type->is_matrix()) {
2302 for (int i = 0; i < ir->type->matrix_columns; i++) {
2303 float *vec = &ir->value.f[i * ir->type->vector_elements];
2304
2305 for (int j = 0; j < ir->type->vector_elements; j++) {
2306 dst->writemask = 1 << j;
2307 dst->type = BRW_REGISTER_TYPE_F;
2308
2309 emit(MOV(*dst, src_reg(vec[j])));
2310 }
2311 dst->reg_offset++;
2312 }
2313 return;
2314 }
2315
2316 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2317
2318 for (int i = 0; i < ir->type->vector_elements; i++) {
2319 if (!(remaining_writemask & (1 << i)))
2320 continue;
2321
2322 dst->writemask = 1 << i;
2323 dst->type = brw_type_for_base_type(ir->type);
2324
2325 /* Find other components that match the one we're about to
2326 * write. Emits fewer instructions for things like vec4(0.5,
2327 * 1.5, 1.5, 1.5).
2328 */
2329 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2330 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2331 if (ir->value.b[i] == ir->value.b[j])
2332 dst->writemask |= (1 << j);
2333 } else {
2334 /* u, i, and f storage all line up, so no need for a
2335 * switch case for comparing each type.
2336 */
2337 if (ir->value.u[i] == ir->value.u[j])
2338 dst->writemask |= (1 << j);
2339 }
2340 }
2341
2342 switch (ir->type->base_type) {
2343 case GLSL_TYPE_FLOAT:
2344 emit(MOV(*dst, src_reg(ir->value.f[i])));
2345 break;
2346 case GLSL_TYPE_INT:
2347 emit(MOV(*dst, src_reg(ir->value.i[i])));
2348 break;
2349 case GLSL_TYPE_UINT:
2350 emit(MOV(*dst, src_reg(ir->value.u[i])));
2351 break;
2352 case GLSL_TYPE_BOOL:
2353 emit(MOV(*dst,
2354 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2355 : 0)));
2356 break;
2357 default:
2358 unreachable("Non-float/uint/int/bool constant");
2359 }
2360
2361 remaining_writemask &= ~dst->writemask;
2362 }
2363 dst->reg_offset++;
2364 }
2365
2366 void
2367 vec4_visitor::visit(ir_constant *ir)
2368 {
2369 dst_reg dst = dst_reg(this, ir->type);
2370 this->result = src_reg(dst);
2371
2372 emit_constant_values(&dst, ir);
2373 }
2374
2375 void
2376 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2377 {
2378 ir_dereference *deref = static_cast<ir_dereference *>(
2379 ir->actual_parameters.get_head());
2380 ir_variable *location = deref->variable_referenced();
2381 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2382 location->data.binding);
2383
2384 /* Calculate the surface offset */
2385 src_reg offset(this, glsl_type::uint_type);
2386 ir_dereference_array *deref_array = deref->as_dereference_array();
2387 if (deref_array) {
2388 deref_array->array_index->accept(this);
2389
2390 src_reg tmp(this, glsl_type::uint_type);
2391 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2392 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2393 } else {
2394 offset = location->data.atomic.offset;
2395 }
2396
2397 /* Emit the appropriate machine instruction */
2398 const char *callee = ir->callee->function_name();
2399 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2400
2401 if (!strcmp("__intrinsic_atomic_read", callee)) {
2402 emit_untyped_surface_read(surf_index, dst, offset);
2403
2404 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2405 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2406 src_reg(), src_reg());
2407
2408 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2409 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2410 src_reg(), src_reg());
2411 }
2412 }
2413
2414 void
2415 vec4_visitor::visit(ir_call *ir)
2416 {
2417 const char *callee = ir->callee->function_name();
2418
2419 if (!strcmp("__intrinsic_atomic_read", callee) ||
2420 !strcmp("__intrinsic_atomic_increment", callee) ||
2421 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2422 visit_atomic_counter_intrinsic(ir);
2423 } else {
2424 unreachable("Unsupported intrinsic.");
2425 }
2426 }
2427
2428 src_reg
2429 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2430 {
2431 vec4_instruction *inst =
2432 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2433 dst_reg(this, glsl_type::uvec4_type));
2434 inst->base_mrf = 2;
2435 inst->mlen = 1;
2436 inst->src[1] = sampler;
2437
2438 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2439 int param_base = inst->base_mrf;
2440 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2441 int zero_mask = 0xf & ~coord_mask;
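/* Write the coordinate components into the payload and zero-fill whatever
 * channels the coordinate doesn't cover, so the whole vec4 is defined.
 */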
2442
2443 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2444 coordinate));
2445
2446 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2447 src_reg(0)));
2448
2449 emit(inst);
2450 return src_reg(inst->dst);
2451 }
2452
2453 static bool
2454 is_high_sampler(struct brw_context *brw, src_reg sampler)
2455 {
2456 if (brw->gen < 8 && !brw->is_haswell)
2457 return false;
2458
2459 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2460 }
2461
2462 void
2463 vec4_visitor::visit(ir_texture *ir)
2464 {
2465 uint32_t sampler =
2466 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2467
2468 ir_rvalue *nonconst_sampler_index =
2469 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2470
2471 /* Handle non-constant sampler array indexing */
2472 src_reg sampler_reg;
2473 if (nonconst_sampler_index) {
2474 /* The highest sampler which may be used by this operation is
2475 * the last element of the array. Mark it here, because the generator
2476 * doesn't have enough information to determine the bound.
2477 */
2478 uint32_t array_size = ir->sampler->as_dereference_array()
2479 ->array->type->array_size();
2480
2481 uint32_t max_used = sampler + array_size - 1;
2482 if (ir->op == ir_tg4 && brw->gen < 8) {
2483 max_used += prog_data->base.binding_table.gather_texture_start;
2484 } else {
2485 max_used += prog_data->base.binding_table.texture_start;
2486 }
2487
2488 brw_mark_surface_used(&prog_data->base, max_used);
2489
2490 /* Emit code to evaluate the actual indexing expression */
2491 nonconst_sampler_index->accept(this);
2492 dst_reg temp(this, glsl_type::uint_type);
2493 emit(ADD(temp, this->result, src_reg(sampler)))
2494 ->force_writemask_all = true;
2495 sampler_reg = src_reg(temp);
2496 } else {
2497 /* Single sampler, or constant array index; the indexing expression
2498 * is just an immediate.
2499 */
2500 sampler_reg = src_reg(sampler);
2501 }
2502
2503 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2504 * emitting anything other than setting up the constant result.
2505 */
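/* For example, a channel swizzled to ONE makes the gather return 1.0 (or 1
 * for integer formats) for all four texels, so a single MOV of the constant
 * is enough.
 */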
2506 if (ir->op == ir_tg4) {
2507 ir_constant *chan = ir->lod_info.component->as_constant();
2508 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2509 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2510 dst_reg result(this, ir->type);
2511 this->result = src_reg(result);
2512 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2513 return;
2514 }
2515 }
2516
2517 /* Should be lowered by do_lower_texture_projection */
2518 assert(!ir->projector);
2519
2520 /* Should be lowered */
2521 assert(!ir->offset || !ir->offset->type->is_array());
2522
2523 /* Generate code to compute all the subexpression trees. This has to be
2524 * done before loading any values into MRFs for the sampler message since
2525 * generating these values may involve SEND messages that need the MRFs.
2526 */
2527 src_reg coordinate;
2528 if (ir->coordinate) {
2529 ir->coordinate->accept(this);
2530 coordinate = this->result;
2531 }
2532
2533 src_reg shadow_comparitor;
2534 if (ir->shadow_comparitor) {
2535 ir->shadow_comparitor->accept(this);
2536 shadow_comparitor = this->result;
2537 }
2538
2539 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2540 src_reg offset_value;
2541 if (has_nonconstant_offset) {
2542 ir->offset->accept(this);
2543 offset_value = src_reg(this->result);
2544 }
2545
2546 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2547 src_reg lod, dPdx, dPdy, sample_index, mcs;
2548 switch (ir->op) {
2549 case ir_tex:
2550 lod = src_reg(0.0f);
2551 lod_type = glsl_type::float_type;
2552 break;
2553 case ir_txf:
2554 case ir_txl:
2555 case ir_txs:
2556 ir->lod_info.lod->accept(this);
2557 lod = this->result;
2558 lod_type = ir->lod_info.lod->type;
2559 break;
2560 case ir_query_levels:
2561 lod = src_reg(0);
2562 lod_type = glsl_type::int_type;
2563 break;
2564 case ir_txf_ms:
2565 ir->lod_info.sample_index->accept(this);
2566 sample_index = this->result;
2567 sample_index_type = ir->lod_info.sample_index->type;
2568
2569 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2570 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2571 else
2572 mcs = src_reg(0u);
2573 break;
2574 case ir_txd:
2575 ir->lod_info.grad.dPdx->accept(this);
2576 dPdx = this->result;
2577
2578 ir->lod_info.grad.dPdy->accept(this);
2579 dPdy = this->result;
2580
2581 lod_type = ir->lod_info.grad.dPdx->type;
2582 break;
2583 case ir_txb:
2584 case ir_lod:
2585 case ir_tg4:
2586 break;
2587 }
2588
2589 enum opcode opcode;
2590 switch (ir->op) {
2591 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2592 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2593 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2594 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2595 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2596 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2597 case ir_tg4: opcode = has_nonconstant_offset
2598 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2599 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2600 case ir_txb:
2601 unreachable("TXB is not valid for vertex shaders.");
2602 case ir_lod:
2603 unreachable("LOD is not valid for vertex shaders.");
2604 default:
2605 unreachable("Unrecognized tex op");
2606 }
2607
2608 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2609 opcode, dst_reg(this, ir->type));
2610
2611 if (ir->offset != NULL && !has_nonconstant_offset) {
2612 inst->offset =
2613 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2614 ir->offset->type->vector_elements);
2615 }
2616
2617 /* Stuff the channel select bits in the top of the texture offset */
2618 if (ir->op == ir_tg4)
2619 inst->offset |= gather_channel(ir, sampler) << 16;
2620
2621 /* The message header is necessary for:
2622 * - Gen4 (always)
2623 * - Gen9+ for selecting SIMD4x2
2624 * - Texel offsets
2625 * - Gather channel selection
2626 * - Sampler indices too large to fit in a 4-bit value.
2627 */
2628 inst->header_present =
2629 brw->gen < 5 || brw->gen >= 9 ||
2630 inst->offset != 0 || ir->op == ir_tg4 ||
2631 is_high_sampler(brw, sampler_reg);
2632 inst->base_mrf = 2;
2633 inst->mlen = inst->header_present + 1; /* always at least one */
2634 inst->dst.writemask = WRITEMASK_XYZW;
2635 inst->shadow_compare = ir->shadow_comparitor != NULL;
2636
2637 inst->src[1] = sampler_reg;
2638
2639 /* MRF for the first parameter */
2640 int param_base = inst->base_mrf + inst->header_present;
2641
2642 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2643 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2644 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2645 } else {
2646 /* Load the coordinate */
2647 /* FINISHME: gl_clamp_mask and saturate */
2648 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2649 int zero_mask = 0xf & ~coord_mask;
2650
2651 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2652 coordinate));
2653
2654 if (zero_mask != 0) {
2655 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2656 src_reg(0)));
2657 }
2658 /* Load the shadow comparitor */
2659 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2660 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2661 WRITEMASK_X),
2662 shadow_comparitor));
2663 inst->mlen++;
2664 }
2665
2666 /* Load the LOD info */
2667 if (ir->op == ir_tex || ir->op == ir_txl) {
2668 int mrf, writemask;
2669 if (brw->gen >= 5) {
2670 mrf = param_base + 1;
2671 if (ir->shadow_comparitor) {
2672 writemask = WRITEMASK_Y;
2673 /* mlen already incremented */
2674 } else {
2675 writemask = WRITEMASK_X;
2676 inst->mlen++;
2677 }
2678 } else /* brw->gen == 4 */ {
2679 mrf = param_base;
2680 writemask = WRITEMASK_W;
2681 }
2682 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2683 } else if (ir->op == ir_txf) {
2684 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2685 } else if (ir->op == ir_txf_ms) {
2686 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2687 sample_index));
2688 if (brw->gen >= 7) {
2689 /* MCS data is in the first channel of `mcs`, but we need to get it into
2690 * the .y channel of the second vec4 of params, so replicate .x across
2691 * the whole vec4 and then mask off everything except .y
2692 */
2693 mcs.swizzle = BRW_SWIZZLE_XXXX;
2694 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2695 mcs));
2696 }
2697 inst->mlen++;
2698 } else if (ir->op == ir_txd) {
2699 const glsl_type *type = lod_type;
2700
2701 if (brw->gen >= 5) {
2702 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2703 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2704 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2705 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2706 inst->mlen++;
2707
2708 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2709 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2710 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2711 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2712 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2713 inst->mlen++;
2714
2715 if (ir->shadow_comparitor) {
2716 emit(MOV(dst_reg(MRF, param_base + 2,
2717 ir->shadow_comparitor->type, WRITEMASK_Z),
2718 shadow_comparitor));
2719 }
2720 }
2721 } else /* brw->gen == 4 */ {
2722 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2723 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2724 inst->mlen += 2;
2725 }
2726 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2727 if (ir->shadow_comparitor) {
2728 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2729 shadow_comparitor));
2730 }
2731
2732 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2733 offset_value));
2734 inst->mlen++;
2735 }
2736 }
2737
2738 emit(inst);
2739
2740 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2741 * faces * layers, but the spec requires just layers.
2742 */
2743 if (ir->op == ir_txs) {
2744 glsl_type const *type = ir->sampler->type;
2745 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2746 type->sampler_array) {
2747 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2748 writemask(inst->dst, WRITEMASK_Z),
2749 src_reg(inst->dst), src_reg(6));
2750 }
2751 }
2752
2753 if (brw->gen == 6 && ir->op == ir_tg4) {
2754 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2755 }
2756
2757 swizzle_result(ir, src_reg(inst->dst), sampler);
2758 }
2759
2760 /**
2761 * Apply workarounds for Gen6 gather with UINT/SINT
2762 */
2763 void
2764 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2765 {
2766 if (!wa)
2767 return;
2768
2769 int width = (wa & WA_8BIT) ? 8 : 16;
2770 dst_reg dst_f = dst;
2771 dst_f.type = BRW_REGISTER_TYPE_F;
2772
2773 /* Convert from UNORM to UINT */
2774 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2775 emit(MOV(dst, src_reg(dst_f)));
2776
2777 if (wa & WA_SIGN) {
2778 /* Reinterpret the UINT value as a signed INT value by
2779 * shifting the sign bit into place, then shifting back
2780 * preserving sign.
2781 */
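/* For example (a sketch): with an 8-bit workaround, SHL by 24 moves bit 7
 * up into bit 31 and ASR by 24 replicates it back down, sign-extending the
 * 8-bit value to a full 32-bit integer.
 */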
2782 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2783 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2784 }
2785 }
2786
2787 /**
2788 * Set up the gather channel based on the swizzle, for gather4.
2789 */
2790 uint32_t
2791 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2792 {
2793 ir_constant *chan = ir->lod_info.component->as_constant();
2794 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2795 switch (swiz) {
2796 case SWIZZLE_X: return 0;
2797 case SWIZZLE_Y:
2798 /* gather4 sampler is broken for green channel on RG32F --
2799 * we must ask for blue instead.
2800 */
2801 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2802 return 2;
2803 return 1;
2804 case SWIZZLE_Z: return 2;
2805 case SWIZZLE_W: return 3;
2806 default:
2807 unreachable("Not reached"); /* zero, one swizzles handled already */
2808 }
2809 }
2810
2811 void
2812 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2813 {
2814 int s = key->tex.swizzles[sampler];
2815
2816 this->result = src_reg(this, ir->type);
2817 dst_reg swizzled_result(this->result);
2818
2819 if (ir->op == ir_query_levels) {
2820 /* # levels is in .w */
2821 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2822 emit(MOV(swizzled_result, orig_val));
2823 return;
2824 }
2825
2826 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2827 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2828 emit(MOV(swizzled_result, orig_val));
2829 return;
2830 }
2831
2832
2833 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2834 int swizzle[4] = {0};
2835
2836 for (int i = 0; i < 4; i++) {
2837 switch (GET_SWZ(s, i)) {
2838 case SWIZZLE_ZERO:
2839 zero_mask |= (1 << i);
2840 break;
2841 case SWIZZLE_ONE:
2842 one_mask |= (1 << i);
2843 break;
2844 default:
2845 copy_mask |= (1 << i);
2846 swizzle[i] = GET_SWZ(s, i);
2847 break;
2848 }
2849 }
2850
2851 if (copy_mask) {
2852 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2853 swizzled_result.writemask = copy_mask;
2854 emit(MOV(swizzled_result, orig_val));
2855 }
2856
2857 if (zero_mask) {
2858 swizzled_result.writemask = zero_mask;
2859 emit(MOV(swizzled_result, src_reg(0.0f)));
2860 }
2861
2862 if (one_mask) {
2863 swizzled_result.writemask = one_mask;
2864 emit(MOV(swizzled_result, src_reg(1.0f)));
2865 }
2866 }
2867
2868 void
2869 vec4_visitor::visit(ir_return *)
2870 {
2871 unreachable("not reached");
2872 }
2873
2874 void
2875 vec4_visitor::visit(ir_discard *)
2876 {
2877 unreachable("not reached");
2878 }
2879
2880 void
2881 vec4_visitor::visit(ir_if *ir)
2882 {
2883 /* Don't point the annotation at the if statement, because then it plus
2884 * the then and else blocks get printed.
2885 */
2886 this->base_ir = ir->condition;
2887
2888 if (brw->gen == 6) {
2889 emit_if_gen6(ir);
2890 } else {
2891 enum brw_predicate predicate;
2892 emit_bool_to_cond_code(ir->condition, &predicate);
2893 emit(IF(predicate));
2894 }
2895
2896 visit_instructions(&ir->then_instructions);
2897
2898 if (!ir->else_instructions.is_empty()) {
2899 this->base_ir = ir->condition;
2900 emit(BRW_OPCODE_ELSE);
2901
2902 visit_instructions(&ir->else_instructions);
2903 }
2904
2905 this->base_ir = ir->condition;
2906 emit(BRW_OPCODE_ENDIF);
2907 }
2908
2909 void
2910 vec4_visitor::visit(ir_emit_vertex *)
2911 {
2912 unreachable("not reached");
2913 }
2914
2915 void
2916 vec4_visitor::visit(ir_end_primitive *)
2917 {
2918 unreachable("not reached");
2919 }
2920
2921 void
2922 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2923 dst_reg dst, src_reg offset,
2924 src_reg src0, src_reg src1)
2925 {
2926 unsigned mlen = 0;
2927
2928 /* Set the atomic operation offset. */
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2930 mlen++;
2931
2932 /* Set the atomic operation arguments. */
2933 if (src0.file != BAD_FILE) {
2934 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2935 mlen++;
2936 }
2937
2938 if (src1.file != BAD_FILE) {
2939 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2940 mlen++;
2941 }
2942
2943 /* Emit the instruction. Note that this maps to the normal SIMD8
2944 * untyped atomic message on Ivy Bridge, but that's OK because
2945 * unused channels will be masked out.
2946 */
2947 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2948 src_reg(atomic_op), src_reg(surf_index));
2949 inst->base_mrf = 0;
2950 inst->mlen = mlen;
2951 }
2952
2953 void
2954 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2955 src_reg offset)
2956 {
2957 /* Set the surface read offset. */
2958 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2959
2960 /* Emit the instruction. Note that this maps to the normal SIMD8
2961 * untyped surface read message, but that's OK because unused
2962 * channels will be masked out.
2963 */
2964 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2965 dst, src_reg(surf_index));
2966 inst->base_mrf = 0;
2967 inst->mlen = 1;
2968 }
2969
2970 void
2971 vec4_visitor::emit_ndc_computation()
2972 {
2973 /* Get the position */
2974 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2975
2976 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2977 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2978 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2979
2980 current_annotation = "NDC";
2981 dst_reg ndc_w = ndc;
2982 ndc_w.writemask = WRITEMASK_W;
2983 src_reg pos_w = pos;
2984 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2985 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2986
2987 dst_reg ndc_xyz = ndc;
2988 ndc_xyz.writemask = WRITEMASK_XYZ;
2989
2990 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2991 }
2992
2993 void
2994 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2995 {
2996 if (brw->gen < 6 &&
2997 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2998 key->userclip_active || brw->has_negative_rhw_bug)) {
2999 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3000 dst_reg header1_w = header1;
3001 header1_w.writemask = WRITEMASK_W;
3002
3003 emit(MOV(header1, 0u));
3004
3005 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3006 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3007
3008 current_annotation = "Point size";
3009 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3010 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3011 }
3012
3013 if (key->userclip_active) {
3014 current_annotation = "Clipping flags";
3015 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3016 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3017
3018 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3019 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3020 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3021
3022 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3023 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3024 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3025 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3026 }
3027
3028 /* i965 clipping workaround:
3029 * 1) Test for -ve rhw
3030 * 2) If set,
3031 * set ndc = (0,0,0,0)
3032 * set ucp[6] = 1
3033 *
3034 * Later, clipping will detect ucp[6] and ensure the primitive is
3035 * clipped against all fixed planes.
3036 */
3037 if (brw->has_negative_rhw_bug) {
3038 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3039 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3040 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3041 vec4_instruction *inst;
3042 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3043 inst->predicate = BRW_PREDICATE_NORMAL;
3044 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3045 inst->predicate = BRW_PREDICATE_NORMAL;
3046 }
3047
3048 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3049 } else if (brw->gen < 6) {
3050 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3051 } else {
3052 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3053 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3054 dst_reg reg_w = reg;
3055 reg_w.writemask = WRITEMASK_W;
3056 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3057 }
3058 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3059 dst_reg reg_y = reg;
3060 reg_y.writemask = WRITEMASK_Y;
3061 reg_y.type = BRW_REGISTER_TYPE_D;
3062 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3063 }
3064 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3065 dst_reg reg_z = reg;
3066 reg_z.writemask = WRITEMASK_Z;
3067 reg_z.type = BRW_REGISTER_TYPE_D;
3068 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3069 }
3070 }
3071 }
3072
3073 void
3074 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3075 {
3076 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3077 *
3078 * "If a linked set of shaders forming the vertex stage contains no
3079 * static write to gl_ClipVertex or gl_ClipDistance, but the
3080 * application has requested clipping against user clip planes through
3081 * the API, then the coordinate written to gl_Position is used for
3082 * comparison against the user clip planes."
3083 *
3084 * This function is only called if the shader didn't write to
3085 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3086 * if the user wrote to it; otherwise we use gl_Position.
3087 */
3088 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3089 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3090 clip_vertex = VARYING_SLOT_POS;
3091 }
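/* Each enabled user clip plane gets a DP4 of the clip vertex against the
 * plane equation; the signed distance lands in the single channel of the
 * clip distance slot selected by the writemask below.
 */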
3092
3093 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3094 ++i) {
3095 reg.writemask = 1 << i;
3096 emit(DP4(reg,
3097 src_reg(output_reg[clip_vertex]),
3098 src_reg(this->userplane[i + offset])));
3099 }
3100 }
3101
3102 vec4_instruction *
3103 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3104 {
3105 assert (varying < VARYING_SLOT_MAX);
3106 reg.type = output_reg[varying].type;
3107 current_annotation = output_reg_annotation[varying];
3108 /* Copy the register, saturating if necessary */
3109 return emit(MOV(reg, src_reg(output_reg[varying])));
3110 }
3111
3112 void
3113 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3114 {
3115 reg.type = BRW_REGISTER_TYPE_F;
3116
3117 switch (varying) {
3118 case VARYING_SLOT_PSIZ:
3119 {
3120 /* PSIZ is always in slot 0, and is coupled with other flags. */
3121 current_annotation = "indices, point width, clip flags";
3122 emit_psiz_and_flags(reg);
3123 break;
3124 }
3125 case BRW_VARYING_SLOT_NDC:
3126 current_annotation = "NDC";
3127 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3128 break;
3129 case VARYING_SLOT_POS:
3130 current_annotation = "gl_Position";
3131 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3132 break;
3133 case VARYING_SLOT_EDGE:
3134 /* This is present when doing unfilled polygons. We're supposed to copy
3135 * the edge flag from the user-provided vertex array
3136 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3137 * of that attribute (starts as 1.0f). This is then used in clipping to
3138 * determine which edges should be drawn as wireframe.
3139 */
3140 current_annotation = "edge flag";
3141 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3142 glsl_type::float_type, WRITEMASK_XYZW))));
3143 break;
3144 case BRW_VARYING_SLOT_PAD:
3145 /* No need to write to this slot */
3146 break;
3147 case VARYING_SLOT_COL0:
3148 case VARYING_SLOT_COL1:
3149 case VARYING_SLOT_BFC0:
3150 case VARYING_SLOT_BFC1: {
3151 /* These built-in varyings are only supported in compatibility mode,
3152 * and we only support GS in core profile. So, this must be a vertex
3153 * shader.
3154 */
3155 assert(stage == MESA_SHADER_VERTEX);
3156 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3157 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3158 inst->saturate = true;
3159 break;
3160 }
3161
3162 default:
3163 emit_generic_urb_slot(reg, varying);
3164 break;
3165 }
3166 }
3167
3168 static int
3169 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3170 {
3171 if (brw->gen >= 6) {
3172 /* URB data written (does not include the message header reg) must
3173 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3174 * section 5.4.3.2.2: URB_INTERLEAVED.
3175 *
3176 * URB entries are allocated on a multiple of 1024 bits, so an
3177 * extra 128 bits written here to make the end align to 256 is
3178 * no problem.
3179 */
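/* Note that mlen as passed in still includes the message header register,
 * so the URB data length is mlen - 1; requiring that to be even means mlen
 * itself must be odd, hence the parity check below.
 */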
3180 if ((mlen % 2) != 1)
3181 mlen++;
3182 }
3183
3184 return mlen;
3185 }
3186
3187
3188 /**
3189 * Generates the VUE payload plus the necessary URB write instructions to
3190 * output it.
3191 *
3192 * The VUE layout is documented in Volume 2a.
3193 */
3194 void
3195 vec4_visitor::emit_vertex()
3196 {
3197 /* MRF 0 is reserved for the debugger, so start with message header
3198 * in MRF 1.
3199 */
3200 int base_mrf = 1;
3201 int mrf = base_mrf;
3202 /* In the process of generating our URB write message contents, we
3203 * may need to unspill a register or load from an array. Those
3204 * reads would use MRFs 14-15.
3205 */
3206 int max_usable_mrf = 13;
3207
3208 /* The following assertion verifies that max_usable_mrf causes an
3209 * even-numbered amount of URB write data, which will meet gen6's
3210 * requirements for length alignment.
3211 */
3212 assert ((max_usable_mrf - base_mrf) % 2 == 0);
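/* As a concrete example: with base_mrf = 1 and max_usable_mrf = 13, a full
 * URB write uses MRFs 2..13 for data (12 registers, an even count) plus the
 * header in MRF 1, for an mlen of 13.
 */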
3213
3214 /* First mrf is the g0-based message header containing URB handles and
3215 * such.
3216 */
3217 emit_urb_write_header(mrf++);
3218
3219 if (brw->gen < 6) {
3220 emit_ndc_computation();
3221 }
3222
3223 /* Lower legacy ff and ClipVertex clipping to clip distances */
3224 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3225 current_annotation = "user clip distances";
3226
3227 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3228 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3229
3230 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3231 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3232 }
3233
3234 /* We may need to split this up into several URB writes, so do them in a
3235 * loop.
3236 */
3237 int slot = 0;
3238 bool complete = false;
3239 do {
3240 /* URB offset is in URB row increments, and each of our MRFs is half of
3241 * one of those, since we're doing interleaved writes.
3242 */
3243 int offset = slot / 2;
3244
3245 mrf = base_mrf + 1;
3246 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3247 emit_urb_slot(dst_reg(MRF, mrf++),
3248 prog_data->vue_map.slot_to_varying[slot]);
3249
3250 /* If this was max_usable_mrf, we can't fit anything more into this
3251 * URB WRITE.
3252 */
3253 if (mrf > max_usable_mrf) {
3254 slot++;
3255 break;
3256 }
3257 }
3258
3259 complete = slot >= prog_data->vue_map.num_slots;
3260 current_annotation = "URB write";
3261 vec4_instruction *inst = emit_urb_write_opcode(complete);
3262 inst->base_mrf = base_mrf;
3263 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3264 inst->offset += offset;
3265 } while(!complete);
3266 }
3267
3268
3269 src_reg
3270 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3271 src_reg *reladdr, int reg_offset)
3272 {
3273 /* Because we store the values to scratch interleaved like our
3274 * vertex data, we need to scale the vec4 index by 2.
3275 */
3276 int message_header_scale = 2;
3277
3278 /* Pre-gen6, the message header uses byte offsets instead of vec4
3279 * (16-byte) offset units.
3280 */
3281 if (brw->gen < 6)
3282 message_header_scale *= 16;
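/* So, as a sketch of the scaling: vec4 slot 3 becomes offset 6 in 16-byte
 * units on Gen6+, or 96 in byte units pre-Gen6.
 */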
3283
3284 if (reladdr) {
3285 src_reg index = src_reg(this, glsl_type::int_type);
3286
3287 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3288 src_reg(reg_offset)));
3289 emit_before(block, inst, MUL(dst_reg(index), index,
3290 src_reg(message_header_scale)));
3291
3292 return index;
3293 } else {
3294 return src_reg(reg_offset * message_header_scale);
3295 }
3296 }
3297
3298 src_reg
3299 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3300 src_reg *reladdr, int reg_offset)
3301 {
3302 if (reladdr) {
3303 src_reg index = src_reg(this, glsl_type::int_type);
3304
3305 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3306 src_reg(reg_offset)));
3307
3308 /* Pre-gen6, the message header uses byte offsets instead of vec4
3309 * (16-byte) offset units.
3310 */
3311 if (brw->gen < 6) {
3312 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3313 }
3314
3315 return index;
3316 } else if (brw->gen >= 8) {
3317 /* Store the offset in a GRF so we can send-from-GRF. */
3318 src_reg offset = src_reg(this, glsl_type::int_type);
3319 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3320 return offset;
3321 } else {
3322 int message_header_scale = brw->gen < 6 ? 16 : 1;
3323 return src_reg(reg_offset * message_header_scale);
3324 }
3325 }
3326
3327 /**
3328 * Emits an instruction before @inst to load the value named by @orig_src
3329 * from scratch space at @base_offset to @temp.
3330 *
3331 * @base_offset is measured in 32-byte units (the size of a register).
3332 */
3333 void
3334 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3335 dst_reg temp, src_reg orig_src,
3336 int base_offset)
3337 {
3338 int reg_offset = base_offset + orig_src.reg_offset;
3339 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3340 reg_offset);
3341
3342 emit_before(block, inst, SCRATCH_READ(temp, index));
3343 }
3344
3345 /**
3346 * Emits an instruction after @inst to store the value to be written
3347 * to @orig_dst to scratch space at @base_offset, from @temp.
3348 *
3349 * @base_offset is measured in 32-byte units (the size of a register).
3350 */
3351 void
3352 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3353 int base_offset)
3354 {
3355 int reg_offset = base_offset + inst->dst.reg_offset;
3356 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3357 reg_offset);
3358
3359 /* Create a temporary register to store *inst's result in.
3360 *
3361 * We have to be careful in MOVing from our temporary result register in
3362 * the scratch write. If we swizzle from channels of the temporary that
3363 * weren't initialized, it will confuse live interval analysis, which will
3364 * make spilling fail to make progress.
3365 */
3366 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3367 inst->dst.type),
3368 brw_swizzle_for_mask(inst->dst.writemask));
3369 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3370 inst->dst.writemask));
3371 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3372 write->predicate = inst->predicate;
3373 write->ir = inst->ir;
3374 write->annotation = inst->annotation;
3375 inst->insert_after(block, write);
3376
3377 inst->dst.file = temp.file;
3378 inst->dst.reg = temp.reg;
3379 inst->dst.reg_offset = temp.reg_offset;
3380 inst->dst.reladdr = NULL;
3381 }
3382
3383 /**
3384 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3385 * adds the scratch read(s) before \p inst. The function also checks for
3386 * recursive reladdr scratch accesses, issuing the corresponding scratch
3387 * loads and rewriting reladdr references accordingly.
3388 *
3389 * \return \p src if it did not require a scratch load, otherwise, the
3390 * register holding the result of the scratch load that the caller should
3391 * use to rewrite src.
3392 */
3393 src_reg
3394 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3395 vec4_instruction *inst, src_reg src)
3396 {
3397 /* Resolve recursive reladdr scratch access by calling ourselves
3398 * with src.reladdr
3399 */
3400 if (src.reladdr)
3401 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3402 *src.reladdr);
3403
3404 /* Now handle scratch access on src */
3405 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3406 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3407 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3408 src.reg = temp.reg;
3409 src.reg_offset = temp.reg_offset;
3410 src.reladdr = NULL;
3411 }
3412
3413 return src;
3414 }
3415
3416 /**
3417 * We can't generally support array access in GRF space, because a
3418 * single instruction's destination can only span 2 contiguous
3419 * registers. So, we send all GRF arrays that get variable index
3420 * access to scratch space.
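 *
 * For example (hypothetical GLSL, for illustration): a local "vec4 arr[8]"
 * indexed by a value that isn't known at compile time is moved to scratch
 * and accessed through scratch read/write messages.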
3421 */
3422 void
3423 vec4_visitor::move_grf_array_access_to_scratch()
3424 {
3425 int scratch_loc[this->alloc.count];
3426 memset(scratch_loc, -1, sizeof(scratch_loc));
3427
3428 /* First, calculate the set of virtual GRFs that need to be punted
3429 * to scratch due to having any array access on them, and where in
3430 * scratch.
3431 */
3432 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3433 if (inst->dst.file == GRF && inst->dst.reladdr) {
3434 if (scratch_loc[inst->dst.reg] == -1) {
3435 scratch_loc[inst->dst.reg] = c->last_scratch;
3436 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3437 }
3438
3439 for (src_reg *iter = inst->dst.reladdr;
3440 iter->reladdr;
3441 iter = iter->reladdr) {
3442 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3443 scratch_loc[iter->reg] = c->last_scratch;
3444 c->last_scratch += this->alloc.sizes[iter->reg];
3445 }
3446 }
3447 }
3448
3449 for (int i = 0 ; i < 3; i++) {
3450 for (src_reg *iter = &inst->src[i];
3451 iter->reladdr;
3452 iter = iter->reladdr) {
3453 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3454 scratch_loc[iter->reg] = c->last_scratch;
3455 c->last_scratch += this->alloc.sizes[iter->reg];
3456 }
3457 }
3458 }
3459 }
3460
3461 /* Now, for anything that will be accessed through scratch, rewrite
3462 * it to load/store. Note that this is a _safe list walk, because
3463 * we may generate a new scratch_write instruction after the one
3464 * we're processing.
3465 */
3466 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3467 /* Set up the annotation tracking for new generated instructions. */
3468 base_ir = inst->ir;
3469 current_annotation = inst->annotation;
3470
3471 /* First handle scratch access on the dst. Notice we have to handle
3472 * the case where the dst's reladdr also points to scratch space.
3473 */
3474 if (inst->dst.reladdr)
3475 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3476 *inst->dst.reladdr);
3477
3478 /* Now that we have handled any (possibly recursive) reladdr scratch
3479 * accesses for dst we can safely do the scratch write for dst itself
3480 */
3481 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3482 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3483
3484 /* Now handle scratch access on any src. In this case, since inst->src[i]
3485 * already is a src_reg, we can just call emit_resolve_reladdr with
3486 * inst->src[i] and it will take care of handling scratch loads for
3487 * both src and src.reladdr (recursively).
3488 */
3489 for (int i = 0 ; i < 3; i++) {
3490 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3491 inst->src[i]);
3492 }
3493 }
3494 }
3495
3496 /**
3497 * Emits an instruction before @inst to load the value named by @orig_src
3498 * from the pull constant buffer (surface) at @base_offset to @temp.
3499 */
3500 void
3501 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3502 dst_reg temp, src_reg orig_src,
3503 int base_offset)
3504 {
3505 int reg_offset = base_offset + orig_src.reg_offset;
3506 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3507 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3508 reg_offset);
3509
3510 emit_pull_constant_load_reg(temp,
3511 index,
3512 offset,
3513 block, inst);
3514 }
3515
3516 /**
3517 * Implements array access of uniforms by inserting a
3518 * PULL_CONSTANT_LOAD instruction.
3519 *
3520 * Unlike temporary GRF array access (where we don't support it due to
3521 * the difficulty of doing relative addressing on instruction
3522 * destinations), we could potentially do array access of uniforms
3523 * that were loaded in GRF space as push constants. In real-world
3524 * usage we've seen, though, the arrays being used are always larger
3525 * than we could load as push constants, so just always move all
3526 * uniform array access out to a pull constant buffer.
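 * (For example, with hypothetical GLSL for illustration: a "uniform vec4
 * m[64]" indexed with a non-constant expression is accessed through
 * pull-constant loads rather than push constants.)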
3527 */
3528 void
3529 vec4_visitor::move_uniform_array_access_to_pull_constants()
3530 {
3531 int pull_constant_loc[this->uniforms];
3532 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3533 bool nested_reladdr;
3534
3535 /* Walk through and find array access of uniforms. Put a copy of that
3536 * uniform in the pull constant buffer.
3537 *
3538 * Note that we don't move constant-indexed accesses to arrays. No
3539 * testing has been done of the performance impact of this choice.
3540 */
3541 do {
3542 nested_reladdr = false;
3543
3544 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3545 for (int i = 0 ; i < 3; i++) {
3546 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3547 continue;
3548
3549 int uniform = inst->src[i].reg;
3550
3551 if (inst->src[i].reladdr->reladdr)
3552 nested_reladdr = true; /* will need another pass */
3553
3554 /* If this array isn't already present in the pull constant buffer,
3555 * add it.
3556 */
3557 if (pull_constant_loc[uniform] == -1) {
3558 const gl_constant_value **values =
3559 &stage_prog_data->param[uniform * 4];
3560
3561 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3562
3563 assert(uniform < uniform_array_size);
3564 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3565 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3566 = values[j];
3567 }
3568 }
3569
3570 /* Set up the annotation tracking for new generated instructions. */
3571 base_ir = inst->ir;
3572 current_annotation = inst->annotation;
3573
3574 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3575
3576 emit_pull_constant_load(block, inst, temp, inst->src[i],
3577 pull_constant_loc[uniform]);
3578
3579 inst->src[i].file = temp.file;
3580 inst->src[i].reg = temp.reg;
3581 inst->src[i].reg_offset = temp.reg_offset;
3582 inst->src[i].reladdr = NULL;
3583 }
3584 }
3585 } while (nested_reladdr);
3586
3587 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3588 * no need to track them as larger-than-vec4 objects. This will be
3589 * relied on in cutting out unused uniform vectors from push
3590 * constants.
3591 */
3592 split_uniform_registers();
3593 }
3594
3595 void
3596 vec4_visitor::resolve_ud_negate(src_reg *reg)
3597 {
3598 if (reg->type != BRW_REGISTER_TYPE_UD ||
3599 !reg->negate)
3600 return;
3601
3602 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3603 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3604 *reg = temp;
3605 }
3606
3607 /**
3608 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3609 *
3610 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3611 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3612 */
3613 void
3614 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3615 {
3616 assert(brw->gen <= 5);
3617
3618 if (!rvalue->type->is_boolean())
3619 return;
3620
3621 src_reg and_result = src_reg(this, rvalue->type);
3622 src_reg neg_result = src_reg(this, rvalue->type);
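/* ANDing with 1 keeps only the defined LSB (0 or 1); source-negating that
 * integer then yields 0 or ~0, which is the boolean encoding described
 * above (this describes the two emits below).
 */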
3623 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3624 emit(MOV(dst_reg(neg_result), negate(and_result)));
3625 *reg = neg_result;
3626 }
3627
3628 vec4_visitor::vec4_visitor(struct brw_context *brw,
3629 struct brw_vec4_compile *c,
3630 struct gl_program *prog,
3631 const struct brw_vue_prog_key *key,
3632 struct brw_vue_prog_data *prog_data,
3633 struct gl_shader_program *shader_prog,
3634 gl_shader_stage stage,
3635 void *mem_ctx,
3636 bool no_spills,
3637 shader_time_shader_type st_base,
3638 shader_time_shader_type st_written,
3639 shader_time_shader_type st_reset)
3640 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3641 c(c),
3642 key(key),
3643 prog_data(prog_data),
3644 sanity_param_count(0),
3645 fail_msg(NULL),
3646 first_non_payload_grf(0),
3647 need_all_constants_in_pull_buffer(false),
3648 no_spills(no_spills),
3649 st_base(st_base),
3650 st_written(st_written),
3651 st_reset(st_reset)
3652 {
3653 this->mem_ctx = mem_ctx;
3654 this->failed = false;
3655
3656 this->base_ir = NULL;
3657 this->current_annotation = NULL;
3658 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3659
3660 this->variable_ht = hash_table_ctor(0,
3661 hash_table_pointer_hash,
3662 hash_table_pointer_compare);
3663
3664 this->virtual_grf_start = NULL;
3665 this->virtual_grf_end = NULL;
3666 this->live_intervals = NULL;
3667
3668 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3669
3670 this->uniforms = 0;
3671
3672 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3673 * at least one. See setup_uniforms() in brw_vec4.cpp.
3674 */
3675 this->uniform_array_size = 1;
3676 if (prog_data) {
3677 this->uniform_array_size =
3678 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3679 }
3680
3681 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3682 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3683 }
3684
3685 vec4_visitor::~vec4_visitor()
3686 {
3687 hash_table_dtor(this->variable_ht);
3688 }
3689
3690
3691 void
3692 vec4_visitor::fail(const char *format, ...)
3693 {
3694 va_list va;
3695 char *msg;
3696
3697 if (failed)
3698 return;
3699
3700 failed = true;
3701
3702 va_start(va, format);
3703 msg = ralloc_vasprintf(mem_ctx, format, va);
3704 va_end(va);
3705 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3706
3707 this->fail_msg = msg;
3708
3709 if (debug_enabled) {
3710 fprintf(stderr, "%s", msg);
3711 }
3712 }
3713
3714 } /* namespace brw */