i965/vec4: Initialize vec4_instruction::predicate and ::predicate_inverse.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
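/* A clarifying note (not in the original source): the two predication fields
 * below need explicit defaults because helpers such as emit_minmax() and the
 * predicated SEL paths later in this file set only ->predicate, and rely on
 * ->predicate_inverse already being false rather than an indeterminate value.
 */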
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
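/* As an illustrative sketch (not taken from an actual disassembly), a MAD
 * whose first source is a vec4 uniform therefore ends up as something like
 *
 *    mov(8) vgrfN:f  u3:f     { VEC4_OPCODE_UNPACK_UNIFORM }
 *    mad(8) dst:f    vgrfN:f  ...
 *
 * so the three-source instruction only ever reads GRF operands.
 */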
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except for immediates, which gen7
329 * still can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
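/* Emit a math instruction while working around per-generation restrictions:
 * both operands are run through fix_math_operand(), Gen6 MATH is align1-only
 * and cannot honor a partial writemask (so compute into a temporary and MOV
 * the result), and pre-Gen6 math is a send with an MRF payload, hence the
 * base_mrf/mlen setup below.
 */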
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
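/* A worked example for reference (added note, assuming standard IEEE
 * half-float encodings): packHalf2x16(vec2(1.0, -2.0)) stores 1.0 -> 0x3c00
 * in the low word and -2.0 -> 0xc000 in the high word, giving 0xc0003c00.
 * The F32TO16 / SHL / OR sequence below builds exactly that layout in each
 * write-channel of dst.
 */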
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
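/* The inverse of emit_pack_half_2x16() above.  For reference,
 * unpackHalf2x16(0xc0003c00) yields vec2(1.0, -2.0): the low word feeds .x
 * and the high word feeds .y, which is why the AND below selects bits 15:0
 * and the SHR selects bits 31:16 before the F16TO32 conversion.
 */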
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
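/* Note: 0x00, 0x60, 0x70, 0x78 are the 8-bit vector-float encodings of
 * <0.0, 8.0, 16.0, 24.0>; the type-converting MOV above turns them into the
 * integer shift counts <0, 8, 16, 24>.
 */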
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
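/* A worked example of the sequence below (added for reference, assuming the
 * usual packUnorm4x8 byte order with .x in the least significant byte):
 * packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)) saturates to <0, 0.5, 1, 1>, scales
 * to <0, 127.5, 255, 255>, rounds-to-even to <0, 128, 255, 255>, and packs to
 * 0xffff8000.
 */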
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
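/* Size of a type in vec4 register slots.  Every scalar or vector occupies a
 * full slot, so e.g. float -> 1, vec3 -> 1, mat3 -> 3 (one slot per column),
 * float[4] -> 4; samplers and atomic counters occupy no slots.
 */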
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been setup by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
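/* Evaluate a boolean rvalue and leave its value in the flag register so the
 * caller can predicate a following instruction (an IF or SEL).  *predicate is
 * set to the predication mode the caller should use: BRW_PREDICATE_NORMAL for
 * most conditions, or an ALIGN16_ANY4H/ALL4H mode for the vector any() /
 * all-equal / any-nequal conditions handled below.
 */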
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
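/* Try to fold an add of a multiply into a single MAD.  The hardware MAD
 * computes src1 * src2 + src0, so the non-multiply operand is passed first
 * below and the two factors follow (ir_triop_fma later in this file reverses
 * the GLSL argument order for the same reason).
 */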
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 if (!mul || mul->operation != ir_binop_mul) {
1165 nonmul = ir->operands[0];
1166 mul = ir->operands[1]->as_expression();
1167
1168 if (!mul || mul->operation != ir_binop_mul)
1169 return false;
1170 }
1171
1172 nonmul->accept(this);
1173 src_reg src0 = fix_3src_operand(this->result);
1174
1175 mul->operands[0]->accept(this);
1176 src_reg src1 = fix_3src_operand(this->result);
1177
1178 mul->operands[1]->accept(this);
1179 src_reg src2 = fix_3src_operand(this->result);
1180
1181 this->result = src_reg(this, ir->type);
1182 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1183
1184 return true;
1185 }
1186
1187 bool
1188 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1189 {
1190 /* This optimization relies on CMP setting the destination to 0 when
1191 * false. Early hardware only sets the least significant bit, and
1192 * leaves the other bits undefined. So we can't use it.
1193 */
1194 if (brw->gen < 6)
1195 return false;
1196
1197 ir_expression *const cmp = ir->operands[0]->as_expression();
1198
1199 if (cmp == NULL)
1200 return false;
1201
1202 switch (cmp->operation) {
1203 case ir_binop_less:
1204 case ir_binop_greater:
1205 case ir_binop_lequal:
1206 case ir_binop_gequal:
1207 case ir_binop_equal:
1208 case ir_binop_nequal:
1209 break;
1210
1211 default:
1212 return false;
1213 }
1214
1215 cmp->operands[0]->accept(this);
1216 const src_reg cmp_src0 = this->result;
1217
1218 cmp->operands[1]->accept(this);
1219 const src_reg cmp_src1 = this->result;
1220
1221 this->result = src_reg(this, ir->type);
1222
1223 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1224 brw_conditional_for_comparison(cmp->operation)));
1225
1226 /* If the comparison is false, this->result will just happen to be zero.
1227 */
1228 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1229 this->result, src_reg(1.0f));
1230 inst->predicate = BRW_PREDICATE_NORMAL;
1231 inst->predicate_inverse = true;
1232
1233 return true;
1234 }
1235
1236 void
1237 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1238 src_reg src0, src_reg src1)
1239 {
1240 vec4_instruction *inst;
1241
1242 if (brw->gen >= 6) {
1243 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1244 inst->conditional_mod = conditionalmod;
1245 } else {
1246 emit(CMP(dst, src0, src1, conditionalmod));
1247
1248 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 }
1251 }
1252
1253 void
1254 vec4_visitor::emit_lrp(const dst_reg &dst,
1255 const src_reg &x, const src_reg &y, const src_reg &a)
1256 {
1257 if (brw->gen >= 6) {
1258 /* Note that the instruction's argument order is reversed from GLSL
1259 * and the IR.
1260 */
1261 emit(LRP(dst,
1262 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1263 } else {
1264 /* Earlier generations don't support three source operations, so we
1265 * need to emit x*(1-a) + y*a.
1266 */
1267 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1268 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1269 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1270 y_times_a.writemask = dst.writemask;
1271 one_minus_a.writemask = dst.writemask;
1272 x_times_one_minus_a.writemask = dst.writemask;
1273
1274 emit(MUL(y_times_a, y, a));
1275 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1276 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1277 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1278 }
1279 }
1280
1281 void
1282 vec4_visitor::visit(ir_expression *ir)
1283 {
1284 unsigned int operand;
1285 src_reg op[Elements(ir->operands)];
1286 vec4_instruction *inst;
1287
1288 if (ir->operation == ir_binop_add) {
1289 if (try_emit_mad(ir))
1290 return;
1291 }
1292
1293 if (ir->operation == ir_unop_b2f) {
1294 if (try_emit_b2f_of_compare(ir))
1295 return;
1296 }
1297
1298 /* Storage for our result. Ideally for an assignment we'd be using
1299 * the actual storage for the result here, instead.
1300 */
1301 dst_reg result_dst(this, ir->type);
1302 src_reg result_src(result_dst);
1303
1304 if (ir->operation == ir_triop_csel) {
1305 ir->operands[1]->accept(this);
1306 op[1] = this->result;
1307 ir->operands[2]->accept(this);
1308 op[2] = this->result;
1309
1310 enum brw_predicate predicate;
1311 emit_bool_to_cond_code(ir->operands[0], &predicate);
1312 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1313 inst->predicate = predicate;
1314 this->result = result_src;
1315 return;
1316 }
1317
1318 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1319 this->result.file = BAD_FILE;
1320 ir->operands[operand]->accept(this);
1321 if (this->result.file == BAD_FILE) {
1322 fprintf(stderr, "Failed to get tree for expression operand:\n");
1323 ir->operands[operand]->fprint(stderr);
1324 exit(1);
1325 }
1326 op[operand] = this->result;
1327
1328 /* Matrix expression operands should have been broken down to vector
1329 * operations already.
1330 */
1331 assert(!ir->operands[operand]->type->is_matrix());
1332 }
1333
1334 /* If nothing special happens, this is the result. */
1335 this->result = result_src;
1336
1337 switch (ir->operation) {
1338 case ir_unop_logic_not:
1339 emit(NOT(result_dst, op[0]));
1340 break;
1341 case ir_unop_neg:
1342 op[0].negate = !op[0].negate;
1343 emit(MOV(result_dst, op[0]));
1344 break;
1345 case ir_unop_abs:
1346 op[0].abs = true;
1347 op[0].negate = false;
1348 emit(MOV(result_dst, op[0]));
1349 break;
1350
1351 case ir_unop_sign:
1352 if (ir->type->is_float()) {
1353 /* AND(val, 0x80000000) gives the sign bit.
1354 *
1355 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1356 * zero.
1357 */
1358 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1359
1360 op[0].type = BRW_REGISTER_TYPE_UD;
1361 result_dst.type = BRW_REGISTER_TYPE_UD;
1362 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1363
1364 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366
1367 this->result.type = BRW_REGISTER_TYPE_F;
1368 } else {
1369 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1370 * -> non-negative val generates 0x00000000.
1371 * Predicated OR sets 1 if val is positive.
1372 */
1373 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1374
1375 emit(ASR(result_dst, op[0], src_reg(31)));
1376
1377 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379 }
1380 break;
1381
1382 case ir_unop_rcp:
1383 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1384 break;
1385
1386 case ir_unop_exp2:
1387 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1388 break;
1389 case ir_unop_log2:
1390 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1391 break;
1392 case ir_unop_exp:
1393 case ir_unop_log:
1394 unreachable("not reached: should be handled by ir_explog_to_explog2");
1395 case ir_unop_sin:
1396 case ir_unop_sin_reduced:
1397 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1398 break;
1399 case ir_unop_cos:
1400 case ir_unop_cos_reduced:
1401 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1402 break;
1403
1404 case ir_unop_dFdx:
1405 case ir_unop_dFdx_coarse:
1406 case ir_unop_dFdx_fine:
1407 case ir_unop_dFdy:
1408 case ir_unop_dFdy_coarse:
1409 case ir_unop_dFdy_fine:
1410 unreachable("derivatives not valid in vertex shader");
1411
1412 case ir_unop_bitfield_reverse:
1413 emit(BFREV(result_dst, op[0]));
1414 break;
1415 case ir_unop_bit_count:
1416 emit(CBIT(result_dst, op[0]));
1417 break;
1418 case ir_unop_find_msb: {
1419 src_reg temp = src_reg(this, glsl_type::uint_type);
1420
1421 inst = emit(FBH(dst_reg(temp), op[0]));
1422 inst->dst.writemask = WRITEMASK_XYZW;
1423
1424 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1425 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1426 * subtract the result from 31 to convert the MSB count into an LSB count.
1427 */
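/* For example (a small sanity check, not from the original comment):
 * findMSB(0x00000800) is 11; FBH counting from the MSB side returns 20, and
 * 31 - 20 = 11.
 */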
1428
1429 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1430 temp.swizzle = BRW_SWIZZLE_NOOP;
1431 emit(MOV(result_dst, temp));
1432
1433 src_reg src_tmp = src_reg(result_dst);
1434 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1435
1436 src_tmp.negate = true;
1437 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1438 inst->predicate = BRW_PREDICATE_NORMAL;
1439 break;
1440 }
1441 case ir_unop_find_lsb:
1442 emit(FBL(result_dst, op[0]));
1443 break;
1444 case ir_unop_saturate:
1445 inst = emit(MOV(result_dst, op[0]));
1446 inst->saturate = true;
1447 break;
1448
1449 case ir_unop_noise:
1450 unreachable("not reached: should be handled by lower_noise");
1451
1452 case ir_binop_add:
1453 emit(ADD(result_dst, op[0], op[1]));
1454 break;
1455 case ir_binop_sub:
1456 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1457
1458 case ir_binop_mul:
1459 if (brw->gen < 8 && ir->type->is_integer()) {
1460 /* For integer multiplication, the MUL uses the low 16 bits of one of
1461 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1462 * adds in the contribution of the upper 16 bits of that
1463 * operand. If we can determine that one of the args is in the low
1464 * 16 bits, though, we can just emit a single MUL.
1465 */
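/* The full 32x32 path below therefore ends up as roughly
 *
 *    mul   acc0  src0  src1     (partial product into the accumulator)
 *    mach  null  src0  src1     (folds in the high-16 contribution)
 *    mov   dst   acc0           (low 32 bits of the product)
 *
 * (illustrative sketch of the emitted sequence, not actual disassembly).
 */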
1466 if (ir->operands[0]->is_uint16_constant()) {
1467 if (brw->gen < 7)
1468 emit(MUL(result_dst, op[0], op[1]));
1469 else
1470 emit(MUL(result_dst, op[1], op[0]));
1471 } else if (ir->operands[1]->is_uint16_constant()) {
1472 if (brw->gen < 7)
1473 emit(MUL(result_dst, op[1], op[0]));
1474 else
1475 emit(MUL(result_dst, op[0], op[1]));
1476 } else {
1477 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1478
1479 emit(MUL(acc, op[0], op[1]));
1480 emit(MACH(dst_null_d(), op[0], op[1]));
1481 emit(MOV(result_dst, src_reg(acc)));
1482 }
1483 } else {
1484 emit(MUL(result_dst, op[0], op[1]));
1485 }
1486 break;
1487 case ir_binop_imul_high: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1489
1490 emit(MUL(acc, op[0], op[1]));
1491 emit(MACH(result_dst, op[0], op[1]));
1492 break;
1493 }
1494 case ir_binop_div:
1495 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1496 assert(ir->type->is_integer());
1497 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1498 break;
1499 case ir_binop_carry: {
1500 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1501
1502 emit(ADDC(dst_null_ud(), op[0], op[1]));
1503 emit(MOV(result_dst, src_reg(acc)));
1504 break;
1505 }
1506 case ir_binop_borrow: {
1507 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1508
1509 emit(SUBB(dst_null_ud(), op[0], op[1]));
1510 emit(MOV(result_dst, src_reg(acc)));
1511 break;
1512 }
1513 case ir_binop_mod:
1514 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1515 assert(ir->type->is_integer());
1516 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1517 break;
1518
1519 case ir_binop_less:
1520 case ir_binop_greater:
1521 case ir_binop_lequal:
1522 case ir_binop_gequal:
1523 case ir_binop_equal:
1524 case ir_binop_nequal: {
1525 if (brw->gen <= 5) {
1526 resolve_bool_comparison(ir->operands[0], &op[0]);
1527 resolve_bool_comparison(ir->operands[1], &op[1]);
1528 }
1529 emit(CMP(result_dst, op[0], op[1],
1530 brw_conditional_for_comparison(ir->operation)));
1531 break;
1532 }
1533
1534 case ir_binop_all_equal:
1535 /* "==" operator producing a scalar boolean. */
1536 if (ir->operands[0]->type->is_vector() ||
1537 ir->operands[1]->type->is_vector()) {
1538 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1539 emit(MOV(result_dst, src_reg(0)));
1540 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1541 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1542 } else {
1543 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1544 }
1545 break;
1546 case ir_binop_any_nequal:
1547 /* "!=" operator producing a scalar boolean. */
1548 if (ir->operands[0]->type->is_vector() ||
1549 ir->operands[1]->type->is_vector()) {
1550 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1551
1552 emit(MOV(result_dst, src_reg(0)));
1553 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1554 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1555 } else {
1556 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1557 }
1558 break;
1559
1560 case ir_unop_any:
1561 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1562 emit(MOV(result_dst, src_reg(0)));
1563
1564 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1565 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1566 break;
1567
1568 case ir_binop_logic_xor:
1569 emit(XOR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_logic_or:
1573 emit(OR(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_logic_and:
1577 emit(AND(result_dst, op[0], op[1]));
1578 break;
1579
1580 case ir_binop_dot:
1581 assert(ir->operands[0]->type->is_vector());
1582 assert(ir->operands[0]->type == ir->operands[1]->type);
1583 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1584 break;
1585
1586 case ir_unop_sqrt:
1587 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1588 break;
1589 case ir_unop_rsq:
1590 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1591 break;
1592
1593 case ir_unop_bitcast_i2f:
1594 case ir_unop_bitcast_u2f:
1595 this->result = op[0];
1596 this->result.type = BRW_REGISTER_TYPE_F;
1597 break;
1598
1599 case ir_unop_bitcast_f2i:
1600 this->result = op[0];
1601 this->result.type = BRW_REGISTER_TYPE_D;
1602 break;
1603
1604 case ir_unop_bitcast_f2u:
1605 this->result = op[0];
1606 this->result.type = BRW_REGISTER_TYPE_UD;
1607 break;
1608
1609 case ir_unop_i2f:
1610 case ir_unop_i2u:
1611 case ir_unop_u2i:
1612 case ir_unop_u2f:
1613 case ir_unop_f2i:
1614 case ir_unop_f2u:
1615 emit(MOV(result_dst, op[0]));
1616 break;
1617 case ir_unop_b2i:
1618 emit(AND(result_dst, op[0], src_reg(1)));
1619 break;
1620 case ir_unop_b2f:
1621 if (brw->gen <= 5) {
1622 resolve_bool_comparison(ir->operands[0], &op[0]);
1623 }
1624 op[0].type = BRW_REGISTER_TYPE_D;
1625 result_dst.type = BRW_REGISTER_TYPE_D;
1626 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1627 result_dst.type = BRW_REGISTER_TYPE_F;
1628 break;
1629 case ir_unop_f2b:
1630 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1631 break;
1632 case ir_unop_i2b:
1633 emit(AND(result_dst, op[0], src_reg(1)));
1634 break;
1635
1636 case ir_unop_trunc:
1637 emit(RNDZ(result_dst, op[0]));
1638 break;
1639 case ir_unop_ceil: {
1640 src_reg tmp = src_reg(this, ir->type);
1641 op[0].negate = !op[0].negate;
1642 emit(RNDD(dst_reg(tmp), op[0]));
1643 tmp.negate = true;
1644 emit(MOV(result_dst, tmp));
1645 }
1646 break;
1647 case ir_unop_floor:
1648 inst = emit(RNDD(result_dst, op[0]));
1649 break;
1650 case ir_unop_fract:
1651 inst = emit(FRC(result_dst, op[0]));
1652 break;
1653 case ir_unop_round_even:
1654 emit(RNDE(result_dst, op[0]));
1655 break;
1656
1657 case ir_binop_min:
1658 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1659 break;
1660 case ir_binop_max:
1661 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1662 break;
1663
1664 case ir_binop_pow:
1665 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1666 break;
1667
1668 case ir_unop_bit_not:
1669 inst = emit(NOT(result_dst, op[0]));
1670 break;
1671 case ir_binop_bit_and:
1672 inst = emit(AND(result_dst, op[0], op[1]));
1673 break;
1674 case ir_binop_bit_xor:
1675 inst = emit(XOR(result_dst, op[0], op[1]));
1676 break;
1677 case ir_binop_bit_or:
1678 inst = emit(OR(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_lshift:
1682 inst = emit(SHL(result_dst, op[0], op[1]));
1683 break;
1684
1685 case ir_binop_rshift:
1686 if (ir->type->base_type == GLSL_TYPE_INT)
1687 inst = emit(ASR(result_dst, op[0], op[1]));
1688 else
1689 inst = emit(SHR(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_bfm:
1693 emit(BFI1(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_ubo_load: {
1697 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1698 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1699 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1700 src_reg offset;
1701
1702 /* Now, load the vector from that offset. */
1703 assert(ir->type->is_vector() || ir->type->is_scalar());
1704
1705 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1706 packed_consts.type = result.type;
1707 src_reg surf_index;
1708
1709 if (const_uniform_block) {
1710 /* The block index is a constant, so just emit the binding table entry
1711 * as an immediate.
1712 */
1713 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1714 const_uniform_block->value.u[0]);
1715 } else {
1716 /* The block index is not a constant. Evaluate the index expression
1717 * per-channel and add the base UBO index; the generator will select
1718 * a value from any live channel.
1719 */
1720 surf_index = src_reg(this, glsl_type::uint_type);
1721 emit(ADD(dst_reg(surf_index), op[0],
1722 src_reg(prog_data->base.binding_table.ubo_start)));
1723
1724 /* Assume this may touch any UBO. It would be nice to provide
1725 * a tighter bound, but the array information is already lowered away.
1726 */
1727 brw_mark_surface_used(&prog_data->base,
1728 prog_data->base.binding_table.ubo_start +
1729 shader_prog->NumUniformBlocks - 1);
1730 }
1731
1732 if (const_offset_ir) {
1733 if (brw->gen >= 8) {
1734 /* Store the offset in a GRF so we can send-from-GRF. */
1735 offset = src_reg(this, glsl_type::int_type);
1736 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1737 } else {
1738 /* Immediates are fine on older generations since they'll be moved
1739 * to a (potentially fake) MRF at the generator level.
1740 */
1741 offset = src_reg(const_offset / 16);
1742 }
1743 } else {
1744 offset = src_reg(this, glsl_type::uint_type);
1745 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1746 }
1747
1748 if (brw->gen >= 7) {
1749 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1750 grf_offset.type = offset.type;
1751
1752 emit(MOV(grf_offset, offset));
1753
1754 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1755 dst_reg(packed_consts),
1756 surf_index,
1757 src_reg(grf_offset)));
1758 } else {
1759 vec4_instruction *pull =
1760 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1761 dst_reg(packed_consts),
1762 surf_index,
1763 offset));
1764 pull->base_mrf = 14;
1765 pull->mlen = 1;
1766 }
1767
1768 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1769 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1770 const_offset % 16 / 4,
1771 const_offset % 16 / 4,
1772 const_offset % 16 / 4);
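/* E.g. a float at a constant byte offset of 20 reads the second 16-byte
 * slot (offset 20 / 16 = 1) and then replicates component 20 % 16 / 4 = 1,
 * i.e. the Y channel of the fetched vec4.
 */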
1773
1774 /* UBO bools are any nonzero int. We need to convert them to use the
1775 * value of true stored in ctx->Const.UniformBooleanTrue.
1776 */
1777 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1778 emit(CMP(result_dst, packed_consts, src_reg(0u),
1779 BRW_CONDITIONAL_NZ));
1780 } else {
1781 emit(MOV(result_dst, packed_consts));
1782 }
1783 break;
1784 }
1785
1786 case ir_binop_vector_extract:
1787 unreachable("should have been lowered by vec_index_to_cond_assign");
1788
1789 case ir_triop_fma:
1790 op[0] = fix_3src_operand(op[0]);
1791 op[1] = fix_3src_operand(op[1]);
1792 op[2] = fix_3src_operand(op[2]);
1793 /* Note that the instruction's argument order is reversed from GLSL
1794 * and the IR.
1795 */
1796 emit(MAD(result_dst, op[2], op[1], op[0]));
1797 break;
1798
1799 case ir_triop_lrp:
1800 emit_lrp(result_dst, op[0], op[1], op[2]);
1801 break;
1802
1803 case ir_triop_csel:
1804 unreachable("already handled above");
1805 break;
1806
1807 case ir_triop_bfi:
1808 op[0] = fix_3src_operand(op[0]);
1809 op[1] = fix_3src_operand(op[1]);
1810 op[2] = fix_3src_operand(op[2]);
1811 emit(BFI2(result_dst, op[0], op[1], op[2]));
1812 break;
1813
1814 case ir_triop_bitfield_extract:
1815 op[0] = fix_3src_operand(op[0]);
1816 op[1] = fix_3src_operand(op[1]);
1817 op[2] = fix_3src_operand(op[2]);
1818 /* Note that the instruction's argument order is reversed from GLSL
1819 * and the IR.
1820 */
1821 emit(BFE(result_dst, op[2], op[1], op[0]));
1822 break;
1823
1824 case ir_triop_vector_insert:
1825 unreachable("should have been lowered by lower_vector_insert");
1826
1827 case ir_quadop_bitfield_insert:
1828 unreachable("not reached: should be handled by "
1829 "bitfield_insert_to_bfm_bfi\n");
1830
1831 case ir_quadop_vector:
1832 unreachable("not reached: should be handled by lower_quadop_vector");
1833
1834 case ir_unop_pack_half_2x16:
1835 emit_pack_half_2x16(result_dst, op[0]);
1836 break;
1837 case ir_unop_unpack_half_2x16:
1838 emit_unpack_half_2x16(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_unorm_4x8:
1841 emit_unpack_unorm_4x8(result_dst, op[0]);
1842 break;
1843 case ir_unop_unpack_snorm_4x8:
1844 emit_unpack_snorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_pack_unorm_4x8:
1847 emit_pack_unorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_snorm_4x8:
1850 emit_pack_snorm_4x8(result_dst, op[0]);
1851 break;
1852 case ir_unop_pack_snorm_2x16:
1853 case ir_unop_pack_unorm_2x16:
1854 case ir_unop_unpack_snorm_2x16:
1855 case ir_unop_unpack_unorm_2x16:
1856 unreachable("not reached: should be handled by lower_packing_builtins");
1857 case ir_unop_unpack_half_2x16_split_x:
1858 case ir_unop_unpack_half_2x16_split_y:
1859 case ir_binop_pack_half_2x16_split:
1860 case ir_unop_interpolate_at_centroid:
1861 case ir_binop_interpolate_at_sample:
1862 case ir_binop_interpolate_at_offset:
1863 unreachable("not reached: should not occur in vertex shader");
1864 case ir_binop_ldexp:
1865 unreachable("not reached: should be handled by ldexp_to_arith()");
1866 }
1867 }
1868
1869
1870 void
1871 vec4_visitor::visit(ir_swizzle *ir)
1872 {
1873 src_reg src;
1874 int i = 0;
1875 int swizzle[4];
1876
1877 /* Note that this is only swizzles in expressions, not those on the left
1878 * hand side of an assignment, which do write masking. See ir_assignment
1879 * for that.
1880 */
1881
1882 ir->val->accept(this);
1883 src = this->result;
1884 assert(src.file != BAD_FILE);
1885
1886 for (i = 0; i < ir->type->vector_elements; i++) {
1887 switch (i) {
1888 case 0:
1889 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1890 break;
1891 case 1:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1893 break;
1894 case 2:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1896 break;
1897 case 3:
1898 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1899 break;
1900 }
1901 }
1902 for (; i < 4; i++) {
1903 /* Replicate the last channel out. */
1904 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1905 }
1906
1907 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1908
1909 this->result = src;
1910 }
1911
1912 void
1913 vec4_visitor::visit(ir_dereference_variable *ir)
1914 {
1915 const struct glsl_type *type = ir->type;
1916 dst_reg *reg = variable_storage(ir->var);
1917
1918 if (!reg) {
1919 fail("Failed to find variable storage for %s\n", ir->var->name);
1920 this->result = src_reg(brw_null_reg());
1921 return;
1922 }
1923
1924 this->result = src_reg(*reg);
1925
1926 /* System values get their swizzle from the dst_reg writemask */
1927 if (ir->var->data.mode == ir_var_system_value)
1928 return;
1929
1930 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1931 this->result.swizzle = swizzle_for_size(type->vector_elements);
1932 }
1933
1934
1935 int
1936 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1937 {
1938 /* Under normal circumstances array elements are stored consecutively, so
1939 * the stride is equal to the size of the array element.
1940 */
1941 return type_size(ir->type);
1942 }
1943
1944
1945 void
1946 vec4_visitor::visit(ir_dereference_array *ir)
1947 {
1948 ir_constant *constant_index;
1949 src_reg src;
1950 int array_stride = compute_array_stride(ir);
1951
1952 constant_index = ir->array_index->constant_expression_value();
1953
1954 ir->array->accept(this);
1955 src = this->result;
1956
1957 if (constant_index) {
1958 src.reg_offset += constant_index->value.i[0] * array_stride;
1959 } else {
1960 /* Variable index array dereference. It eats the "vec4" of the
1961 * base of the array and an index that offsets the Mesa register
1962 * index.
1963 */
1964 ir->array_index->accept(this);
1965
1966 src_reg index_reg;
1967
1968 if (array_stride == 1) {
1969 index_reg = this->result;
1970 } else {
1971 index_reg = src_reg(this, glsl_type::int_type);
1972
1973 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1974 }
1975
1976 if (src.reladdr) {
1977 src_reg temp = src_reg(this, glsl_type::int_type);
1978
1979 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1980
1981 index_reg = temp;
1982 }
1983
1984 src.reladdr = ralloc(mem_ctx, src_reg);
1985 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1986 }
1987
1988 /* If the type is smaller than a vec4, replicate the last channel out. */
1989 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1990 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1991 else
1992 src.swizzle = BRW_SWIZZLE_NOOP;
1993 src.type = brw_type_for_base_type(ir->type);
1994
1995 this->result = src;
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_record *ir)
2000 {
2001 unsigned int i;
2002 const glsl_type *struct_type = ir->record->type;
2003 int offset = 0;
2004
2005 ir->record->accept(this);
2006
2007 for (i = 0; i < struct_type->length; i++) {
2008 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2009 break;
2010 offset += type_size(struct_type->fields.structure[i].type);
2011 }
2012
2013 /* If the type is smaller than a vec4, replicate the last channel out. */
2014 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2015 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2016 else
2017 this->result.swizzle = BRW_SWIZZLE_NOOP;
2018 this->result.type = brw_type_for_base_type(ir->type);
2019
2020 this->result.reg_offset += offset;
2021 }
2022
2023 /**
2024 * We want to be careful in assignment setup to hit the actual storage
2025 * instead of potentially using a temporary like we might with the
2026 * ir_dereference handler.
2027 */
2028 static dst_reg
2029 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2030 {
2031 /* The LHS must be a dereference. If the LHS is a variable indexed array
2032 * access of a vector, it must be separated into a series of conditional moves
2033 * before reaching this point (see ir_vec_index_to_cond_assign).
2034 */
2035 assert(ir->as_dereference());
2036 ir_dereference_array *deref_array = ir->as_dereference_array();
2037 if (deref_array) {
2038 assert(!deref_array->array->type->is_vector());
2039 }
2040
2041 /* Use the rvalue deref handler for the most part. We'll ignore
2042 * swizzles in it and write swizzles using writemask, though.
2043 */
2044 ir->accept(v);
2045 return dst_reg(v->result);
2046 }
2047
2048 void
2049 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2050 const struct glsl_type *type,
2051 enum brw_predicate predicate)
2052 {
2053 if (type->base_type == GLSL_TYPE_STRUCT) {
2054 for (unsigned int i = 0; i < type->length; i++) {
2055 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2056 }
2057 return;
2058 }
2059
2060 if (type->is_array()) {
2061 for (unsigned int i = 0; i < type->length; i++) {
2062 emit_block_move(dst, src, type->fields.array, predicate);
2063 }
2064 return;
2065 }
2066
2067 if (type->is_matrix()) {
2068 const struct glsl_type *vec_type;
2069
2070 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2071 type->vector_elements, 1);
2072
2073 for (int i = 0; i < type->matrix_columns; i++) {
2074 emit_block_move(dst, src, vec_type, predicate);
2075 }
2076 return;
2077 }
2078
2079 assert(type->is_scalar() || type->is_vector());
2080
2081 dst->type = brw_type_for_base_type(type);
2082 src->type = dst->type;
2083
2084 dst->writemask = (1 << type->vector_elements) - 1;
2085
2086 src->swizzle = swizzle_for_size(type->vector_elements);
2087
2088 vec4_instruction *inst = emit(MOV(*dst, *src));
2089 inst->predicate = predicate;
2090
2091 dst->reg_offset++;
2092 src->reg_offset++;
2093 }
2094
2095
2096 /* If the RHS processing resulted in an instruction generating a
2097 * temporary value, and it would be easy to rewrite the instruction to
2098 * generate its result right into the LHS instead, do so. This ends
2099 * up reliably removing instructions where it can be tricky to do so
2100 * later without real UD chain information.
2101 */
2102 bool
2103 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2104 dst_reg dst,
2105 src_reg src,
2106 vec4_instruction *pre_rhs_inst,
2107 vec4_instruction *last_rhs_inst)
2108 {
2109 /* This could be supported, but it would take more smarts. */
2110 if (ir->condition)
2111 return false;
2112
2113 if (pre_rhs_inst == last_rhs_inst)
2114 return false; /* No instructions generated to work with. */
2115
2116 /* Make sure the last instruction generated our source reg. */
2117 if (src.file != GRF ||
2118 src.file != last_rhs_inst->dst.file ||
2119 src.reg != last_rhs_inst->dst.reg ||
2120 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2121 src.reladdr ||
2122 src.abs ||
2123 src.negate ||
2124 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2125 return false;
2126
2127 /* Check that the last instruction fully initialized the channels
2128 * we want to use, in the order we want to use them. We could
2129 * potentially reswizzle the operands of many instructions so that
2130 * we could handle out of order channels, but don't yet.
2131 */
2132
2133 for (unsigned i = 0; i < 4; i++) {
2134 if (dst.writemask & (1 << i)) {
2135 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2136 return false;
2137
2138 if (BRW_GET_SWZ(src.swizzle, i) != i)
2139 return false;
2140 }
2141 }
2142
2143 /* Success! Rewrite the instruction. */
2144 last_rhs_inst->dst.file = dst.file;
2145 last_rhs_inst->dst.reg = dst.reg;
2146 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2147 last_rhs_inst->dst.reladdr = dst.reladdr;
2148 last_rhs_inst->dst.writemask &= dst.writemask;
2149
2150 return true;
2151 }
2152
2153 void
2154 vec4_visitor::visit(ir_assignment *ir)
2155 {
2156 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2157 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2158
2159 if (!ir->lhs->type->is_scalar() &&
2160 !ir->lhs->type->is_vector()) {
2161 ir->rhs->accept(this);
2162 src_reg src = this->result;
2163
2164 if (ir->condition) {
2165 emit_bool_to_cond_code(ir->condition, &predicate);
2166 }
2167
2168 /* emit_block_move doesn't account for swizzles in the source register.
2169 * This should be ok, since the source register is a structure or an
2170 * array, and those can't be swizzled. But double-check to be sure.
2171 */
2172 assert(src.swizzle ==
2173 (ir->rhs->type->is_matrix()
2174 ? swizzle_for_size(ir->rhs->type->vector_elements)
2175 : BRW_SWIZZLE_NOOP));
2176
2177 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2178 return;
2179 }
2180
2181 /* Now we're down to just a scalar/vector with writemasks. */
2182 int i;
2183
2184 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2185 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2186
2187 ir->rhs->accept(this);
2188
2189 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2190
2191 src_reg src = this->result;
2192
2193 int swizzles[4];
2194 int first_enabled_chan = 0;
2195 int src_chan = 0;
2196
2197 assert(ir->lhs->type->is_vector() ||
2198 ir->lhs->type->is_scalar());
2199 dst.writemask = ir->write_mask;
2200
2201 for (int i = 0; i < 4; i++) {
2202 if (dst.writemask & (1 << i)) {
2203 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2204 break;
2205 }
2206 }
2207
2208 /* Swizzle a small RHS vector into the channels being written.
2209 *
2210 * glsl ir treats write_mask as dictating how many channels are
2211 * present on the RHS while in our instructions we need to make
2212 * those channels appear in the slots of the vec4 they're written to.
2213 */
2214 for (int i = 0; i < 4; i++) {
2215 if (dst.writemask & (1 << i))
2216 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2217 else
2218 swizzles[i] = first_enabled_chan;
2219 }
2220 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2221 swizzles[2], swizzles[3]);
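/* For example, with writemask .zw and an RHS swizzle of .xyyy, the loop
 * above produces .yyxy: the two RHS components land in the Z and W slots,
 * and the unwritten slots just repeat a valid source channel.
 */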
2222
2223 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2224 return;
2225 }
2226
2227 if (ir->condition) {
2228 emit_bool_to_cond_code(ir->condition, &predicate);
2229 }
2230
2231 for (i = 0; i < type_size(ir->lhs->type); i++) {
2232 vec4_instruction *inst = emit(MOV(dst, src));
2233 inst->predicate = predicate;
2234
2235 dst.reg_offset++;
2236 src.reg_offset++;
2237 }
2238 }
2239
2240 void
2241 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2242 {
2243 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2244 foreach_in_list(ir_constant, field_value, &ir->components) {
2245 emit_constant_values(dst, field_value);
2246 }
2247 return;
2248 }
2249
2250 if (ir->type->is_array()) {
2251 for (unsigned int i = 0; i < ir->type->length; i++) {
2252 emit_constant_values(dst, ir->array_elements[i]);
2253 }
2254 return;
2255 }
2256
2257 if (ir->type->is_matrix()) {
2258 for (int i = 0; i < ir->type->matrix_columns; i++) {
2259 float *vec = &ir->value.f[i * ir->type->vector_elements];
2260
2261 for (int j = 0; j < ir->type->vector_elements; j++) {
2262 dst->writemask = 1 << j;
2263 dst->type = BRW_REGISTER_TYPE_F;
2264
2265 emit(MOV(*dst, src_reg(vec[j])));
2266 }
2267 dst->reg_offset++;
2268 }
2269 return;
2270 }
2271
2272 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2273
2274 for (int i = 0; i < ir->type->vector_elements; i++) {
2275 if (!(remaining_writemask & (1 << i)))
2276 continue;
2277
2278 dst->writemask = 1 << i;
2279 dst->type = brw_type_for_base_type(ir->type);
2280
2281 /* Find other components that match the one we're about to
2282 * write. Emits fewer instructions for things like vec4(0.5,
2283 * 1.5, 1.5, 1.5).
2284 */
2285 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2286 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2287 if (ir->value.b[i] == ir->value.b[j])
2288 dst->writemask |= (1 << j);
2289 } else {
2290 /* u, i, and f storage all line up, so no need for a
2291 * switch case for comparing each type.
2292 */
2293 if (ir->value.u[i] == ir->value.u[j])
2294 dst->writemask |= (1 << j);
2295 }
2296 }
2297
2298 switch (ir->type->base_type) {
2299 case GLSL_TYPE_FLOAT:
2300 emit(MOV(*dst, src_reg(ir->value.f[i])));
2301 break;
2302 case GLSL_TYPE_INT:
2303 emit(MOV(*dst, src_reg(ir->value.i[i])));
2304 break;
2305 case GLSL_TYPE_UINT:
2306 emit(MOV(*dst, src_reg(ir->value.u[i])));
2307 break;
2308 case GLSL_TYPE_BOOL:
2309 emit(MOV(*dst,
2310 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2311 : 0)));
2312 break;
2313 default:
2314 unreachable("Non-float/uint/int/bool constant");
2315 }
2316
2317 remaining_writemask &= ~dst->writemask;
2318 }
2319 dst->reg_offset++;
2320 }
2321
2322 void
2323 vec4_visitor::visit(ir_constant *ir)
2324 {
2325 dst_reg dst = dst_reg(this, ir->type);
2326 this->result = src_reg(dst);
2327
2328 emit_constant_values(&dst, ir);
2329 }
2330
2331 void
2332 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2333 {
2334 ir_dereference *deref = static_cast<ir_dereference *>(
2335 ir->actual_parameters.get_head());
2336 ir_variable *location = deref->variable_referenced();
2337 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2338 location->data.binding);
2339
2340 /* Calculate the surface offset */
2341 src_reg offset(this, glsl_type::uint_type);
2342 ir_dereference_array *deref_array = deref->as_dereference_array();
2343 if (deref_array) {
2344 deref_array->array_index->accept(this);
2345
2346 src_reg tmp(this, glsl_type::uint_type);
2347 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2348 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2349 } else {
2350 offset = location->data.atomic.offset;
2351 }
2352
2353 /* Emit the appropriate machine instruction */
2354 const char *callee = ir->callee->function_name();
2355 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2356
2357 if (!strcmp("__intrinsic_atomic_read", callee)) {
2358 emit_untyped_surface_read(surf_index, dst, offset);
2359
2360 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2361 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2362 src_reg(), src_reg());
2363
2364 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2365 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2366 src_reg(), src_reg());
2367 }
2368 }
2369
2370 void
2371 vec4_visitor::visit(ir_call *ir)
2372 {
2373 const char *callee = ir->callee->function_name();
2374
2375 if (!strcmp("__intrinsic_atomic_read", callee) ||
2376 !strcmp("__intrinsic_atomic_increment", callee) ||
2377 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2378 visit_atomic_counter_intrinsic(ir);
2379 } else {
2380 unreachable("Unsupported intrinsic.");
2381 }
2382 }
2383
2384 src_reg
2385 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2386 {
2387 vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS);
2388 inst->base_mrf = 2;
2389 inst->mlen = 1;
2390 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2391 inst->dst.writemask = WRITEMASK_XYZW;
2392
2393 inst->src[1] = sampler;
2394
2395 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2396 int param_base = inst->base_mrf;
2397 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2398 int zero_mask = 0xf & ~coord_mask;
2399
2400 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2401 coordinate));
2402
2403 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2404 src_reg(0)));
2405
2406 emit(inst);
2407 return src_reg(inst->dst);
2408 }
2409
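/* A sampler index that doesn't fit in the 4-bit sampler field of the message
 * descriptor has to be supplied through the message header instead.  That
 * only comes up on Haswell and Gen8+, and a non-immediate index has to be
 * treated as potentially >= 16.
 */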
2410 static bool
2411 is_high_sampler(struct brw_context *brw, src_reg sampler)
2412 {
2413 if (brw->gen < 8 && !brw->is_haswell)
2414 return false;
2415
2416 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2417 }
2418
2419 void
2420 vec4_visitor::visit(ir_texture *ir)
2421 {
2422 uint32_t sampler =
2423 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2424
2425 ir_rvalue *nonconst_sampler_index =
2426 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2427
2428 /* Handle non-constant sampler array indexing */
2429 src_reg sampler_reg;
2430 if (nonconst_sampler_index) {
2431 /* The highest sampler which may be used by this operation is
2432 * the last element of the array. Mark it here, because the generator
2433 * doesn't have enough information to determine the bound.
2434 */
2435 uint32_t array_size = ir->sampler->as_dereference_array()
2436 ->array->type->array_size();
2437
2438 uint32_t max_used = sampler + array_size - 1;
2439 if (ir->op == ir_tg4 && brw->gen < 8) {
2440 max_used += prog_data->base.binding_table.gather_texture_start;
2441 } else {
2442 max_used += prog_data->base.binding_table.texture_start;
2443 }
2444
2445 brw_mark_surface_used(&prog_data->base, max_used);
2446
2447 /* Emit code to evaluate the actual indexing expression */
2448 nonconst_sampler_index->accept(this);
2449 dst_reg temp(this, glsl_type::uint_type);
2450 emit(ADD(temp, this->result, src_reg(sampler)))
2451 ->force_writemask_all = true;
2452 sampler_reg = src_reg(temp);
2453 } else {
2454 /* Single sampler, or constant array index; the indexing expression
2455 * is just an immediate.
2456 */
2457 sampler_reg = src_reg(sampler);
2458 }
2459
2460 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2461 * emitting anything other than setting up the constant result.
2462 */
2463 if (ir->op == ir_tg4) {
2464 ir_constant *chan = ir->lod_info.component->as_constant();
2465 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2466 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2467 dst_reg result(this, ir->type);
2468 this->result = src_reg(result);
2469 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2470 return;
2471 }
2472 }
2473
2474 /* Should be lowered by do_lower_texture_projection */
2475 assert(!ir->projector);
2476
2477 /* Should be lowered */
2478 assert(!ir->offset || !ir->offset->type->is_array());
2479
2480 /* Generate code to compute all the subexpression trees. This has to be
2481 * done before loading any values into MRFs for the sampler message since
2482 * generating these values may involve SEND messages that need the MRFs.
2483 */
2484 src_reg coordinate;
2485 if (ir->coordinate) {
2486 ir->coordinate->accept(this);
2487 coordinate = this->result;
2488 }
2489
2490 src_reg shadow_comparitor;
2491 if (ir->shadow_comparitor) {
2492 ir->shadow_comparitor->accept(this);
2493 shadow_comparitor = this->result;
2494 }
2495
2496 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2497 src_reg offset_value;
2498 if (has_nonconstant_offset) {
2499 ir->offset->accept(this);
2500 offset_value = src_reg(this->result);
2501 }
2502
2503 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2504 src_reg lod, dPdx, dPdy, sample_index, mcs;
2505 switch (ir->op) {
2506 case ir_tex:
2507 lod = src_reg(0.0f);
2508 lod_type = glsl_type::float_type;
2509 break;
2510 case ir_txf:
2511 case ir_txl:
2512 case ir_txs:
2513 ir->lod_info.lod->accept(this);
2514 lod = this->result;
2515 lod_type = ir->lod_info.lod->type;
2516 break;
2517 case ir_query_levels:
2518 lod = src_reg(0);
2519 lod_type = glsl_type::int_type;
2520 break;
2521 case ir_txf_ms:
2522 ir->lod_info.sample_index->accept(this);
2523 sample_index = this->result;
2524 sample_index_type = ir->lod_info.sample_index->type;
2525
2526 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2527 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2528 else
2529 mcs = src_reg(0u);
2530 break;
2531 case ir_txd:
2532 ir->lod_info.grad.dPdx->accept(this);
2533 dPdx = this->result;
2534
2535 ir->lod_info.grad.dPdy->accept(this);
2536 dPdy = this->result;
2537
2538 lod_type = ir->lod_info.grad.dPdx->type;
2539 break;
2540 case ir_txb:
2541 case ir_lod:
2542 case ir_tg4:
2543 break;
2544 }
2545
2546 enum opcode opcode;
2547 switch (ir->op) {
2548 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2549 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2550 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2551 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2552 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2553 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2554 case ir_tg4: opcode = has_nonconstant_offset
2555 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2556 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2557 case ir_txb:
2558 unreachable("TXB is not valid for vertex shaders.");
2559 case ir_lod:
2560 unreachable("LOD is not valid for vertex shaders.");
2561 default:
2562 unreachable("Unrecognized tex op");
2563 }
2564
2565 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode);
2566
2567 if (ir->offset != NULL && !has_nonconstant_offset) {
2568 inst->offset =
2569 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2570 ir->offset->type->vector_elements);
2571 }
2572
2573 /* Stuff the channel select bits in the top of the texture offset */
2574 if (ir->op == ir_tg4)
2575 inst->offset |= gather_channel(ir, sampler) << 16;
2576
2577 /* The message header is necessary for:
2578 * - Gen4 (always)
2579 * - Gen9+ for selecting SIMD4x2
2580 * - Texel offsets
2581 * - Gather channel selection
2582 * - Sampler indices too large to fit in a 4-bit value.
2583 */
2584 inst->header_present =
2585 brw->gen < 5 || brw->gen >= 9 ||
2586 inst->offset != 0 || ir->op == ir_tg4 ||
2587 is_high_sampler(brw, sampler_reg);
2588 inst->base_mrf = 2;
2589 inst->mlen = inst->header_present + 1; /* always at least one */
2590 inst->dst = dst_reg(this, ir->type);
2591 inst->dst.writemask = WRITEMASK_XYZW;
2592 inst->shadow_compare = ir->shadow_comparitor != NULL;
2593
2594 inst->src[1] = sampler_reg;
2595
2596 /* MRF for the first parameter */
2597 int param_base = inst->base_mrf + inst->header_present;
2598
2599 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2600 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2601 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2602 } else {
2603 /* Load the coordinate */
2604 /* FINISHME: gl_clamp_mask and saturate */
2605 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2606 int zero_mask = 0xf & ~coord_mask;
2607
2608 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2609 coordinate));
2610
2611 if (zero_mask != 0) {
2612 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2613 src_reg(0)));
2614 }
2615 /* Load the shadow comparitor */
2616 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2617 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2618 WRITEMASK_X),
2619 shadow_comparitor));
2620 inst->mlen++;
2621 }
2622
2623 /* Load the LOD info */
2624 if (ir->op == ir_tex || ir->op == ir_txl) {
2625 int mrf, writemask;
2626 if (brw->gen >= 5) {
2627 mrf = param_base + 1;
2628 if (ir->shadow_comparitor) {
2629 writemask = WRITEMASK_Y;
2630 /* mlen already incremented */
2631 } else {
2632 writemask = WRITEMASK_X;
2633 inst->mlen++;
2634 }
2635 } else /* brw->gen == 4 */ {
2636 mrf = param_base;
2637 writemask = WRITEMASK_W;
2638 }
2639 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2640 } else if (ir->op == ir_txf) {
2641 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2642 } else if (ir->op == ir_txf_ms) {
2643 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2644 sample_index));
2645 if (brw->gen >= 7) {
2646 /* MCS data is in the first channel of `mcs`, but we need to get it into
2647 * the .y channel of the second vec4 of params, so replicate .x across
2648 * the whole vec4 and then mask off everything except .y
2649 */
2650 mcs.swizzle = BRW_SWIZZLE_XXXX;
2651 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2652 mcs));
2653 }
2654 inst->mlen++;
2655 } else if (ir->op == ir_txd) {
2656 const glsl_type *type = lod_type;
2657
2658 if (brw->gen >= 5) {
2659 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2663 inst->mlen++;
2664
2665 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2666 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2667 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2670 inst->mlen++;
2671
2672 if (ir->shadow_comparitor) {
2673 emit(MOV(dst_reg(MRF, param_base + 2,
2674 ir->shadow_comparitor->type, WRITEMASK_Z),
2675 shadow_comparitor));
2676 }
2677 }
2678 } else /* brw->gen == 4 */ {
2679 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2680 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2681 inst->mlen += 2;
2682 }
2683 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2684 if (ir->shadow_comparitor) {
2685 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2686 shadow_comparitor));
2687 }
2688
2689 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2690 offset_value));
2691 inst->mlen++;
2692 }
2693 }
2694
2695 emit(inst);
2696
2697 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2698 * spec requires layers.
2699 */
2700 if (ir->op == ir_txs) {
2701 glsl_type const *type = ir->sampler->type;
2702 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2703 type->sampler_array) {
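/* A cube has 6 faces, so dividing the returned depth (.z) by 6 yields the
 * layer count.
 */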
2704 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2705 writemask(inst->dst, WRITEMASK_Z),
2706 src_reg(inst->dst), src_reg(6));
2707 }
2708 }
2709
2710 if (brw->gen == 6 && ir->op == ir_tg4) {
2711 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2712 }
2713
2714 swizzle_result(ir, src_reg(inst->dst), sampler);
2715 }
2716
2717 /**
2718 * Apply workarounds for Gen6 gather with UINT/SINT
2719 */
2720 void
2721 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2722 {
2723 if (!wa)
2724 return;
2725
2726 int width = (wa & WA_8BIT) ? 8 : 16;
2727 dst_reg dst_f = dst;
2728 dst_f.type = BRW_REGISTER_TYPE_F;
2729
2730 /* Convert from UNORM to UINT */
2731 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2732 emit(MOV(dst, src_reg(dst_f)));
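/* i.e. scale the [0, 1] UNORM value by (2^width - 1) and truncate back to
 * an integer, recovering the raw 8- or 16-bit value.
 */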
2733
2734 if (wa & WA_SIGN) {
2735 /* Reinterpret the UINT value as a signed INT value by
2736 * shifting the sign bit into place, then shifting back
2737 * preserving sign.
2738 */
2739 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2740 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2741 }
2742 }
2743
2744 /**
2745 * Set up the gather channel based on the swizzle, for gather4.
2746 */
2747 uint32_t
2748 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2749 {
2750 ir_constant *chan = ir->lod_info.component->as_constant();
2751 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2752 switch (swiz) {
2753 case SWIZZLE_X: return 0;
2754 case SWIZZLE_Y:
2755 /* gather4 sampler is broken for green channel on RG32F --
2756 * we must ask for blue instead.
2757 */
2758 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2759 return 2;
2760 return 1;
2761 case SWIZZLE_Z: return 2;
2762 case SWIZZLE_W: return 3;
2763 default:
2764 unreachable("Not reached"); /* zero, one swizzles handled already */
2765 }
2766 }
2767
2768 void
2769 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2770 {
2771 int s = key->tex.swizzles[sampler];
2772
2773 this->result = src_reg(this, ir->type);
2774 dst_reg swizzled_result(this->result);
2775
2776 if (ir->op == ir_query_levels) {
2777 /* # levels is in .w */
2778 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2779 emit(MOV(swizzled_result, orig_val));
2780 return;
2781 }
2782
2783 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2784 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2785 emit(MOV(swizzled_result, orig_val));
2786 return;
2787 }
2788
2789
2790 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2791 int swizzle[4] = {0};
2792
2793 for (int i = 0; i < 4; i++) {
2794 switch (GET_SWZ(s, i)) {
2795 case SWIZZLE_ZERO:
2796 zero_mask |= (1 << i);
2797 break;
2798 case SWIZZLE_ONE:
2799 one_mask |= (1 << i);
2800 break;
2801 default:
2802 copy_mask |= (1 << i);
2803 swizzle[i] = GET_SWZ(s, i);
2804 break;
2805 }
2806 }
2807
2808 if (copy_mask) {
2809 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2810 swizzled_result.writemask = copy_mask;
2811 emit(MOV(swizzled_result, orig_val));
2812 }
2813
2814 if (zero_mask) {
2815 swizzled_result.writemask = zero_mask;
2816 emit(MOV(swizzled_result, src_reg(0.0f)));
2817 }
2818
2819 if (one_mask) {
2820 swizzled_result.writemask = one_mask;
2821 emit(MOV(swizzled_result, src_reg(1.0f)));
2822 }
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_return *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::visit(ir_discard *)
2833 {
2834 unreachable("not reached");
2835 }
2836
2837 void
2838 vec4_visitor::visit(ir_if *ir)
2839 {
2840 /* Don't point the annotation at the if statement, because then it plus
2841 * the then and else blocks get printed.
2842 */
2843 this->base_ir = ir->condition;
2844
2845 if (brw->gen == 6) {
2846 emit_if_gen6(ir);
2847 } else {
2848 enum brw_predicate predicate;
2849 emit_bool_to_cond_code(ir->condition, &predicate);
2850 emit(IF(predicate));
2851 }
2852
2853 visit_instructions(&ir->then_instructions);
2854
2855 if (!ir->else_instructions.is_empty()) {
2856 this->base_ir = ir->condition;
2857 emit(BRW_OPCODE_ELSE);
2858
2859 visit_instructions(&ir->else_instructions);
2860 }
2861
2862 this->base_ir = ir->condition;
2863 emit(BRW_OPCODE_ENDIF);
2864 }
2865
2866 void
2867 vec4_visitor::visit(ir_emit_vertex *)
2868 {
2869 unreachable("not reached");
2870 }
2871
2872 void
2873 vec4_visitor::visit(ir_end_primitive *)
2874 {
2875 unreachable("not reached");
2876 }
2877
2878 void
2879 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2880 dst_reg dst, src_reg offset,
2881 src_reg src0, src_reg src1)
2882 {
2883 unsigned mlen = 0;
2884
2885 /* Set the atomic operation offset. */
2886 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2887 mlen++;
2888
2889 /* Set the atomic operation arguments. */
2890 if (src0.file != BAD_FILE) {
2891 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2892 mlen++;
2893 }
2894
2895 if (src1.file != BAD_FILE) {
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2897 mlen++;
2898 }
2899
2900 /* Emit the instruction. Note that this maps to the normal SIMD8
2901 * untyped atomic message on Ivy Bridge, but that's OK because
2902 * unused channels will be masked out.
2903 */
2904 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2905 src_reg(atomic_op), src_reg(surf_index));
2906 inst->base_mrf = 0;
2907 inst->mlen = mlen;
2908 }
2909
2910 void
2911 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2912 src_reg offset)
2913 {
2914 /* Set the surface read offset. */
2915 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2916
2917 /* Emit the instruction. Note that this maps to the normal SIMD8
2918 * untyped surface read message, but that's OK because unused
2919 * channels will be masked out.
2920 */
2921 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2922 dst, src_reg(surf_index));
2923 inst->base_mrf = 0;
2924 inst->mlen = 1;
2925 }
2926
2927 void
2928 vec4_visitor::emit_ndc_computation()
2929 {
2930 /* Get the position */
2931 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2932
2933 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2934 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2935 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2936
2937 current_annotation = "NDC";
2938 dst_reg ndc_w = ndc;
2939 ndc_w.writemask = WRITEMASK_W;
2940 src_reg pos_w = pos;
2941 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2942 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2943
2944 dst_reg ndc_xyz = ndc;
2945 ndc_xyz.writemask = WRITEMASK_XYZ;
2946
2947 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2948 }
2949
2950 void
2951 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2952 {
2953 if (brw->gen < 6 &&
2954 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2955 key->userclip_active || brw->has_negative_rhw_bug)) {
2956 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2957 dst_reg header1_w = header1;
2958 header1_w.writemask = WRITEMASK_W;
2959
2960 emit(MOV(header1, 0u));
2961
2962 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2963 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2964
2965 current_annotation = "Point size";
2966 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2967 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2968 }
2969
2970 if (key->userclip_active) {
2971 current_annotation = "Clipping flags";
2972 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2973 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2974
2975 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2976 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2977 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2978
2979 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2980 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2981 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2982 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2983 }
2984
2985 /* i965 clipping workaround:
2986 * 1) Test for -ve rhw
2987 * 2) If set,
2988 * set ndc = (0,0,0,0)
2989 * set ucp[6] = 1
2990 *
2991 * Later, clipping will detect ucp[6] and ensure the primitive is
2992 * clipped against all fixed planes.
2993 */
2994 if (brw->has_negative_rhw_bug) {
2995 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2996 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2997 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2998 vec4_instruction *inst;
2999 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3000 inst->predicate = BRW_PREDICATE_NORMAL;
3001 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3002 inst->predicate = BRW_PREDICATE_NORMAL;
3003 }
3004
3005 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3006 } else if (brw->gen < 6) {
3007 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3008 } else {
3009 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3010 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3011 dst_reg reg_w = reg;
3012 reg_w.writemask = WRITEMASK_W;
3013 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3014 }
3015 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3016 dst_reg reg_y = reg;
3017 reg_y.writemask = WRITEMASK_Y;
3018 reg_y.type = BRW_REGISTER_TYPE_D;
3019 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3020 }
3021 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3022 dst_reg reg_z = reg;
3023 reg_z.writemask = WRITEMASK_Z;
3024 reg_z.type = BRW_REGISTER_TYPE_D;
3025 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3026 }
3027 }
3028 }
3029
3030 void
3031 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3032 {
3033 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3034 *
3035 * "If a linked set of shaders forming the vertex stage contains no
3036 * static write to gl_ClipVertex or gl_ClipDistance, but the
3037 * application has requested clipping against user clip planes through
3038 * the API, then the coordinate written to gl_Position is used for
3039 * comparison against the user clip planes."
3040 *
3041 * This function is only called if the shader didn't write to
3042 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3043 * if the user wrote to it; otherwise we use gl_Position.
3044 */
3045 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3046 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3047 clip_vertex = VARYING_SLOT_POS;
3048 }
3049
3050 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3051 ++i) {
3052 reg.writemask = 1 << i;
3053 emit(DP4(reg,
3054 src_reg(output_reg[clip_vertex]),
3055 src_reg(this->userplane[i + offset])));
3056 }
3057 }
3058
3059 vec4_instruction *
3060 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3061 {
3062 assert (varying < VARYING_SLOT_MAX);
3063 reg.type = output_reg[varying].type;
3064 current_annotation = output_reg_annotation[varying];
3065 /* Copy the register, saturating if necessary */
3066 return emit(MOV(reg, src_reg(output_reg[varying])));
3067 }
3068
3069 void
3070 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3071 {
3072 reg.type = BRW_REGISTER_TYPE_F;
3073
3074 switch (varying) {
3075 case VARYING_SLOT_PSIZ:
3076 {
3077 /* PSIZ is always in slot 0, and is coupled with other flags. */
3078 current_annotation = "indices, point width, clip flags";
3079 emit_psiz_and_flags(reg);
3080 break;
3081 }
3082 case BRW_VARYING_SLOT_NDC:
3083 current_annotation = "NDC";
3084 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3085 break;
3086 case VARYING_SLOT_POS:
3087 current_annotation = "gl_Position";
3088 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3089 break;
3090 case VARYING_SLOT_EDGE:
3091 /* This is present when doing unfilled polygons. We're supposed to copy
3092 * the edge flag from the user-provided vertex array
3093 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3094 * of that attribute (starts as 1.0f). This is then used in clipping to
3095 * determine which edges should be drawn as wireframe.
3096 */
3097 current_annotation = "edge flag";
3098 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3099 glsl_type::float_type, WRITEMASK_XYZW))));
3100 break;
3101 case BRW_VARYING_SLOT_PAD:
3102 /* No need to write to this slot */
3103 break;
3104 case VARYING_SLOT_COL0:
3105 case VARYING_SLOT_COL1:
3106 case VARYING_SLOT_BFC0:
3107 case VARYING_SLOT_BFC1: {
3108 /* These built-in varyings are only supported in compatibility mode,
3109 * and we only support GS in core profile. So, this must be a vertex
3110 * shader.
3111 */
3112 assert(stage == MESA_SHADER_VERTEX);
3113 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3114 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3115 inst->saturate = true;
3116 break;
3117 }
3118
3119 default:
3120 emit_generic_urb_slot(reg, varying);
3121 break;
3122 }
3123 }
3124
3125 static int
3126 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3127 {
3128 if (brw->gen >= 6) {
3129 /* URB data written (does not include the message header reg) must
3130 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3131 * section 5.4.3.2.2: URB_INTERLEAVED.
3132 *
3133 * URB entries are allocated on a multiple of 1024 bits, so an
3134 * extra 128 bits written here to make the end align to 256 is
3135 * no problem.
3136 */
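/* mlen counts the message header as well, so the URB data length is
 * mlen - 1; rounding mlen up to the next odd value keeps that data
 * length an even number of registers.
 */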
3137 if ((mlen % 2) != 1)
3138 mlen++;
3139 }
3140
3141 return mlen;
3142 }
3143
3144
3145 /**
3146 * Generates the VUE payload plus the necessary URB write instructions to
3147 * output it.
3148 *
3149 * The VUE layout is documented in Volume 2a.
3150 */
3151 void
3152 vec4_visitor::emit_vertex()
3153 {
3154 /* MRF 0 is reserved for the debugger, so start with message header
3155 * in MRF 1.
3156 */
3157 int base_mrf = 1;
3158 int mrf = base_mrf;
3159 /* In the process of generating our URB write message contents, we
3160 * may need to unspill a register or load from an array. Those
3161 * reads would use MRFs 14-15.
3162 */
3163 int max_usable_mrf = 13;
3164
3165 /* The following assertion verifies that max_usable_mrf causes an
3166 * even-numbered amount of URB write data, which will meet gen6's
3167 * requirements for length alignment.
3168 */
3169 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3170
3171 /* First mrf is the g0-based message header containing URB handles and
3172 * such.
3173 */
3174 emit_urb_write_header(mrf++);
3175
3176 if (brw->gen < 6) {
3177 emit_ndc_computation();
3178 }
3179
3180 /* Lower legacy ff and ClipVertex clipping to clip distances */
3181 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3182 current_annotation = "user clip distances";
3183
3184 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3185 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3186
3187 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3188 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3189 }
3190
3191 /* We may need to split this up into several URB writes, so do them in a
3192 * loop.
3193 */
3194 int slot = 0;
3195 bool complete = false;
3196 do {
3197 /* URB offset is in URB row increments, and each of our MRFs is half of
3198 * one of those, since we're doing interleaved writes.
3199 */
3200 int offset = slot / 2;
3201
3202 mrf = base_mrf + 1;
3203 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3204 emit_urb_slot(dst_reg(MRF, mrf++),
3205 prog_data->vue_map.slot_to_varying[slot]);
3206
3207 /* If this was max_usable_mrf, we can't fit anything more into this
3208 * URB WRITE.
3209 */
3210 if (mrf > max_usable_mrf) {
3211 slot++;
3212 break;
3213 }
3214 }
3215
3216 complete = slot >= prog_data->vue_map.num_slots;
3217 current_annotation = "URB write";
3218 vec4_instruction *inst = emit_urb_write_opcode(complete);
3219 inst->base_mrf = base_mrf;
3220 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3221 inst->offset += offset;
3222 } while (!complete);
3223 }
3224
3225
3226 src_reg
3227 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3228 src_reg *reladdr, int reg_offset)
3229 {
3230 /* Because we store the values to scratch interleaved like our
3231 * vertex data, we need to scale the vec4 index by 2.
3232 */
3233 int message_header_scale = 2;
3234
3235 /* Pre-gen6, the message header uses byte offsets instead of vec4
3236 * (16-byte) offset units.
3237 */
3238 if (brw->gen < 6)
3239 message_header_scale *= 16;
3240
3241 if (reladdr) {
3242 src_reg index = src_reg(this, glsl_type::int_type);
3243
3244 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3245 src_reg(reg_offset)));
3246 emit_before(block, inst, MUL(dst_reg(index), index,
3247 src_reg(message_header_scale)));
3248
3249 return index;
3250 } else {
3251 return src_reg(reg_offset * message_header_scale);
3252 }
3253 }
3254
3255 src_reg
3256 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3257 src_reg *reladdr, int reg_offset)
3258 {
3259 if (reladdr) {
3260 src_reg index = src_reg(this, glsl_type::int_type);
3261
3262 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3263 src_reg(reg_offset)));
3264
3265 /* Pre-gen6, the message header uses byte offsets instead of vec4
3266 * (16-byte) offset units.
3267 */
3268 if (brw->gen < 6) {
3269 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3270 }
3271
3272 return index;
3273 } else if (brw->gen >= 8) {
3274 /* Store the offset in a GRF so we can send-from-GRF. */
3275 src_reg offset = src_reg(this, glsl_type::int_type);
3276 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3277 return offset;
3278 } else {
3279 int message_header_scale = brw->gen < 6 ? 16 : 1;
3280 return src_reg(reg_offset * message_header_scale);
3281 }
3282 }
3283
3284 /**
3285 * Emits an instruction before @inst to load the value named by @orig_src
3286 * from scratch space at @base_offset to @temp.
3287 *
3288 * @base_offset is measured in 32-byte units (the size of a register).
3289 */
3290 void
3291 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3292 dst_reg temp, src_reg orig_src,
3293 int base_offset)
3294 {
3295 int reg_offset = base_offset + orig_src.reg_offset;
3296 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3297 reg_offset);
3298
3299 emit_before(block, inst, SCRATCH_READ(temp, index));
3300 }
3301
3302 /**
3303 * Emits an instruction after @inst to store the value to be written
3304 * to @orig_dst to scratch space at @base_offset, from @temp.
3305 *
3306 * @base_offset is measured in 32-byte units (the size of a register).
3307 */
3308 void
3309 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3310 int base_offset)
3311 {
3312 int reg_offset = base_offset + inst->dst.reg_offset;
3313 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3314 reg_offset);
3315
3316 /* Create a temporary register to store *inst's result in.
3317 *
3318 * We have to be careful in MOVing from our temporary result register in
3319 * the scratch write. If we swizzle from channels of the temporary that
3320 * weren't initialized, it will confuse live interval analysis, which will
3321 * make spilling fail to make progress.
3322 */
3323 src_reg temp = src_reg(this, glsl_type::vec4_type);
3324 temp.type = inst->dst.type;
3325 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3326 int swizzles[4];
3327 for (int i = 0; i < 4; i++)
3328 if (inst->dst.writemask & (1 << i))
3329 swizzles[i] = i;
3330 else
3331 swizzles[i] = first_writemask_chan;
3332 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3333 swizzles[2], swizzles[3]);
3334
3335 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3336 inst->dst.writemask));
3337 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3338 write->predicate = inst->predicate;
3339 write->ir = inst->ir;
3340 write->annotation = inst->annotation;
3341 inst->insert_after(block, write);
3342
3343 inst->dst.file = temp.file;
3344 inst->dst.reg = temp.reg;
3345 inst->dst.reg_offset = temp.reg_offset;
3346 inst->dst.reladdr = NULL;
3347 }
3348
3349 /**
3350 * We can't generally support array access in GRF space, because a
3351 * single instruction's destination can only span 2 contiguous
3352 * registers. So, we send all GRF arrays that get variable index
3353 * access to scratch space.
3354 */
3355 void
3356 vec4_visitor::move_grf_array_access_to_scratch()
3357 {
3358 int scratch_loc[this->alloc.count];
3359 memset(scratch_loc, -1, sizeof(scratch_loc));
3360
3361 /* First, calculate the set of virtual GRFs that need to be punted
3362 * to scratch due to having any array access on them, and where in
3363 * scratch.
3364 */
3365 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3366 if (inst->dst.file == GRF && inst->dst.reladdr &&
3367 scratch_loc[inst->dst.reg] == -1) {
3368 scratch_loc[inst->dst.reg] = c->last_scratch;
3369 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3370 }
3371
3372 for (int i = 0 ; i < 3; i++) {
3373 src_reg *src = &inst->src[i];
3374
3375 if (src->file == GRF && src->reladdr &&
3376 scratch_loc[src->reg] == -1) {
3377 scratch_loc[src->reg] = c->last_scratch;
3378 c->last_scratch += this->alloc.sizes[src->reg];
3379 }
3380 }
3381 }
3382
3383 /* Now, for anything that will be accessed through scratch, rewrite
3384 * it to load/store. Note that this is a _safe list walk, because
3385 * we may generate a new scratch_write instruction after the one
3386 * we're processing.
3387 */
3388 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3389 /* Set up the annotation tracking for new generated instructions. */
3390 base_ir = inst->ir;
3391 current_annotation = inst->annotation;
3392
3393 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3394 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3395 }
3396
3397 for (int i = 0 ; i < 3; i++) {
3398 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3399 continue;
3400
3401 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3402
3403 emit_scratch_read(block, inst, temp, inst->src[i],
3404 scratch_loc[inst->src[i].reg]);
3405
3406 inst->src[i].file = temp.file;
3407 inst->src[i].reg = temp.reg;
3408 inst->src[i].reg_offset = temp.reg_offset;
3409 inst->src[i].reladdr = NULL;
3410 }
3411 }
3412 }
3413
3414 /**
3415 * Emits an instruction before @inst to load the value named by @orig_src
3416 * from the pull constant buffer (surface) at @base_offset to @temp.
3417 */
3418 void
3419 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3420 dst_reg temp, src_reg orig_src,
3421 int base_offset)
3422 {
3423 int reg_offset = base_offset + orig_src.reg_offset;
3424 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3425 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3426 reg_offset);
3427 vec4_instruction *load;
3428
3429 if (brw->gen >= 7) {
3430 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3431 grf_offset.type = offset.type;
3432 emit_before(block, inst, MOV(grf_offset, offset));
3433
3434 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3435 temp, index, src_reg(grf_offset));
3436 } else {
3437 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3438 temp, index, offset);
3439 load->base_mrf = 14;
3440 load->mlen = 1;
3441 }
3442 emit_before(block, inst, load);
3443 }
3444
3445 /**
3446 * Implements array access of uniforms by inserting a
3447 * PULL_CONSTANT_LOAD instruction.
3448 *
3449 * Unlike temporary GRF array access (where we don't support it due to
3450 * the difficulty of doing relative addressing on instruction
3451 * destinations), we could potentially do array access of uniforms
3452 * that were loaded in GRF space as push constants. In real-world
3453 * usage we've seen, though, the arrays being used are always larger
3454 * than we could load as push constants, so just always move all
3455 * uniform array access out to a pull constant buffer.
3456 */
3457 void
3458 vec4_visitor::move_uniform_array_access_to_pull_constants()
3459 {
3460 int pull_constant_loc[this->uniforms];
3461 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3462 bool nested_reladdr;
3463
3464 /* Walk through and find array access of uniforms. Put a copy of that
3465 * uniform in the pull constant buffer.
3466 *
3467 * Note that we don't move constant-indexed accesses to arrays. No
3468 * testing has been done of the performance impact of this choice.
3469 */
3470 do {
3471 nested_reladdr = false;
3472
3473 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3474 for (int i = 0 ; i < 3; i++) {
3475 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3476 continue;
3477
3478 int uniform = inst->src[i].reg;
3479
3480 if (inst->src[i].reladdr->reladdr)
3481 nested_reladdr = true; /* will need another pass */
3482
3483 /* If this array isn't already present in the pull constant buffer,
3484 * add it.
3485 */
3486 if (pull_constant_loc[uniform] == -1) {
3487 const gl_constant_value **values =
3488 &stage_prog_data->param[uniform * 4];
3489
3490 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
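/* pull_param is tracked per scalar component (four per uniform vec4), so
 * nr_pull_params / 4 is the vec4 slot this uniform will occupy.
 */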
3491
3492 assert(uniform < uniform_array_size);
3493 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3494 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3495 = values[j];
3496 }
3497 }
3498
3499 /* Set up the annotation tracking for new generated instructions. */
3500 base_ir = inst->ir;
3501 current_annotation = inst->annotation;
3502
3503 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3504
3505 emit_pull_constant_load(block, inst, temp, inst->src[i],
3506 pull_constant_loc[uniform]);
3507
3508 inst->src[i].file = temp.file;
3509 inst->src[i].reg = temp.reg;
3510 inst->src[i].reg_offset = temp.reg_offset;
3511 inst->src[i].reladdr = NULL;
3512 }
3513 }
3514 } while (nested_reladdr);
3515
3516 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3517 * no need to track them as larger-than-vec4 objects. This will be
3518 * relied on in cutting out unused uniform vectors from push
3519 * constants.
3520 */
3521 split_uniform_registers();
3522 }
3523
3524 void
3525 vec4_visitor::resolve_ud_negate(src_reg *reg)
3526 {
3527 if (reg->type != BRW_REGISTER_TYPE_UD ||
3528 !reg->negate)
3529 return;
3530
3531 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3532 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3533 *reg = temp;
3534 }
3535
3536 /**
3537 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3538 *
3539 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3540 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3541 */
3542 void
3543 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3544 {
3545 assert(brw->gen <= 5);
3546
3547 if (!rvalue->type->is_boolean())
3548 return;
3549
3550 src_reg and_result = src_reg(this, rvalue->type);
3551 src_reg neg_result = src_reg(this, rvalue->type);
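/* Isolate the defined LSB, then negate it: -(x & 1) is 0 for false and
 * ~0 for true, the proper boolean values described above.
 */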
3552 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3553 emit(MOV(dst_reg(neg_result), negate(and_result)));
3554 *reg = neg_result;
3555 }
3556
3557 vec4_visitor::vec4_visitor(struct brw_context *brw,
3558 struct brw_vec4_compile *c,
3559 struct gl_program *prog,
3560 const struct brw_vue_prog_key *key,
3561 struct brw_vue_prog_data *prog_data,
3562 struct gl_shader_program *shader_prog,
3563 gl_shader_stage stage,
3564 void *mem_ctx,
3565 bool debug_flag,
3566 bool no_spills,
3567 shader_time_shader_type st_base,
3568 shader_time_shader_type st_written,
3569 shader_time_shader_type st_reset)
3570 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3571 c(c),
3572 key(key),
3573 prog_data(prog_data),
3574 sanity_param_count(0),
3575 fail_msg(NULL),
3576 first_non_payload_grf(0),
3577 need_all_constants_in_pull_buffer(false),
3578 debug_flag(debug_flag),
3579 no_spills(no_spills),
3580 st_base(st_base),
3581 st_written(st_written),
3582 st_reset(st_reset)
3583 {
3584 this->mem_ctx = mem_ctx;
3585 this->failed = false;
3586
3587 this->base_ir = NULL;
3588 this->current_annotation = NULL;
3589 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3590
3591 this->variable_ht = hash_table_ctor(0,
3592 hash_table_pointer_hash,
3593 hash_table_pointer_compare);
3594
3595 this->virtual_grf_start = NULL;
3596 this->virtual_grf_end = NULL;
3597 this->live_intervals = NULL;
3598
3599 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3600
3601 this->uniforms = 0;
3602
3603 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3604 * at least one. See setup_uniforms() in brw_vec4.cpp.
3605 */
3606 this->uniform_array_size = 1;
3607 if (prog_data) {
3608 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3609 }
3610
3611 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3612 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3613 }
3614
3615 vec4_visitor::~vec4_visitor()
3616 {
3617 hash_table_dtor(this->variable_ht);
3618 }
3619
3620
3621 void
3622 vec4_visitor::fail(const char *format, ...)
3623 {
3624 va_list va;
3625 char *msg;
3626
3627 if (failed)
3628 return;
3629
3630 failed = true;
3631
3632 va_start(va, format);
3633 msg = ralloc_vasprintf(mem_ctx, format, va);
3634 va_end(va);
3635 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3636
3637 this->fail_msg = msg;
3638
3639 if (debug_flag) {
3640 fprintf(stderr, "%s", msg);
3641 }
3642 }
3643
3644 } /* namespace brw */