i965/vec4: do not predicate scratch writes for BRW_OPCODE_SEL instructions
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
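/* As an illustration of what the list below generates, ALU2(ADD) expands to a
 * convenience builder that creates (but does not emit) the instruction:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Callers pass the result to emit(), e.g. emit(ADD(dst, a, b)).
 */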
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
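/* A typical use of CMP in this file is to set the flag register for a later
 * predicated instruction, e.g. the ir_binop_all_equal handling below:
 *
 *    emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 *    emit(MOV(result_dst, src_reg(0)));
 *    inst = emit(MOV(result_dst, src_reg(~0)));
 *    inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 *
 * where the second MOV only executes if all four channels compared equal.
 */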
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
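/* For example, a MAD whose operand is a vec4 uniform (or an immediate) u
 * cannot consume it directly, so the code above inserts
 *
 *    unpack_uniform tmp, u      (VEC4_OPCODE_UNPACK_UNIFORM)
 *
 * and the three-source instruction reads tmp instead.  A uniform that
 * already uses a single-value swizzle such as u.xxxx is returned unchanged,
 * since a scalar component can be replicated through the swizzle alone,
 * without the vertical stride of zero that three-source instructions cannot
 * encode.
 */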
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 vec4_instruction *
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 math = emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358
359 return math;
360 }
361
362 void
363 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
364 {
365 if (devinfo->gen < 7) {
366 unreachable("ir_unop_pack_half_2x16 should be lowered");
367 }
368
369 assert(dst.type == BRW_REGISTER_TYPE_UD);
370 assert(src0.type == BRW_REGISTER_TYPE_F);
371
372 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
373 *
374 * Because this instruction does not have a 16-bit floating-point type,
375 * the destination data type must be Word (W).
376 *
377 * The destination must be DWord-aligned and specify a horizontal stride
378 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
379 * each destination channel and the upper word is not modified.
380 *
381 * The above restriction implies that the f32to16 instruction must use
382 * align1 mode, because only in align1 mode is it possible to specify
383 * horizontal stride. We choose here to defy the hardware docs and emit
384 * align16 instructions.
385 *
386 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
387 * instructions. I was partially successful in that the code passed all
388 * tests. However, the code was dubiously correct and fragile, and the
389 * tests were not harsh enough to probe that frailty. Not trusting the
390 * code, I chose instead to remain in align16 mode in defiance of the hw
391 * docs).
392 *
393 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
394 * simulator, emitting a f32to16 in align16 mode with UD as destination
395 * data type is safe. The behavior differs from that specified in the PRM
396 * in that the upper word of each destination channel is cleared to 0.
397 */
398
399 dst_reg tmp_dst(this, glsl_type::uvec2_type);
400 src_reg tmp_src(tmp_dst);
401
402 #if 0
403 /* Verify the undocumented behavior on which the following instructions
404 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
405 * then the result of the bit-or instruction below will be incorrect.
406 *
407 * You should inspect the disasm output in order to verify that the MOV is
408 * not optimized away.
409 */
410 emit(MOV(tmp_dst, src_reg(0x12345678u)));
411 #endif
412
413 /* Give tmp the form below, where "." means untouched.
414 *
415 * w z y x w z y x
416 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
417 *
418 * That the upper word of each write-channel be 0 is required for the
419 * following bit-shift and bit-or instructions to work. Note that this
420 * relies on the undocumented hardware behavior mentioned above.
421 */
422 tmp_dst.writemask = WRITEMASK_XY;
423 emit(F32TO16(tmp_dst, src0));
424
425 /* Give the write-channels of dst the form:
426 * 0xhhhh0000
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
429 emit(SHL(dst, tmp_src, src_reg(16u)));
430
431 /* Finally, give the write-channels of dst the form of packHalf2x16's
432 * output:
433 * 0xhhhhllll
434 */
435 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
436 emit(OR(dst, src_reg(dst), tmp_src));
437 }
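/* Illustrative walk-through of the sequence above for one channel: if
 * f32to16 converts src0.x to the half-float bits 0xllll and src0.y to
 * 0xhhhh, then
 *
 *    tmp = |.|.|0x0000hhhh|0x0000llll|   (F32TO16, writemask .xy)
 *    dst = 0xhhhh0000                    (SHL of tmp.yyyy by 16)
 *    dst = 0xhhhhllll                    (OR with tmp.xxxx)
 *
 * which matches GLSL packHalf2x16(): the first component ends up in the
 * least significant half of the result.
 */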
438
439 void
440 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
441 {
442 if (devinfo->gen < 7) {
443 unreachable("ir_unop_unpack_half_2x16 should be lowered");
444 }
445
446 assert(dst.type == BRW_REGISTER_TYPE_F);
447 assert(src0.type == BRW_REGISTER_TYPE_UD);
448
449 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
450 *
451 * Because this instruction does not have a 16-bit floating-point type,
452 * the source data type must be Word (W). The destination type must be
453 * F (Float).
454 *
455 * To use W as the source data type, we must adjust horizontal strides,
456 * which is only possible in align1 mode. All my [chadv] attempts at
457 * emitting align1 instructions for unpackHalf2x16 failed to pass the
458 * Piglit tests, so I gave up.
459 *
460 * I've verified that, on gen7 hardware and the simulator, it is safe to
461 * emit f16to32 in align16 mode with UD as source data type.
462 */
463
464 dst_reg tmp_dst(this, glsl_type::uvec2_type);
465 src_reg tmp_src(tmp_dst);
466
467 tmp_dst.writemask = WRITEMASK_X;
468 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
469
470 tmp_dst.writemask = WRITEMASK_Y;
471 emit(SHR(tmp_dst, src0, src_reg(16u)));
472
473 dst.writemask = WRITEMASK_XY;
474 emit(F16TO32(dst, tmp_src));
475 }
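/* Illustrative walk-through: for src0.x = 0xhhhhllll the sequence above
 * produces
 *
 *    tmp.x  = 0x0000llll           (AND with 0xffff)
 *    tmp.y  = 0x0000hhhh           (SHR by 16)
 *    dst.xy = f16to32(tmp.xy)
 *
 * so dst.x receives the value packed in the low half and dst.y the value in
 * the high half, matching GLSL unpackHalf2x16().
 */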
476
477 void
478 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
479 {
480 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
481 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
482 * is not suitable to generate the shift values, but we can use the packed
483 * vector float and a type-converting MOV.
484 */
485 dst_reg shift(this, glsl_type::uvec4_type);
486 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
487
488 dst_reg shifted(this, glsl_type::uvec4_type);
489 src0.swizzle = BRW_SWIZZLE_XXXX;
490 emit(SHR(shifted, src0, src_reg(shift)));
491
492 shifted.type = BRW_REGISTER_TYPE_UB;
493 dst_reg f(this, glsl_type::vec4_type);
494 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
495
496 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
497 }
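/* A note on the magic immediate above: 0x00, 0x60, 0x70 and 0x78 are the
 * 8-bit restricted-float (VF) encodings of 0.0, 8.0, 16.0 and 24.0, so the
 * type-converting MOV leaves the shift counts <0, 8, 16, 24> in the uvec4.
 * For src0 = 0xwwzzyyxx the SHR then yields per-channel values whose low
 * byte is 0xxx, 0xyy, 0xzz and 0xww respectively; the UB-typed MOV_BYTES
 * keeps just that byte, and the final MUL scales it by 1/255.  The least
 * significant byte of the input therefore lands in .x, as unpackUnorm4x8()
 * requires.
 */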
498
499 void
500 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
501 {
502 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
503 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
504 * is not suitable to generate the shift values, but we can use the packed
505 * vector float and a type-converting MOV.
506 */
507 dst_reg shift(this, glsl_type::uvec4_type);
508 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
509
510 dst_reg shifted(this, glsl_type::uvec4_type);
511 src0.swizzle = BRW_SWIZZLE_XXXX;
512 emit(SHR(shifted, src0, src_reg(shift)));
513
514 shifted.type = BRW_REGISTER_TYPE_B;
515 dst_reg f(this, glsl_type::vec4_type);
516 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
517
518 dst_reg scaled(this, glsl_type::vec4_type);
519 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
520
521 dst_reg max(this, glsl_type::vec4_type);
522 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
523 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
524 }
525
526 void
527 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
528 {
529 dst_reg saturated(this, glsl_type::vec4_type);
530 vec4_instruction *inst = emit(MOV(saturated, src0));
531 inst->saturate = true;
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
535
536 dst_reg rounded(this, glsl_type::vec4_type);
537 emit(RNDE(rounded, src_reg(scaled)));
538
539 dst_reg u(this, glsl_type::uvec4_type);
540 emit(MOV(u, src_reg(rounded)));
541
542 src_reg bytes(u);
543 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
544 }
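/* Worked example (assuming VEC4_OPCODE_PACK_BYTES stores .x in the least
 * significant byte, as packUnorm4x8() requires): the input
 * vec4(1.0, 0.5, 0.0, 0.25) saturates to itself, scales to
 * (255.0, 127.5, 0.0, 63.75), rounds to even as (255, 128, 0, 64) and is
 * packed into 0x400080ff.
 */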
545
546 void
547 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
548 {
549 dst_reg max(this, glsl_type::vec4_type);
550 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
551
552 dst_reg min(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
554
555 dst_reg scaled(this, glsl_type::vec4_type);
556 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
557
558 dst_reg rounded(this, glsl_type::vec4_type);
559 emit(RNDE(rounded, src_reg(scaled)));
560
561 dst_reg i(this, glsl_type::ivec4_type);
562 emit(MOV(i, src_reg(rounded)));
563
564 src_reg bytes(i);
565 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
566 }
567
568 void
569 vec4_visitor::visit_instructions(const exec_list *list)
570 {
571 foreach_in_list(ir_instruction, ir, list) {
572 base_ir = ir;
573 ir->accept(this);
574 }
575 }
576
577 /**
578 * Returns the minimum number of vec4 elements needed to pack a type.
579 *
580 * For simple types, it will return 1 (a single vec4); for matrices, the
581 * number of columns; for array and struct, the sum of the vec4_size of
582 * each of its elements; and for sampler and atomic, zero.
583 *
584 * This method is useful to calculate how much register space is needed to
585 * store a particular type.
586 */
587 int
588 vec4_visitor::type_size(const struct glsl_type *type)
589 {
590 unsigned int i;
591 int size;
592
593 switch (type->base_type) {
594 case GLSL_TYPE_UINT:
595 case GLSL_TYPE_INT:
596 case GLSL_TYPE_FLOAT:
597 case GLSL_TYPE_BOOL:
598 if (type->is_matrix()) {
599 return type->matrix_columns;
600 } else {
601 /* Regardless of size of vector, it gets a vec4. This is bad
602 * packing for things like floats, but otherwise arrays become a
603 * mess. Hopefully a later pass over the code can pack scalars
604 * down if appropriate.
605 */
606 return 1;
607 }
608 case GLSL_TYPE_ARRAY:
609 assert(type->length > 0);
610 return type_size(type->fields.array) * type->length;
611 case GLSL_TYPE_STRUCT:
612 size = 0;
613 for (i = 0; i < type->length; i++) {
614 size += type_size(type->fields.structure[i].type);
615 }
616 return size;
617 case GLSL_TYPE_SUBROUTINE:
618 return 1;
619
620 case GLSL_TYPE_SAMPLER:
621 /* Samplers take up no register space, since they're baked in at
622 * link time.
623 */
624 return 0;
625 case GLSL_TYPE_ATOMIC_UINT:
626 return 0;
627 case GLSL_TYPE_IMAGE:
628 case GLSL_TYPE_VOID:
629 case GLSL_TYPE_DOUBLE:
630 case GLSL_TYPE_ERROR:
631 case GLSL_TYPE_INTERFACE:
632 unreachable("not reached");
633 }
634
635 return 0;
636 }
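/* Some concrete values this returns: a float, bool or vec3 occupies one
 * vec4 slot; mat4 occupies 4 (one per column); float[10] occupies 10;
 * struct { vec3 a; mat3 b; } occupies 1 + 3 = 4; a sampler or atomic
 * counter occupies 0.
 */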
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(v->type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = brw_swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->alloc.allocate(v->type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->alloc.allocate(v->type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 void
685 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
686 unsigned n)
687 {
688 static const gl_constant_value zero = { 0 };
689
690 for (unsigned i = 0; i < n; ++i)
691 stage_prog_data->param[4 * uniforms + i] = &values[i];
692
693 for (unsigned i = n; i < 4; ++i)
694 stage_prog_data->param[4 * uniforms + i] = &zero;
695
696 uniform_vector_size[uniforms++] = n;
697 }
698
699 /* Our support for uniforms is piggy-backed on the struct
700 * gl_fragment_program, because that's where the values actually
701 * get stored, rather than in some global gl_shader_program uniform
702 * store.
703 */
704 void
705 vec4_visitor::setup_uniform_values(ir_variable *ir)
706 {
707 int namelen = strlen(ir->name);
708
709 /* The data for our (non-builtin) uniforms is stored in a series of
710 * gl_uniform_driver_storage structs for each subcomponent that
711 * glGetUniformLocation() could name. We know it's been set up in the same
712 * order we'd walk the type, so walk the list of storage and find anything
713 * with our name, or the prefix of a component that starts with our name.
714 */
715 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
716 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
717
718 if (storage->builtin)
719 continue;
720
721 if (strncmp(ir->name, storage->name, namelen) != 0 ||
722 (storage->name[namelen] != 0 &&
723 storage->name[namelen] != '.' &&
724 storage->name[namelen] != '[')) {
725 continue;
726 }
727
728 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
729 storage->type->matrix_columns);
730 const unsigned vector_size = storage->type->vector_elements;
731
732 for (unsigned s = 0; s < vector_count; s++)
733 setup_vector_uniform_values(&storage->storage[s * vector_size],
734 vector_size);
735 }
736 }
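/* For example, a uniform declared as "uniform mat3 m[2]" matches one
 * gl_uniform_storage entry with array_elements = 2, matrix_columns = 3 and
 * vector_elements = 3, so the loop above uploads vector_count = 2 * 3 = 6
 * vec4 slots of three components each (the unused fourth component of every
 * slot is padded with zero by setup_vector_uniform_values).
 */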
737
738 void
739 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
740 {
741 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
742 assert(this->uniforms < uniform_array_size);
743 this->uniform_vector_size[this->uniforms] = 4;
744 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
745 this->userplane[i].type = BRW_REGISTER_TYPE_F;
746 for (int j = 0; j < 4; ++j) {
747 stage_prog_data->param[this->uniforms * 4 + j] =
748 (gl_constant_value *) &clip_planes[i][j];
749 }
750 ++this->uniforms;
751 }
752 }
753
754 /* Our support for builtin uniforms is even scarier than non-builtin.
755 * It sits on top of the PROG_STATE_VAR parameters that are
756 * automatically updated from GL context state.
757 */
758 void
759 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
760 {
761 const ir_state_slot *const slots = ir->get_state_slots();
762 assert(slots != NULL);
763
764 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
765 /* This state reference has already been setup by ir_to_mesa,
766 * but we'll get the same index back here. We can reference
767 * ParameterValues directly, since unlike brw_fs.cpp, we never
768 * add new state references during compile.
769 */
770 int index = _mesa_add_state_reference(this->prog->Parameters,
771 (gl_state_index *)slots[i].tokens);
772 gl_constant_value *values =
773 &this->prog->Parameters->ParameterValues[index][0];
774
775 assert(this->uniforms < uniform_array_size);
776
777 for (unsigned j = 0; j < 4; j++)
778 stage_prog_data->param[this->uniforms * 4 + j] =
779 &values[GET_SWZ(slots[i].swizzle, j)];
780
781 this->uniform_vector_size[this->uniforms] =
782 (ir->type->is_scalar() || ir->type->is_vector() ||
783 ir->type->is_matrix() ? ir->type->vector_elements : 4);
784
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (devinfo->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (devinfo->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (devinfo->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (devinfo->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (devinfo->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (devinfo->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (devinfo->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (devinfo->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
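/* For example, the expression z + x * y (or x * y + z) is emitted as a
 * single MAD with z as the addend and x, y as the multiplication sources,
 * and z + -(x * y) folds the negation into the first multiplication source
 * instead of emitting a separate MUL and ADD.
 */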
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined. So we can't use it.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
1265
1266 return true;
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284
1285 return inst;
1286 }
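/* Callers use this with BRW_CONDITIONAL_L for min() and BRW_CONDITIONAL_GE
 * for max(); see ir_binop_min/ir_binop_max below and the [-1, 1] clamping in
 * emit_unpack_snorm_4x8() and emit_pack_snorm_4x8() above.
 */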
1287
1288 vec4_instruction *
1289 vec4_visitor::emit_lrp(const dst_reg &dst,
1290 const src_reg &x, const src_reg &y, const src_reg &a)
1291 {
1292 if (devinfo->gen >= 6) {
1293 /* Note that the instruction's argument order is reversed from GLSL
1294 * and the IR.
1295 */
1296 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1297 fix_3src_operand(x)));
1298 } else {
1299 /* Earlier generations don't support three source operations, so we
1300 * need to emit x*(1-a) + y*a.
1301 */
1302 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1303 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1304 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1305 y_times_a.writemask = dst.writemask;
1306 one_minus_a.writemask = dst.writemask;
1307 x_times_one_minus_a.writemask = dst.writemask;
1308
1309 emit(MUL(y_times_a, y, a));
1310 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1311 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1312 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1313 }
1314 }
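/* Numeric sanity check of the fallback path: lrp(x = 2.0, y = 10.0,
 * a = 0.25) computes y*a = 2.5, (1 - a) = 0.75, x*(1 - a) = 1.5 and finally
 * 1.5 + 2.5 = 4.0, the same value the single LRP instruction produces on
 * gen6+.
 */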
1315
1316 /**
1317 * Emits the instructions needed to perform a pull constant load. before_block
1318  * and before_inst can be NULL, in which case the instruction will be appended
1319 * to the end of the instruction list.
1320 */
1321 void
1322 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1323 src_reg surf_index,
1324 src_reg offset_reg,
1325 bblock_t *before_block,
1326 vec4_instruction *before_inst)
1327 {
1328 assert((before_inst == NULL && before_block == NULL) ||
1329 (before_inst && before_block));
1330
1331 vec4_instruction *pull;
1332
1333 if (devinfo->gen >= 9) {
1334 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1335 src_reg header(this, glsl_type::uvec4_type, 2);
1336
1337 pull = new(mem_ctx)
1338 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1339 dst_reg(header));
1340
1341 if (before_inst)
1342 emit_before(before_block, before_inst, pull);
1343 else
1344 emit(pull);
1345
1346 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1347 offset_reg.type);
1348 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354
1355 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1356 dst,
1357 surf_index,
1358 header);
1359 pull->mlen = 2;
1360 pull->header_size = 1;
1361 } else if (devinfo->gen >= 7) {
1362 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1363
1364 grf_offset.type = offset_reg.type;
1365
1366 pull = MOV(grf_offset, offset_reg);
1367
1368 if (before_inst)
1369 emit_before(before_block, before_inst, pull);
1370 else
1371 emit(pull);
1372
1373 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1374 dst,
1375 surf_index,
1376 src_reg(grf_offset));
1377 pull->mlen = 1;
1378 } else {
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1380 dst,
1381 surf_index,
1382 offset_reg);
1383 pull->base_mrf = 14;
1384 pull->mlen = 1;
1385 }
1386
1387 if (before_inst)
1388 emit_before(before_block, before_inst, pull);
1389 else
1390 emit(pull);
1391 }
1392
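/**
 * Emit FIND_LIVE_CHANNEL + BROADCAST to copy the value of an arbitrary live
 * channel of 'src' into all channels of the result, so it can be used where
 * the hardware requires a dynamically uniform value (e.g. the non-constant
 * UBO block index handled in visit(ir_expression) below).
 */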
1393 src_reg
1394 vec4_visitor::emit_uniformize(const src_reg &src)
1395 {
1396 const src_reg chan_index(this, glsl_type::uint_type);
1397 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1398 src.type);
1399
1400 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1401 ->force_writemask_all = true;
1402 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1403 ->force_writemask_all = true;
1404
1405 return src_reg(dst);
1406 }
1407
1408 void
1409 vec4_visitor::visit(ir_expression *ir)
1410 {
1411 unsigned int operand;
1412 src_reg op[ARRAY_SIZE(ir->operands)];
1413 vec4_instruction *inst;
1414
1415 if (ir->operation == ir_binop_add) {
1416 if (try_emit_mad(ir))
1417 return;
1418 }
1419
1420 if (ir->operation == ir_unop_b2f) {
1421 if (try_emit_b2f_of_compare(ir))
1422 return;
1423 }
1424
1425 /* Storage for our result. Ideally for an assignment we'd be using
1426 * the actual storage for the result here, instead.
1427 */
1428 dst_reg result_dst(this, ir->type);
1429 src_reg result_src(result_dst);
1430
1431 if (ir->operation == ir_triop_csel) {
1432 ir->operands[1]->accept(this);
1433 op[1] = this->result;
1434 ir->operands[2]->accept(this);
1435 op[2] = this->result;
1436
1437 enum brw_predicate predicate;
1438 emit_bool_to_cond_code(ir->operands[0], &predicate);
1439 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1440 inst->predicate = predicate;
1441 this->result = result_src;
1442 return;
1443 }
1444
1445 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1446 this->result.file = BAD_FILE;
1447 ir->operands[operand]->accept(this);
1448 if (this->result.file == BAD_FILE) {
1449 fprintf(stderr, "Failed to get tree for expression operand:\n");
1450 ir->operands[operand]->fprint(stderr);
1451 exit(1);
1452 }
1453 op[operand] = this->result;
1454
1455 /* Matrix expression operands should have been broken down to vector
1456 * operations already.
1457 */
1458 assert(!ir->operands[operand]->type->is_matrix());
1459 }
1460
1461 /* If nothing special happens, this is the result. */
1462 this->result = result_src;
1463
1464 switch (ir->operation) {
1465 case ir_unop_logic_not:
1466 emit(NOT(result_dst, op[0]));
1467 break;
1468 case ir_unop_neg:
1469 op[0].negate = !op[0].negate;
1470 emit(MOV(result_dst, op[0]));
1471 break;
1472 case ir_unop_abs:
1473 op[0].abs = true;
1474 op[0].negate = false;
1475 emit(MOV(result_dst, op[0]));
1476 break;
1477
1478 case ir_unop_sign:
1479 if (ir->type->is_float()) {
1480 /* AND(val, 0x80000000) gives the sign bit.
1481 *
1482 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1483 * zero.
1484 */
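/* For example, sign(-2.5): the CMP below sets the flag (the value is
 * nonzero), the AND keeps only the sign bit 0x80000000, and the predicated
 * OR produces 0x80000000 | 0x3f800000 = 0xbf800000, i.e. -1.0f.  For an
 * input of 0.0 the flag is not set, so the result stays at the 0x00000000
 * written by the AND, i.e. +0.0f.
 */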
1485 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486
1487 op[0].type = BRW_REGISTER_TYPE_UD;
1488 result_dst.type = BRW_REGISTER_TYPE_UD;
1489 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493
1494 this->result.type = BRW_REGISTER_TYPE_F;
1495 } else {
1496 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1497 * -> non-negative val generates 0x00000000.
1498 * Predicated OR sets 1 if val is positive.
1499 */
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1501
1502 emit(ASR(result_dst, op[0], src_reg(31)));
1503
1504 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1505 inst->predicate = BRW_PREDICATE_NORMAL;
1506 }
1507 break;
1508
1509 case ir_unop_rcp:
1510 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1511 break;
1512
1513 case ir_unop_exp2:
1514 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1515 break;
1516 case ir_unop_log2:
1517 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1518 break;
1519 case ir_unop_exp:
1520 case ir_unop_log:
1521 unreachable("not reached: should be handled by ir_explog_to_explog2");
1522 case ir_unop_sin:
1523 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1524 break;
1525 case ir_unop_cos:
1526 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1527 break;
1528
1529 case ir_unop_dFdx:
1530 case ir_unop_dFdx_coarse:
1531 case ir_unop_dFdx_fine:
1532 case ir_unop_dFdy:
1533 case ir_unop_dFdy_coarse:
1534 case ir_unop_dFdy_fine:
1535 unreachable("derivatives not valid in vertex shader");
1536
1537 case ir_unop_bitfield_reverse:
1538 emit(BFREV(result_dst, op[0]));
1539 break;
1540 case ir_unop_bit_count:
1541 emit(CBIT(result_dst, op[0]));
1542 break;
1543 case ir_unop_find_msb: {
1544 src_reg temp = src_reg(this, glsl_type::uint_type);
1545
1546 inst = emit(FBH(dst_reg(temp), op[0]));
1547 inst->dst.writemask = WRITEMASK_XYZW;
1548
1549 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1550 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1551 * subtract the result from 31 to convert the MSB count into an LSB count.
1552 */
1553
1554 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1555 temp.swizzle = BRW_SWIZZLE_NOOP;
1556 emit(MOV(result_dst, temp));
1557
1558 src_reg src_tmp = src_reg(result_dst);
1559 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1560
1561 src_tmp.negate = true;
1562 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1563 inst->predicate = BRW_PREDICATE_NORMAL;
1564 break;
1565 }
1566 case ir_unop_find_lsb:
1567 emit(FBL(result_dst, op[0]));
1568 break;
1569 case ir_unop_saturate:
1570 inst = emit(MOV(result_dst, op[0]));
1571 inst->saturate = true;
1572 break;
1573
1574 case ir_unop_noise:
1575 unreachable("not reached: should be handled by lower_noise");
1576
1577 case ir_unop_subroutine_to_int:
1578 emit(MOV(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_add:
1582 emit(ADD(result_dst, op[0], op[1]));
1583 break;
1584 case ir_binop_sub:
1585 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1586
1587 case ir_binop_mul:
1588 if (devinfo->gen < 8 && ir->type->is_integer()) {
1589 /* For integer multiplication, the MUL uses the low 16 bits of one of
1590 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1591          * accumulates the contribution of the upper 16 bits of that
1592 * operand. If we can determine that one of the args is in the low
1593 * 16 bits, though, we can just emit a single MUL.
1594 */
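/* Concretely: for "a * 3" with 3 known to fit in 16 bits, a single MUL is
 * enough (the constant goes in src0 before IVB and in src1 afterwards, as
 * handled below).  Otherwise the generic path emits
 *
 *    mul  acc, a, b
 *    mach null, a, b
 *    mov  dst, acc
 *
 * so that, as the final MOV relies on, the accumulator ends up holding the
 * low 32 bits of the product.
 */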
1595 if (ir->operands[0]->is_uint16_constant()) {
1596 if (devinfo->gen < 7)
1597 emit(MUL(result_dst, op[0], op[1]));
1598 else
1599 emit(MUL(result_dst, op[1], op[0]));
1600 } else if (ir->operands[1]->is_uint16_constant()) {
1601 if (devinfo->gen < 7)
1602 emit(MUL(result_dst, op[1], op[0]));
1603 else
1604 emit(MUL(result_dst, op[0], op[1]));
1605 } else {
1606 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1607
1608 emit(MUL(acc, op[0], op[1]));
1609 emit(MACH(dst_null_d(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 }
1612 } else {
1613 emit(MUL(result_dst, op[0], op[1]));
1614 }
1615 break;
1616 case ir_binop_imul_high: {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(result_dst, op[0], op[1]));
1621 break;
1622 }
1623 case ir_binop_div:
1624 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1625 assert(ir->type->is_integer());
1626 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1627 break;
1628
1629 case ir_binop_carry:
1630 unreachable("Should have been lowered by carry_to_arith().");
1631
1632 case ir_binop_borrow:
1633 unreachable("Should have been lowered by borrow_to_arith().");
1634
1635 case ir_binop_mod:
1636 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_less:
1642 case ir_binop_greater:
1643 case ir_binop_lequal:
1644 case ir_binop_gequal:
1645 case ir_binop_equal:
1646 case ir_binop_nequal: {
1647 if (devinfo->gen <= 5) {
1648 resolve_bool_comparison(ir->operands[0], &op[0]);
1649 resolve_bool_comparison(ir->operands[1], &op[1]);
1650 }
1651 emit(CMP(result_dst, op[0], op[1],
1652 brw_conditional_for_comparison(ir->operation)));
1653 break;
1654 }
1655
1656 case ir_binop_all_equal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "==" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1671 }
1672 break;
1673 case ir_binop_any_nequal:
1674 if (devinfo->gen <= 5) {
1675 resolve_bool_comparison(ir->operands[0], &op[0]);
1676 resolve_bool_comparison(ir->operands[1], &op[1]);
1677 }
1678
1679 /* "!=" operator producing a scalar boolean. */
1680 if (ir->operands[0]->type->is_vector() ||
1681 ir->operands[1]->type->is_vector()) {
1682 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1683
1684 emit(MOV(result_dst, src_reg(0)));
1685 inst = emit(MOV(result_dst, src_reg(~0)));
1686 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1687 } else {
1688 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1689 }
1690 break;
1691
1692 case ir_unop_any:
1693 if (devinfo->gen <= 5) {
1694 resolve_bool_comparison(ir->operands[0], &op[0]);
1695 }
1696 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1697 emit(MOV(result_dst, src_reg(0)));
1698
1699 inst = emit(MOV(result_dst, src_reg(~0)));
1700 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701 break;
1702
1703 case ir_binop_logic_xor:
1704 emit(XOR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_or:
1708 emit(OR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_and:
1712 emit(AND(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_dot:
1716 assert(ir->operands[0]->type->is_vector());
1717 assert(ir->operands[0]->type == ir->operands[1]->type);
1718 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1719 break;
1720
1721 case ir_unop_sqrt:
1722 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1723 break;
1724 case ir_unop_rsq:
1725 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1726 break;
1727
1728 case ir_unop_bitcast_i2f:
1729 case ir_unop_bitcast_u2f:
1730 this->result = op[0];
1731 this->result.type = BRW_REGISTER_TYPE_F;
1732 break;
1733
1734 case ir_unop_bitcast_f2i:
1735 this->result = op[0];
1736 this->result.type = BRW_REGISTER_TYPE_D;
1737 break;
1738
1739 case ir_unop_bitcast_f2u:
1740 this->result = op[0];
1741 this->result.type = BRW_REGISTER_TYPE_UD;
1742 break;
1743
1744 case ir_unop_i2f:
1745 case ir_unop_i2u:
1746 case ir_unop_u2i:
1747 case ir_unop_u2f:
1748 case ir_unop_f2i:
1749 case ir_unop_f2u:
1750 emit(MOV(result_dst, op[0]));
1751 break;
1752 case ir_unop_b2i:
1753 case ir_unop_b2f:
1754 if (devinfo->gen <= 5) {
1755 resolve_bool_comparison(ir->operands[0], &op[0]);
1756 }
1757 emit(MOV(result_dst, negate(op[0])));
1758 break;
1759 case ir_unop_f2b:
1760 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1761 break;
1762 case ir_unop_i2b:
1763 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1764 break;
1765
1766 case ir_unop_trunc:
1767 emit(RNDZ(result_dst, op[0]));
1768 break;
1769 case ir_unop_ceil: {
1770 src_reg tmp = src_reg(this, ir->type);
1771 op[0].negate = !op[0].negate;
1772 emit(RNDD(dst_reg(tmp), op[0]));
1773 tmp.negate = true;
1774 emit(MOV(result_dst, tmp));
1775 }
1776 break;
1777 case ir_unop_floor:
1778 inst = emit(RNDD(result_dst, op[0]));
1779 break;
1780 case ir_unop_fract:
1781 inst = emit(FRC(result_dst, op[0]));
1782 break;
1783 case ir_unop_round_even:
1784 emit(RNDE(result_dst, op[0]));
1785 break;
1786
1787 case ir_binop_min:
1788 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1789 break;
1790 case ir_binop_max:
1791 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_binop_pow:
1795 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_unop_bit_not:
1799 inst = emit(NOT(result_dst, op[0]));
1800 break;
1801 case ir_binop_bit_and:
1802 inst = emit(AND(result_dst, op[0], op[1]));
1803 break;
1804 case ir_binop_bit_xor:
1805 inst = emit(XOR(result_dst, op[0], op[1]));
1806 break;
1807 case ir_binop_bit_or:
1808 inst = emit(OR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_lshift:
1812 inst = emit(SHL(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_rshift:
1816 if (ir->type->base_type == GLSL_TYPE_INT)
1817 inst = emit(ASR(result_dst, op[0], op[1]));
1818 else
1819 inst = emit(SHR(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_bfm:
1823 emit(BFI1(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_ubo_load: {
1827 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1828 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1829 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1830 src_reg offset;
1831
1832 /* Now, load the vector from that offset. */
1833 assert(ir->type->is_vector() || ir->type->is_scalar());
1834
1835 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1836 packed_consts.type = result.type;
1837 src_reg surf_index;
1838
1839 if (const_uniform_block) {
1840 /* The block index is a constant, so just emit the binding table entry
1841 * as an immediate.
1842 */
1843 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1844 const_uniform_block->value.u[0]);
1845 } else {
1846 /* The block index is not a constant. Evaluate the index expression
1847 * per-channel and add the base UBO index; we have to select a value
1848 * from any live channel.
1849 */
1850 surf_index = src_reg(this, glsl_type::uint_type);
1851 emit(ADD(dst_reg(surf_index), op[0],
1852 src_reg(prog_data->base.binding_table.ubo_start)));
1853 surf_index = emit_uniformize(surf_index);
1854
1855 /* Assume this may touch any UBO. It would be nice to provide
1856 * a tighter bound, but the array information is already lowered away.
1857 */
1858 brw_mark_surface_used(&prog_data->base,
1859 prog_data->base.binding_table.ubo_start +
1860 shader_prog->NumUniformBlocks - 1);
1861 }
1862
1863 if (const_offset_ir) {
1864 if (devinfo->gen >= 8) {
1865 /* Store the offset in a GRF so we can send-from-GRF. */
1866 offset = src_reg(this, glsl_type::int_type);
1867 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1868 } else {
1869 /* Immediates are fine on older generations since they'll be moved
1870 * to a (potentially fake) MRF at the generator level.
1871 */
1872 offset = src_reg(const_offset / 16);
1873 }
1874 } else {
1875 offset = src_reg(this, glsl_type::uint_type);
1876 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1877 }
1878
1879 emit_pull_constant_load_reg(dst_reg(packed_consts),
1880 surf_index,
1881 offset,
1882 NULL, NULL /* before_block/inst */);
1883
1884 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1885 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1886 const_offset % 16 / 4,
1887 const_offset % 16 / 4,
1888 const_offset % 16 / 4);
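/* For example, a constant byte offset of 20 selects the 16-byte row at
 * offset 16 (20 / 16 = 1 in the offset register) and then shifts the
 * swizzle by 20 % 16 / 4 = 1 component, so a scalar load reads .yyyy of
 * the fetched row instead of .xxxx.
 */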
1889
1890 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1891 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1892 emit(CMP(result_dst, packed_consts, src_reg(0u),
1893 BRW_CONDITIONAL_NZ));
1894 } else {
1895 emit(MOV(result_dst, packed_consts));
1896 }
1897 break;
1898 }
1899
1900 case ir_binop_vector_extract:
1901 unreachable("should have been lowered by vec_index_to_cond_assign");
1902
1903 case ir_triop_fma:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(MAD(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_lrp:
1914 emit_lrp(result_dst, op[0], op[1], op[2]);
1915 break;
1916
1917 case ir_triop_csel:
1918 unreachable("already handled above");
1919 break;
1920
1921 case ir_triop_bfi:
1922 op[0] = fix_3src_operand(op[0]);
1923 op[1] = fix_3src_operand(op[1]);
1924 op[2] = fix_3src_operand(op[2]);
1925 emit(BFI2(result_dst, op[0], op[1], op[2]));
1926 break;
1927
1928 case ir_triop_bitfield_extract:
1929 op[0] = fix_3src_operand(op[0]);
1930 op[1] = fix_3src_operand(op[1]);
1931 op[2] = fix_3src_operand(op[2]);
1932 /* Note that the instruction's argument order is reversed from GLSL
1933 * and the IR.
1934 */
1935 emit(BFE(result_dst, op[2], op[1], op[0]));
1936 break;
1937
1938 case ir_triop_vector_insert:
1939 unreachable("should have been lowered by lower_vector_insert");
1940
1941 case ir_quadop_bitfield_insert:
1942 unreachable("not reached: should be handled by "
1943 "bitfield_insert_to_bfm_bfi\n");
1944
1945 case ir_quadop_vector:
1946 unreachable("not reached: should be handled by lower_quadop_vector");
1947
1948 case ir_unop_pack_half_2x16:
1949 emit_pack_half_2x16(result_dst, op[0]);
1950 break;
1951 case ir_unop_unpack_half_2x16:
1952 emit_unpack_half_2x16(result_dst, op[0]);
1953 break;
1954 case ir_unop_unpack_unorm_4x8:
1955 emit_unpack_unorm_4x8(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_snorm_4x8:
1958 emit_unpack_snorm_4x8(result_dst, op[0]);
1959 break;
1960 case ir_unop_pack_unorm_4x8:
1961 emit_pack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_pack_snorm_4x8:
1964 emit_pack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_snorm_2x16:
1967 case ir_unop_pack_unorm_2x16:
1968 case ir_unop_unpack_snorm_2x16:
1969 case ir_unop_unpack_unorm_2x16:
1970 unreachable("not reached: should be handled by lower_packing_builtins");
1971 case ir_unop_unpack_half_2x16_split_x:
1972 case ir_unop_unpack_half_2x16_split_y:
1973 case ir_binop_pack_half_2x16_split:
1974 case ir_unop_interpolate_at_centroid:
1975 case ir_binop_interpolate_at_sample:
1976 case ir_binop_interpolate_at_offset:
1977 unreachable("not reached: should not occur in vertex shader");
1978 case ir_binop_ldexp:
1979 unreachable("not reached: should be handled by ldexp_to_arith()");
1980 case ir_unop_d2f:
1981 case ir_unop_f2d:
1982 case ir_unop_d2i:
1983 case ir_unop_i2d:
1984 case ir_unop_d2u:
1985 case ir_unop_u2d:
1986 case ir_unop_d2b:
1987 case ir_unop_pack_double_2x32:
1988 case ir_unop_unpack_double_2x32:
1989 case ir_unop_frexp_sig:
1990 case ir_unop_frexp_exp:
1991 unreachable("fp64 todo");
1992 }
1993 }
1994
1995
1996 void
1997 vec4_visitor::visit(ir_swizzle *ir)
1998 {
1999 /* Note that this is only swizzles in expressions, not those on the left
2000 * hand side of an assignment, which do write masking. See ir_assignment
2001 * for that.
2002 */
2003 const unsigned swz = brw_compose_swizzle(
2004 brw_swizzle_for_size(ir->type->vector_elements),
2005 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2006
2007 ir->val->accept(this);
2008 this->result = swizzle(this->result, swz);
2009 }
2010
2011 void
2012 vec4_visitor::visit(ir_dereference_variable *ir)
2013 {
2014 const struct glsl_type *type = ir->type;
2015 dst_reg *reg = variable_storage(ir->var);
2016
2017 if (!reg) {
2018 fail("Failed to find variable storage for %s\n", ir->var->name);
2019 this->result = src_reg(brw_null_reg());
2020 return;
2021 }
2022
2023 this->result = src_reg(*reg);
2024
2025 /* System values get their swizzle from the dst_reg writemask */
2026 if (ir->var->data.mode == ir_var_system_value)
2027 return;
2028
2029 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2030 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2031 }
2032
2033
2034 int
2035 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2036 {
2037 /* Under normal circumstances array elements are stored consecutively, so
2038 * the stride is equal to the size of the array element.
2039 */
2040 return type_size(ir->type);
2041 }
2042
2043
2044 void
2045 vec4_visitor::visit(ir_dereference_array *ir)
2046 {
2047 ir_constant *constant_index;
2048 src_reg src;
2049 int array_stride = compute_array_stride(ir);
2050
2051 constant_index = ir->array_index->constant_expression_value();
2052
2053 ir->array->accept(this);
2054 src = this->result;
2055
2056 if (constant_index) {
2057 src.reg_offset += constant_index->value.i[0] * array_stride;
2058 } else {
2059 /* Variable index array dereference. It eats the "vec4" of the
2060 * base of the array and an index that offsets the Mesa register
2061 * index.
2062 */
2063 ir->array_index->accept(this);
2064
2065 src_reg index_reg;
2066
2067 if (array_stride == 1) {
2068 index_reg = this->result;
2069 } else {
2070 index_reg = src_reg(this, glsl_type::int_type);
2071
2072 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2073 }
2074
2075 if (src.reladdr) {
2076 src_reg temp = src_reg(this, glsl_type::int_type);
2077
2078 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2079
2080 index_reg = temp;
2081 }
2082
2083 src.reladdr = ralloc(mem_ctx, src_reg);
2084 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2085 }
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 src.swizzle = BRW_SWIZZLE_NOOP;
2092 src.type = brw_type_for_base_type(ir->type);
2093
2094 this->result = src;
2095 }
2096
2097 void
2098 vec4_visitor::visit(ir_dereference_record *ir)
2099 {
2100 unsigned int i;
2101 const glsl_type *struct_type = ir->record->type;
2102 int offset = 0;
2103
2104 ir->record->accept(this);
2105
2106 for (i = 0; i < struct_type->length; i++) {
2107 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2108 break;
2109 offset += type_size(struct_type->fields.structure[i].type);
2110 }
2111
2112 /* If the type is smaller than a vec4, replicate the last channel out. */
2113 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2114 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2115 else
2116 this->result.swizzle = BRW_SWIZZLE_NOOP;
2117 this->result.type = brw_type_for_base_type(ir->type);
2118
2119 this->result.reg_offset += offset;
2120 }
2121
2122 /**
2123 * We want to be careful in assignment setup to hit the actual storage
2124 * instead of potentially using a temporary like we might with the
2125 * ir_dereference handler.
2126 */
2127 static dst_reg
2128 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2129 {
2130 /* The LHS must be a dereference. If the LHS is a variable indexed array
2131 * access of a vector, it must be separated into a series of conditional moves
2132 * before reaching this point (see ir_vec_index_to_cond_assign).
2133 */
2134 assert(ir->as_dereference());
2135 ir_dereference_array *deref_array = ir->as_dereference_array();
2136 if (deref_array) {
2137 assert(!deref_array->array->type->is_vector());
2138 }
2139
2140 /* Use the rvalue deref handler for the most part. We'll ignore
2141 * swizzles in it and write swizzles using writemask, though.
2142 */
2143 ir->accept(v);
2144 return dst_reg(v->result);
2145 }
2146
2147 void
2148 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2149 const struct glsl_type *type,
2150 enum brw_predicate predicate)
2151 {
2152 if (type->base_type == GLSL_TYPE_STRUCT) {
2153 for (unsigned int i = 0; i < type->length; i++) {
2154 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2155 }
2156 return;
2157 }
2158
2159 if (type->is_array()) {
2160 for (unsigned int i = 0; i < type->length; i++) {
2161 emit_block_move(dst, src, type->fields.array, predicate);
2162 }
2163 return;
2164 }
2165
2166 if (type->is_matrix()) {
2167 const struct glsl_type *vec_type;
2168
2169 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2170 type->vector_elements, 1);
2171
2172 for (int i = 0; i < type->matrix_columns; i++) {
2173 emit_block_move(dst, src, vec_type, predicate);
2174 }
2175 return;
2176 }
2177
2178 assert(type->is_scalar() || type->is_vector());
2179
2180 dst->type = brw_type_for_base_type(type);
2181 src->type = dst->type;
2182
2183 dst->writemask = (1 << type->vector_elements) - 1;
2184
2185 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2186
2187 vec4_instruction *inst = emit(MOV(*dst, *src));
2188 inst->predicate = predicate;
2189
2190 dst->reg_offset++;
2191 src->reg_offset++;
2192 }
2193
2194
2195 /* If the RHS processing resulted in an instruction generating a
2196 * temporary value, and it would be easy to rewrite the instruction to
2197 * generate its result right into the LHS instead, do so. This ends
2198 * up reliably removing instructions where it can be tricky to do so
2199 * later without real UD chain information.
2200 */
2201 bool
2202 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2203 dst_reg dst,
2204 src_reg src,
2205 vec4_instruction *pre_rhs_inst,
2206 vec4_instruction *last_rhs_inst)
2207 {
2208 /* This could be supported, but it would take more smarts. */
2209 if (ir->condition)
2210 return false;
2211
2212 if (pre_rhs_inst == last_rhs_inst)
2213 return false; /* No instructions generated to work with. */
2214
2215 /* Make sure the last instruction generated our source reg. */
2216 if (src.file != GRF ||
2217 src.file != last_rhs_inst->dst.file ||
2218 src.reg != last_rhs_inst->dst.reg ||
2219 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2220 src.reladdr ||
2221 src.abs ||
2222 src.negate ||
2223 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2224 return false;
2225
2226 /* Check that the last instruction fully initialized the channels
2227 * we want to use, in the order we want to use them. We could
2228 * potentially reswizzle the operands of many instructions so that
2229 * we could handle out of order channels, but don't yet.
2230 */
2231
2232 for (unsigned i = 0; i < 4; i++) {
2233 if (dst.writemask & (1 << i)) {
2234 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2235 return false;
2236
2237 if (BRW_GET_SWZ(src.swizzle, i) != i)
2238 return false;
2239 }
2240 }
2241
2242 /* Success! Rewrite the instruction. */
2243 last_rhs_inst->dst.file = dst.file;
2244 last_rhs_inst->dst.reg = dst.reg;
2245 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2246 last_rhs_inst->dst.reladdr = dst.reladdr;
2247 last_rhs_inst->dst.writemask &= dst.writemask;
2248
2249 return true;
2250 }
2251
2252 void
2253 vec4_visitor::visit(ir_assignment *ir)
2254 {
2255 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2256 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2257
2258 if (!ir->lhs->type->is_scalar() &&
2259 !ir->lhs->type->is_vector()) {
2260 ir->rhs->accept(this);
2261 src_reg src = this->result;
2262
2263 if (ir->condition) {
2264 emit_bool_to_cond_code(ir->condition, &predicate);
2265 }
2266
2267 /* emit_block_move doesn't account for swizzles in the source register.
2268 * This should be ok, since the source register is a structure or an
2269 * array, and those can't be swizzled. But double-check to be sure.
2270 */
2271 assert(src.swizzle ==
2272 (ir->rhs->type->is_matrix()
2273 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2274 : BRW_SWIZZLE_NOOP));
2275
2276 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2277 return;
2278 }
2279
2280 /* Now we're down to just a scalar/vector with writemasks. */
2281 int i;
2282
2283 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2284 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 ir->rhs->accept(this);
2287
2288 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 int swizzles[4];
2291 int src_chan = 0;
2292
2293 assert(ir->lhs->type->is_vector() ||
2294 ir->lhs->type->is_scalar());
2295 dst.writemask = ir->write_mask;
2296
2297 /* Swizzle a small RHS vector into the channels being written.
2298 *
2299 * GLSL IR treats write_mask as dictating how many channels are
2300 * present on the RHS while in our instructions we need to make
2301 * those channels appear in the slots of the vec4 they're written to.
2302 */
2303 for (int i = 0; i < 4; i++)
2304 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2305
2306 src_reg src = swizzle(this->result,
2307 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2308 swizzles[2], swizzles[3]));
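/* E.g. with write_mask .xz the RHS has two components, so swizzles[] is
 * {0, 0, 1, 0}: RHS.x lands in dst.x, RHS.y lands in dst.z, and the
 * disabled channels are don't-cares.
 */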
2309
2310 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2311 return;
2312 }
2313
2314 if (ir->condition) {
2315 emit_bool_to_cond_code(ir->condition, &predicate);
2316 }
2317
2318 for (i = 0; i < type_size(ir->lhs->type); i++) {
2319 vec4_instruction *inst = emit(MOV(dst, src));
2320 inst->predicate = predicate;
2321
2322 dst.reg_offset++;
2323 src.reg_offset++;
2324 }
2325 }
2326
2327 void
2328 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2329 {
2330 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2331 foreach_in_list(ir_constant, field_value, &ir->components) {
2332 emit_constant_values(dst, field_value);
2333 }
2334 return;
2335 }
2336
2337 if (ir->type->is_array()) {
2338 for (unsigned int i = 0; i < ir->type->length; i++) {
2339 emit_constant_values(dst, ir->array_elements[i]);
2340 }
2341 return;
2342 }
2343
2344 if (ir->type->is_matrix()) {
2345 for (int i = 0; i < ir->type->matrix_columns; i++) {
2346 float *vec = &ir->value.f[i * ir->type->vector_elements];
2347
2348 for (int j = 0; j < ir->type->vector_elements; j++) {
2349 dst->writemask = 1 << j;
2350 dst->type = BRW_REGISTER_TYPE_F;
2351
2352 emit(MOV(*dst, src_reg(vec[j])));
2353 }
2354 dst->reg_offset++;
2355 }
2356 return;
2357 }
2358
2359 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2360
2361 for (int i = 0; i < ir->type->vector_elements; i++) {
2362 if (!(remaining_writemask & (1 << i)))
2363 continue;
2364
2365 dst->writemask = 1 << i;
2366 dst->type = brw_type_for_base_type(ir->type);
2367
2368 /* Find other components that match the one we're about to
2369 * write. Emits fewer instructions for things like vec4(0.5,
2370 * 1.5, 1.5, 1.5).
2371 */
2372 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2373 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2374 if (ir->value.b[i] == ir->value.b[j])
2375 dst->writemask |= (1 << j);
2376 } else {
2377 /* u, i, and f storage all line up, so no need for a
2378 * switch case for comparing each type.
2379 */
2380 if (ir->value.u[i] == ir->value.u[j])
2381 dst->writemask |= (1 << j);
2382 }
2383 }
2384
2385 switch (ir->type->base_type) {
2386 case GLSL_TYPE_FLOAT:
2387 emit(MOV(*dst, src_reg(ir->value.f[i])));
2388 break;
2389 case GLSL_TYPE_INT:
2390 emit(MOV(*dst, src_reg(ir->value.i[i])));
2391 break;
2392 case GLSL_TYPE_UINT:
2393 emit(MOV(*dst, src_reg(ir->value.u[i])));
2394 break;
2395 case GLSL_TYPE_BOOL:
2396 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2397 break;
2398 default:
2399 unreachable("Non-float/uint/int/bool constant");
2400 }
2401
2402 remaining_writemask &= ~dst->writemask;
2403 }
2404 dst->reg_offset++;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_constant *ir)
2409 {
2410 dst_reg dst = dst_reg(this, ir->type);
2411 this->result = src_reg(dst);
2412
2413 emit_constant_values(&dst, ir);
2414 }
2415
2416 void
2417 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2418 {
2419 ir_dereference *deref = static_cast<ir_dereference *>(
2420 ir->actual_parameters.get_head());
2421 ir_variable *location = deref->variable_referenced();
2422 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2423 location->data.binding);
2424
2425 /* Calculate the surface offset */
2426 src_reg offset(this, glsl_type::uint_type);
2427 ir_dereference_array *deref_array = deref->as_dereference_array();
2428 if (deref_array) {
2429 deref_array->array_index->accept(this);
2430
2431 src_reg tmp(this, glsl_type::uint_type);
2432 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2433 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2434 } else {
2435 offset = location->data.atomic.offset;
2436 }
2437
2438 /* Emit the appropriate machine instruction */
2439 const char *callee = ir->callee->function_name();
2440 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2441
2442 if (!strcmp("__intrinsic_atomic_read", callee)) {
2443 emit_untyped_surface_read(surf_index, dst, offset);
2444
2445 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2446 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448
2449 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452 }
2453
2454 brw_mark_surface_used(stage_prog_data, surf_index);
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_call *ir)
2459 {
2460 const char *callee = ir->callee->function_name();
2461
2462 if (!strcmp("__intrinsic_atomic_read", callee) ||
2463 !strcmp("__intrinsic_atomic_increment", callee) ||
2464 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 visit_atomic_counter_intrinsic(ir);
2466 } else {
2467 unreachable("Unsupported intrinsic.");
2468 }
2469 }
2470
2471 src_reg
2472 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2473 src_reg coordinate, src_reg sampler)
2474 {
2475 vec4_instruction *inst =
2476 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2477 dst_reg(this, glsl_type::uvec4_type));
2478 inst->base_mrf = 2;
2479 inst->src[1] = sampler;
2480
2481 int param_base;
2482
2483 if (devinfo->gen >= 9) {
2484 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2485 vec4_instruction *header_inst = new(mem_ctx)
2486 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2487 dst_reg(MRF, inst->base_mrf));
2488
2489 emit(header_inst);
2490
2491 inst->mlen = 2;
2492 inst->header_size = 1;
2493 param_base = inst->base_mrf + 1;
2494 } else {
2495 inst->mlen = 1;
2496 param_base = inst->base_mrf;
2497 }
2498
2499 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2500 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2501 int zero_mask = 0xf & ~coord_mask;
2502
2503 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2504 coordinate));
2505
2506 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2507 src_reg(0)));
2508
2509 emit(inst);
2510 return src_reg(inst->dst);
2511 }
2512
2513 bool
2514 vec4_visitor::is_high_sampler(src_reg sampler)
2515 {
2516 if (devinfo->gen < 8 && !devinfo->is_haswell)
2517 return false;
2518
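/* The sampler message descriptor only has a 4-bit sampler index, so sampler
 * units 16-31 (or any dynamically indexed sampler that might reach them)
 * need the sampler state pointer patched through the message header, which
 * we only handle on Haswell and Gen8+.
 */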
2519 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2520 }
2521
2522 void
2523 vec4_visitor::emit_texture(ir_texture_opcode op,
2524 dst_reg dest,
2525 const glsl_type *dest_type,
2526 src_reg coordinate,
2527 int coord_components,
2528 src_reg shadow_comparitor,
2529 src_reg lod, src_reg lod2,
2530 src_reg sample_index,
2531 uint32_t constant_offset,
2532 src_reg offset_value,
2533 src_reg mcs,
2534 bool is_cube_array,
2535 uint32_t sampler,
2536 src_reg sampler_reg)
2537 {
2538 enum opcode opcode;
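/* Vertex shaders have no implicit derivatives, so a plain texture() lookup
 * (ir_tex) is lowered to an explicit-LOD sample: TXL with lod forced to
 * zero by the caller.
 */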
2539 switch (op) {
2540 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2541 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2542 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2543 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2544 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2545 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2546 case ir_tg4: opcode = offset_value.file != BAD_FILE
2547 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2548 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2549 case ir_txb:
2550 unreachable("TXB is not valid for vertex shaders.");
2551 case ir_lod:
2552 unreachable("LOD is not valid for vertex shaders.");
2553 default:
2554 unreachable("Unrecognized tex op");
2555 }
2556
2557 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2558 opcode, dst_reg(this, dest_type));
2559
2560 inst->offset = constant_offset;
2561
2562 /* The message header is necessary for:
2563 * - Gen4 (always)
2564 * - Gen9+ for selecting SIMD4x2
2565 * - Texel offsets
2566 * - Gather channel selection
2567 * - Sampler indices too large to fit in a 4-bit value.
2568 */
2569 inst->header_size =
2570 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2571 inst->offset != 0 || op == ir_tg4 ||
2572 is_high_sampler(sampler_reg)) ? 1 : 0;
2573 inst->base_mrf = 2;
2574 inst->mlen = inst->header_size + 1; /* always at least one */
2575 inst->dst.writemask = WRITEMASK_XYZW;
2576 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2577
2578 inst->src[1] = sampler_reg;
2579
2580 /* MRF for the first parameter */
2581 int param_base = inst->base_mrf + inst->header_size;
2582
2583 if (op == ir_txs || op == ir_query_levels) {
2584 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2585 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2586 } else {
2587 /* Load the coordinate */
2588 /* FINISHME: gl_clamp_mask and saturate */
2589 int coord_mask = (1 << coord_components) - 1;
2590 int zero_mask = 0xf & ~coord_mask;
2591
2592 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2593 coordinate));
2594
2595 if (zero_mask != 0) {
2596 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2597 src_reg(0)));
2598 }
2599 /* Load the shadow comparitor */
2600 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2601 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2602 WRITEMASK_X),
2603 shadow_comparitor));
2604 inst->mlen++;
2605 }
2606
2607 /* Load the LOD info */
2608 if (op == ir_tex || op == ir_txl) {
2609 int mrf, writemask;
2610 if (devinfo->gen >= 5) {
2611 mrf = param_base + 1;
2612 if (shadow_comparitor.file != BAD_FILE) {
2613 writemask = WRITEMASK_Y;
2614 /* mlen already incremented */
2615 } else {
2616 writemask = WRITEMASK_X;
2617 inst->mlen++;
2618 }
2619 } else /* devinfo->gen == 4 */ {
2620 mrf = param_base;
2621 writemask = WRITEMASK_W;
2622 }
2623 lod.swizzle = BRW_SWIZZLE_XXXX;
2624 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2625 } else if (op == ir_txf) {
2626 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2627 } else if (op == ir_txf_ms) {
2628 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2629 sample_index));
2630 if (devinfo->gen >= 7) {
2631 /* MCS data is in the first channel of `mcs`, but we need to get it into
2632 * the .y channel of the second vec4 of params, so replicate .x across
2633 * the whole vec4 and then mask off everything except .y
2634 */
2635 mcs.swizzle = BRW_SWIZZLE_XXXX;
2636 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2637 mcs));
2638 }
2639 inst->mlen++;
2640 } else if (op == ir_txd) {
2641 const brw_reg_type type = lod.type;
2642
2643 if (devinfo->gen >= 5) {
2644 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2645 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2646 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2647 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2648 inst->mlen++;
2649
2650 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2651 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2652 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2653 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2654 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2655 inst->mlen++;
2656
2657 if (shadow_comparitor.file != BAD_FILE) {
2658 emit(MOV(dst_reg(MRF, param_base + 2,
2659 shadow_comparitor.type, WRITEMASK_Z),
2660 shadow_comparitor));
2661 }
2662 }
2663 } else /* devinfo->gen == 4 */ {
2664 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2666 inst->mlen += 2;
2667 }
2668 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2669 if (shadow_comparitor.file != BAD_FILE) {
2670 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2671 shadow_comparitor));
2672 }
2673
2674 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2675 offset_value));
2676 inst->mlen++;
2677 }
2678 }
2679
2680 emit(inst);
2681
2682 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2683 * spec requires layers.
2684 */
2685 if (op == ir_txs && is_cube_array) {
2686 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2687 writemask(inst->dst, WRITEMASK_Z),
2688 src_reg(inst->dst), src_reg(6));
2689 }
2690
2691 if (devinfo->gen == 6 && op == ir_tg4) {
2692 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2693 }
2694
2695 swizzle_result(op, dest,
2696 src_reg(inst->dst), sampler, dest_type);
2697 }
2698
2699 void
2700 vec4_visitor::visit(ir_texture *ir)
2701 {
2702 uint32_t sampler =
2703 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2704
2705 ir_rvalue *nonconst_sampler_index =
2706 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2707
2708 /* Handle non-constant sampler array indexing */
2709 src_reg sampler_reg;
2710 if (nonconst_sampler_index) {
2711 /* The highest sampler which may be used by this operation is
2712 * the last element of the array. Mark it here, because the generator
2713 * doesn't have enough information to determine the bound.
2714 */
2715 uint32_t array_size = ir->sampler->as_dereference_array()
2716 ->array->type->array_size();
2717
2718 uint32_t max_used = sampler + array_size - 1;
2719 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2720 max_used += prog_data->base.binding_table.gather_texture_start;
2721 } else {
2722 max_used += prog_data->base.binding_table.texture_start;
2723 }
2724
2725 brw_mark_surface_used(&prog_data->base, max_used);
2726
2727 /* Emit code to evaluate the actual indexing expression */
2728 nonconst_sampler_index->accept(this);
2729 src_reg temp(this, glsl_type::uint_type);
2730 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2731 sampler_reg = emit_uniformize(temp);
2732 } else {
2733 /* Single sampler, or constant array index; the indexing expression
2734 * is just an immediate.
2735 */
2736 sampler_reg = src_reg(sampler);
2737 }
2738
2739 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2740 * emitting anything other than setting up the constant result.
2741 */
2742 if (ir->op == ir_tg4) {
2743 ir_constant *chan = ir->lod_info.component->as_constant();
2744 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2745 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2746 dst_reg result(this, ir->type);
2747 this->result = src_reg(result);
2748 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2749 return;
2750 }
2751 }
2752
2753 /* Should be lowered by do_lower_texture_projection */
2754 assert(!ir->projector);
2755
2756 /* Should be lowered */
2757 assert(!ir->offset || !ir->offset->type->is_array());
2758
2759 /* Generate code to compute all the subexpression trees. This has to be
2760 * done before loading any values into MRFs for the sampler message since
2761 * generating these values may involve SEND messages that need the MRFs.
2762 */
2763 src_reg coordinate;
2764 int coord_components = 0;
2765 if (ir->coordinate) {
2766 coord_components = ir->coordinate->type->vector_elements;
2767 ir->coordinate->accept(this);
2768 coordinate = this->result;
2769 }
2770
2771 src_reg shadow_comparitor;
2772 if (ir->shadow_comparitor) {
2773 ir->shadow_comparitor->accept(this);
2774 shadow_comparitor = this->result;
2775 }
2776
2777 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2778 src_reg offset_value;
2779 if (has_nonconstant_offset) {
2780 ir->offset->accept(this);
2781 offset_value = src_reg(this->result);
2782 }
2783
2784 src_reg lod, lod2, sample_index, mcs;
2785 switch (ir->op) {
2786 case ir_tex:
2787 lod = src_reg(0.0f);
2788 break;
2789 case ir_txf:
2790 case ir_txl:
2791 case ir_txs:
2792 ir->lod_info.lod->accept(this);
2793 lod = this->result;
2794 break;
2795 case ir_query_levels:
2796 lod = src_reg(0);
2797 break;
2798 case ir_txf_ms:
2799 ir->lod_info.sample_index->accept(this);
2800 sample_index = this->result;
2801
2802 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2803 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2804 else
2805 mcs = src_reg(0u);
2806 break;
2807 case ir_txd:
2808 ir->lod_info.grad.dPdx->accept(this);
2809 lod = this->result;
2810
2811 ir->lod_info.grad.dPdy->accept(this);
2812 lod2 = this->result;
2813 break;
2814 case ir_txb:
2815 case ir_lod:
2816 case ir_tg4:
2817 break;
2818 }
2819
2820 uint32_t constant_offset = 0;
2821 if (ir->offset != NULL && !has_nonconstant_offset) {
2822 constant_offset =
2823 brw_texture_offset(ir->offset->as_constant()->value.i,
2824 ir->offset->type->vector_elements);
2825 }
2826
2827 /* Stuff the channel select bits in the top of the texture offset */
2828 if (ir->op == ir_tg4)
2829 constant_offset |=
2830 gather_channel(ir->lod_info.component->as_constant()->value.i[0],
2831 sampler) << 16;
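/* E.g. gathering the .z channel stores 2 in bits 17:16, above the 4-bit
 * texel offsets packed by brw_texture_offset(); the generator emits this
 * dword into the sampler message header.
 */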
2832
2833 glsl_type const *type = ir->sampler->type;
2834 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2835 type->sampler_array;
2836
2837 this->result = src_reg(this, ir->type);
2838 dst_reg dest = dst_reg(this->result);
2839
2840 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2841 shadow_comparitor,
2842 lod, lod2, sample_index,
2843 constant_offset, offset_value,
2844 mcs, is_cube_array, sampler, sampler_reg);
2845 }
2846
2847 /**
2848 * Apply workarounds for Gen6 gather with UINT/SINT
2849 */
2850 void
2851 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2852 {
2853 if (!wa)
2854 return;
2855
2856 int width = (wa & WA_8BIT) ? 8 : 16;
2857 dst_reg dst_f = dst;
2858 dst_f.type = BRW_REGISTER_TYPE_F;
2859
2860 /* Convert from UNORM to UINT: e.g. an 8-bit format comes back as value/255.0f, so scaling by (1 << width) - 1 recovers the integer texel. */
2861 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2862 emit(MOV(dst, src_reg(dst_f)));
2863
2864 if (wa & WA_SIGN) {
2865 /* Reinterpret the UINT value as a signed INT value by
2866 * shifting the sign bit into place, then shifting back
2867 * preserving sign.
2868 */
2869 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2870 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2871 }
2872 }
2873
2874 /**
2875 * Set up the gather channel based on the swizzle, for gather4.
2876 */
2877 uint32_t
2878 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2879 {
2880 int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2881 switch (swiz) {
2882 case SWIZZLE_X: return 0;
2883 case SWIZZLE_Y:
2884 /* gather4 sampler is broken for green channel on RG32F --
2885 * we must ask for blue instead.
2886 */
2887 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2888 return 2;
2889 return 1;
2890 case SWIZZLE_Z: return 2;
2891 case SWIZZLE_W: return 3;
2892 default:
2893 unreachable("Not reached"); /* zero, one swizzles handled already */
2894 }
2895 }
2896
2897 void
2898 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2899 src_reg orig_val, uint32_t sampler,
2900 const glsl_type *dest_type)
2901 {
2902 int s = key->tex.swizzles[sampler];
2903
2904 dst_reg swizzled_result = dest;
2905
2906 if (op == ir_query_levels) {
2907 /* # levels is in .w */
2908 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2909 emit(MOV(swizzled_result, orig_val));
2910 return;
2911 }
2912
2913 if (op == ir_txs || dest_type == glsl_type::float_type
2914 || s == SWIZZLE_NOOP || op == ir_tg4) {
2915 emit(MOV(swizzled_result, orig_val));
2916 return;
2917 }
2918
2919
2920 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2921 int swizzle[4] = {0};
2922
2923 for (int i = 0; i < 4; i++) {
2924 switch (GET_SWZ(s, i)) {
2925 case SWIZZLE_ZERO:
2926 zero_mask |= (1 << i);
2927 break;
2928 case SWIZZLE_ONE:
2929 one_mask |= (1 << i);
2930 break;
2931 default:
2932 copy_mask |= (1 << i);
2933 swizzle[i] = GET_SWZ(s, i);
2934 break;
2935 }
2936 }
2937
2938 if (copy_mask) {
2939 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2940 swizzled_result.writemask = copy_mask;
2941 emit(MOV(swizzled_result, orig_val));
2942 }
2943
2944 if (zero_mask) {
2945 swizzled_result.writemask = zero_mask;
2946 emit(MOV(swizzled_result, src_reg(0.0f)));
2947 }
2948
2949 if (one_mask) {
2950 swizzled_result.writemask = one_mask;
2951 emit(MOV(swizzled_result, src_reg(1.0f)));
2952 }
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_return *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::visit(ir_discard *)
2963 {
2964 unreachable("not reached");
2965 }
2966
2967 void
2968 vec4_visitor::visit(ir_if *ir)
2969 {
2970 /* Don't point the annotation at the if statement, because then it plus
2971 * the then and else blocks get printed.
2972 */
2973 this->base_ir = ir->condition;
2974
2975 if (devinfo->gen == 6) {
2976 emit_if_gen6(ir);
2977 } else {
2978 enum brw_predicate predicate;
2979 emit_bool_to_cond_code(ir->condition, &predicate);
2980 emit(IF(predicate));
2981 }
2982
2983 visit_instructions(&ir->then_instructions);
2984
2985 if (!ir->else_instructions.is_empty()) {
2986 this->base_ir = ir->condition;
2987 emit(BRW_OPCODE_ELSE);
2988
2989 visit_instructions(&ir->else_instructions);
2990 }
2991
2992 this->base_ir = ir->condition;
2993 emit(BRW_OPCODE_ENDIF);
2994 }
2995
2996 void
2997 vec4_visitor::gs_emit_vertex(int stream_id)
2998 {
2999 unreachable("not reached");
3000 }
3001
3002 void
3003 vec4_visitor::visit(ir_emit_vertex *)
3004 {
3005 unreachable("not reached");
3006 }
3007
3008 void
3009 vec4_visitor::gs_end_primitive()
3010 {
3011 unreachable("not reached");
3012 }
3013
3014
3015 void
3016 vec4_visitor::visit(ir_end_primitive *)
3017 {
3018 unreachable("not reached");
3019 }
3020
3021 void
3022 vec4_visitor::visit(ir_barrier *)
3023 {
3024 unreachable("not reached");
3025 }
3026
3027 void
3028 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3029 dst_reg dst, src_reg offset,
3030 src_reg src0, src_reg src1)
3031 {
3032 unsigned mlen = 0;
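/* Build the message payload one MRF at a time: the offset always goes
 * first, and src0/src1 follow only when present (they are BAD_FILE for the
 * increment/decrement intrinsics handled by our caller).
 */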
3033
3034 /* Set the atomic operation offset. */
3035 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3036 mlen++;
3037
3038 /* Set the atomic operation arguments. */
3039 if (src0.file != BAD_FILE) {
3040 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3041 mlen++;
3042 }
3043
3044 if (src1.file != BAD_FILE) {
3045 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3046 mlen++;
3047 }
3048
3049 /* Emit the instruction. Note that this maps to the normal SIMD8
3050 * untyped atomic message on Ivy Bridge, but that's OK because
3051 * unused channels will be masked out.
3052 */
3053 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3054 brw_message_reg(0),
3055 src_reg(surf_index), src_reg(atomic_op));
3056 inst->mlen = mlen;
3057 }
3058
3059 void
3060 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3061 src_reg offset)
3062 {
3063 /* Set the surface read offset. */
3064 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3065
3066 /* Emit the instruction. Note that this maps to the normal SIMD8
3067 * untyped surface read message, but that's OK because unused
3068 * channels will be masked out.
3069 */
3070 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3071 brw_message_reg(0),
3072 src_reg(surf_index), src_reg(1));
3073 inst->mlen = 1;
3074 }
3075
3076 void
3077 vec4_visitor::emit_ndc_computation()
3078 {
3079 /* Get the position */
3080 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3081
3082 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3083 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3084 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3085
3086 current_annotation = "NDC";
3087 dst_reg ndc_w = ndc;
3088 ndc_w.writemask = WRITEMASK_W;
3089 src_reg pos_w = pos;
3090 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3091 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3092
3093 dst_reg ndc_xyz = ndc;
3094 ndc_xyz.writemask = WRITEMASK_XYZ;
3095
3096 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3097 }
3098
3099 void
3100 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3101 {
3102 if (devinfo->gen < 6 &&
3103 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3104 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3105 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3106 dst_reg header1_w = header1;
3107 header1_w.writemask = WRITEMASK_W;
3108
3109 emit(MOV(header1, 0u));
3110
3111 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3112 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3113
3114 current_annotation = "Point size";
3115 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3116 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3117 }
3118
3119 if (key->userclip_active) {
3120 current_annotation = "Clipping flags";
3121 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3122 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3123
3124 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3125 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3126 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3127
3128 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3129 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3130 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3131 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3132 }
3133
3134 /* i965 clipping workaround:
3135 * 1) Test for -ve rhw
3136 * 2) If set,
3137 * set ndc = (0,0,0,0)
3138 * set ucp[6] = 1
3139 *
3140 * Later, clipping will detect ucp[6] and ensure the primitive is
3141 * clipped against all fixed planes.
3142 */
3143 if (devinfo->has_negative_rhw_bug) {
3144 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3145 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3146 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3147 vec4_instruction *inst;
3148 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3149 inst->predicate = BRW_PREDICATE_NORMAL;
3150 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3151 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3152 inst->predicate = BRW_PREDICATE_NORMAL;
3153 }
3154
3155 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3156 } else if (devinfo->gen < 6) {
3157 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3158 } else {
3159 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3160 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3161 dst_reg reg_w = reg;
3162 reg_w.writemask = WRITEMASK_W;
3163 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3164 reg_as_src.type = reg_w.type;
3165 reg_as_src.swizzle = brw_swizzle_for_size(1);
3166 emit(MOV(reg_w, reg_as_src));
3167 }
3168 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3169 dst_reg reg_y = reg;
3170 reg_y.writemask = WRITEMASK_Y;
3171 reg_y.type = BRW_REGISTER_TYPE_D;
3172 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3173 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3174 }
3175 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3176 dst_reg reg_z = reg;
3177 reg_z.writemask = WRITEMASK_Z;
3178 reg_z.type = BRW_REGISTER_TYPE_D;
3179 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3180 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3181 }
3182 }
3183 }
3184
3185 void
3186 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3187 {
3188 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3189 *
3190 * "If a linked set of shaders forming the vertex stage contains no
3191 * static write to gl_ClipVertex or gl_ClipDistance, but the
3192 * application has requested clipping against user clip planes through
3193 * the API, then the coordinate written to gl_Position is used for
3194 * comparison against the user clip planes."
3195 *
3196 * This function is only called if the shader didn't write to
3197 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3198 * if the user wrote to it; otherwise we use gl_Position.
3199 */
3200 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3201 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3202 clip_vertex = VARYING_SLOT_POS;
3203 }
3204
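/* Each DP4 produces one clip distance, so a vec4 output slot holds four of
 * them: this is called with offset 0 for user planes 0-3 and offset 4 for
 * planes 4-7.
 */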
3205 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3206 ++i) {
3207 reg.writemask = 1 << i;
3208 emit(DP4(reg,
3209 src_reg(output_reg[clip_vertex]),
3210 src_reg(this->userplane[i + offset])));
3211 }
3212 }
3213
3214 vec4_instruction *
3215 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3216 {
3217 assert(varying < VARYING_SLOT_MAX);
3218 assert(output_reg[varying].type == reg.type);
3219 current_annotation = output_reg_annotation[varying];
3220 /* Copy the register, saturating if necessary */
3221 return emit(MOV(reg, src_reg(output_reg[varying])));
3222 }
3223
3224 void
3225 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3226 {
3227 reg.type = BRW_REGISTER_TYPE_F;
3228 output_reg[varying].type = reg.type;
3229
3230 switch (varying) {
3231 case VARYING_SLOT_PSIZ:
3232 {
3233 /* PSIZ is always in slot 0, and is coupled with other flags. */
3234 current_annotation = "indices, point width, clip flags";
3235 emit_psiz_and_flags(reg);
3236 break;
3237 }
3238 case BRW_VARYING_SLOT_NDC:
3239 current_annotation = "NDC";
3240 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3241 break;
3242 case VARYING_SLOT_POS:
3243 current_annotation = "gl_Position";
3244 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3245 break;
3246 case VARYING_SLOT_EDGE:
3247 /* This is present when doing unfilled polygons. We're supposed to copy
3248 * the edge flag from the user-provided vertex array
3249 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3250 * of that attribute (starts as 1.0f). This is then used in clipping to
3251 * determine which edges should be drawn as wireframe.
3252 */
3253 current_annotation = "edge flag";
3254 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3255 glsl_type::float_type, WRITEMASK_XYZW))));
3256 break;
3257 case BRW_VARYING_SLOT_PAD:
3258 /* No need to write to this slot */
3259 break;
3260 case VARYING_SLOT_COL0:
3261 case VARYING_SLOT_COL1:
3262 case VARYING_SLOT_BFC0:
3263 case VARYING_SLOT_BFC1: {
3264 /* These built-in varyings are only supported in compatibility mode,
3265 * and we only support GS in core profile. So, this must be a vertex
3266 * shader.
3267 */
3268 assert(stage == MESA_SHADER_VERTEX);
3269 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3270 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3271 inst->saturate = true;
3272 break;
3273 }
3274
3275 default:
3276 emit_generic_urb_slot(reg, varying);
3277 break;
3278 }
3279 }
3280
3281 static int
3282 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3283 {
3284 if (devinfo->gen >= 6) {
3285 /* URB data written (does not include the message header reg) must
3286 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3287 * section 5.4.3.2.2: URB_INTERLEAVED.
3288 *
3289 * URB entries are allocated on a multiple of 1024 bits, so an
3290 * extra 128 bits written here to make the end align to 256 is
3291 * no problem.
3292 */
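/* mlen includes the message header register, so the URB data is mlen - 1
 * registers; rounding an even mlen up to odd keeps the data a multiple of
 * two registers.  E.g. header + 3 slots gives mlen 4, padded to 5.
 */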
3293 if ((mlen % 2) != 1)
3294 mlen++;
3295 }
3296
3297 return mlen;
3298 }
3299
3300
3301 /**
3302 * Generates the VUE payload plus the necessary URB write instructions to
3303 * output it.
3304 *
3305 * The VUE layout is documented in Volume 2a.
3306 */
3307 void
3308 vec4_visitor::emit_vertex()
3309 {
3310 /* MRF 0 is reserved for the debugger, so start with message header
3311 * in MRF 1.
3312 */
3313 int base_mrf = 1;
3314 int mrf = base_mrf;
3315 /* In the process of generating our URB write message contents, we
3316 * may need to unspill a register or load from an array. Those
3317 * reads would use MRFs 14-15.
3318 */
3319 int max_usable_mrf = 13;
3320
3321 /* The following assertion verifies that max_usable_mrf causes an
3322 * even-numbered amount of URB write data, which will meet gen6's
3323 * requirements for length alignment.
3324 */
3325 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3326
3327 /* First mrf is the g0-based message header containing URB handles and
3328 * such.
3329 */
3330 emit_urb_write_header(mrf++);
3331
3332 if (devinfo->gen < 6) {
3333 emit_ndc_computation();
3334 }
3335
3336 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3337 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3338 current_annotation = "user clip distances";
3339
3340 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3341 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3342
3343 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3344 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3345 }
3346
3347 /* We may need to split this up into several URB writes, so do them in a
3348 * loop.
3349 */
3350 int slot = 0;
3351 bool complete = false;
3352 do {
3353 /* URB offset is in URB row increments, and each of our MRFs is half of
3354 * one of those, since we're doing interleaved writes.
3355 */
3356 int offset = slot / 2;
3357
3358 mrf = base_mrf + 1;
3359 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3360 emit_urb_slot(dst_reg(MRF, mrf++),
3361 prog_data->vue_map.slot_to_varying[slot]);
3362
3363 /* If this was max_usable_mrf, we can't fit anything more into this
3364 * URB WRITE.
3365 */
3366 if (mrf > max_usable_mrf) {
3367 slot++;
3368 break;
3369 }
3370 }
3371
3372 complete = slot >= prog_data->vue_map.num_slots;
3373 current_annotation = "URB write";
3374 vec4_instruction *inst = emit_urb_write_opcode(complete);
3375 inst->base_mrf = base_mrf;
3376 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3377 inst->offset += offset;
3378 } while (!complete);
3379 }
3380
3381
3382 src_reg
3383 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3384 src_reg *reladdr, int reg_offset)
3385 {
3386 /* Because we store the values to scratch interleaved like our
3387 * vertex data, we need to scale the vec4 index by 2.
3388 */
3389 int message_header_scale = 2;
3390
3391 /* Pre-gen6, the message header uses byte offsets instead of vec4
3392 * (16-byte) offset units.
3393 */
3394 if (devinfo->gen < 6)
3395 message_header_scale *= 16;
3396
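/* E.g. reg_offset 3 becomes offset 6 on Gen6+ (scaled by 2 as explained
 * above) and 96 on Gen4-5, where the header wants a byte offset.
 */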
3397 if (reladdr) {
3398 src_reg index = src_reg(this, glsl_type::int_type);
3399
3400 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3401 src_reg(reg_offset)));
3402 emit_before(block, inst, MUL(dst_reg(index), index,
3403 src_reg(message_header_scale)));
3404
3405 return index;
3406 } else {
3407 return src_reg(reg_offset * message_header_scale);
3408 }
3409 }
3410
3411 src_reg
3412 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3413 src_reg *reladdr, int reg_offset)
3414 {
3415 if (reladdr) {
3416 src_reg index = src_reg(this, glsl_type::int_type);
3417
3418 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3419 src_reg(reg_offset)));
3420
3421 /* Pre-gen6, the message header uses byte offsets instead of vec4
3422 * (16-byte) offset units.
3423 */
3424 if (devinfo->gen < 6) {
3425 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3426 }
3427
3428 return index;
3429 } else if (devinfo->gen >= 8) {
3430 /* Store the offset in a GRF so we can send-from-GRF. */
3431 src_reg offset = src_reg(this, glsl_type::int_type);
3432 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3433 return offset;
3434 } else {
3435 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3436 return src_reg(reg_offset * message_header_scale);
3437 }
3438 }
3439
3440 /**
3441 * Emits an instruction before @inst to load the value named by @orig_src
3442 * from scratch space at @base_offset to @temp.
3443 *
3444 * @base_offset is measured in 32-byte units (the size of a register).
3445 */
3446 void
3447 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3448 dst_reg temp, src_reg orig_src,
3449 int base_offset)
3450 {
3451 int reg_offset = base_offset + orig_src.reg_offset;
3452 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3453 reg_offset);
3454
3455 emit_before(block, inst, SCRATCH_READ(temp, index));
3456 }
3457
3458 /**
3459 * Emits an instruction after @inst to store the value to be written
3460 * to @orig_dst to scratch space at @base_offset, from @temp.
3461 *
3462 * @base_offset is measured in 32-byte units (the size of a register).
3463 */
3464 void
3465 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3466 int base_offset)
3467 {
3468 int reg_offset = base_offset + inst->dst.reg_offset;
3469 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3470 reg_offset);
3471
3472 /* Create a temporary register to store *inst's result in.
3473 *
3474 * We have to be careful in MOVing from our temporary result register in
3475 * the scratch write. If we swizzle from channels of the temporary that
3476 * weren't initialized, it will confuse live interval analysis, which will
3477 * make spilling fail to make progress.
3478 */
3479 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3480 inst->dst.type),
3481 brw_swizzle_for_mask(inst->dst.writemask));
3482 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3483 inst->dst.writemask));
3484 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
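/* Propagate the predicate to the scratch write, except for SEL: a SEL
 * writes its destination on both predicate outcomes (it merely picks a
 * different source), so spilling its result must be unconditional or the
 * channels taken from src1 would never make it to scratch.
 */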
3485 if (inst->opcode != BRW_OPCODE_SEL)
3486 write->predicate = inst->predicate;
3487 write->ir = inst->ir;
3488 write->annotation = inst->annotation;
3489 inst->insert_after(block, write);
3490
3491 inst->dst.file = temp.file;
3492 inst->dst.reg = temp.reg;
3493 inst->dst.reg_offset = temp.reg_offset;
3494 inst->dst.reladdr = NULL;
3495 }
3496
3497 /**
3498 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3499 * adds the scratch read(s) before \p inst. The function also checks for
3500 * recursive reladdr scratch accesses, issuing the corresponding scratch
3501 * loads and rewriting reladdr references accordingly.
3502 *
3503 * \return \p src if it did not require a scratch load, otherwise, the
3504 * register holding the result of the scratch load that the caller should
3505 * use to rewrite src.
3506 */
3507 src_reg
3508 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3509 vec4_instruction *inst, src_reg src)
3510 {
3511 /* Resolve recursive reladdr scratch access by calling ourselves
3512 * with src.reladdr
3513 */
3514 if (src.reladdr)
3515 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3516 *src.reladdr);
3517
3518 /* Now handle scratch access on src */
3519 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3520 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3521 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3522 src.reg = temp.reg;
3523 src.reg_offset = temp.reg_offset;
3524 src.reladdr = NULL;
3525 }
3526
3527 return src;
3528 }
3529
3530 /**
3531 * We can't generally support array access in GRF space, because a
3532 * single instruction's destination can only span 2 contiguous
3533 * registers. So, we send all GRF arrays that get variable index
3534 * access to scratch space.
3535 */
3536 void
3537 vec4_visitor::move_grf_array_access_to_scratch()
3538 {
3539 int scratch_loc[this->alloc.count];
3540 memset(scratch_loc, -1, sizeof(scratch_loc));
3541
3542 /* First, calculate the set of virtual GRFs that need to be punted
3543 * to scratch due to having any array access on them, and where in
3544 * scratch.
3545 */
3546 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3547 if (inst->dst.file == GRF && inst->dst.reladdr) {
3548 if (scratch_loc[inst->dst.reg] == -1) {
3549 scratch_loc[inst->dst.reg] = last_scratch;
3550 last_scratch += this->alloc.sizes[inst->dst.reg];
3551 }
3552
3553 for (src_reg *iter = inst->dst.reladdr;
3554 iter->reladdr;
3555 iter = iter->reladdr) {
3556 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3557 scratch_loc[iter->reg] = last_scratch;
3558 last_scratch += this->alloc.sizes[iter->reg];
3559 }
3560 }
3561 }
3562
3563 for (int i = 0 ; i < 3; i++) {
3564 for (src_reg *iter = &inst->src[i];
3565 iter->reladdr;
3566 iter = iter->reladdr) {
3567 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3568 scratch_loc[iter->reg] = last_scratch;
3569 last_scratch += this->alloc.sizes[iter->reg];
3570 }
3571 }
3572 }
3573 }
3574
3575 /* Now, for anything that will be accessed through scratch, rewrite
3576 * it to load/store. Note that this is a _safe list walk, because
3577 * we may generate a new scratch_write instruction after the one
3578 * we're processing.
3579 */
3580 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3581 /* Set up the annotation tracking for new generated instructions. */
3582 base_ir = inst->ir;
3583 current_annotation = inst->annotation;
3584
3585 /* First handle scratch access on the dst. Notice we have to handle
3586 * the case where the dst's reladdr also points to scratch space.
3587 */
3588 if (inst->dst.reladdr)
3589 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3590 *inst->dst.reladdr);
3591
3592 /* Now that we have handled any (possibly recursive) reladdr scratch
3593 * accesses for dst we can safely do the scratch write for dst itself
3594 */
3595 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3596 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3597
3598 /* Now handle scratch access on any src. In this case, since inst->src[i]
3599 * already is a src_reg, we can just call emit_resolve_reladdr with
3600 * inst->src[i] and it will take care of handling scratch loads for
3601 * both src and src.reladdr (recursively).
3602 */
3603 for (int i = 0 ; i < 3; i++) {
3604 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3605 inst->src[i]);
3606 }
3607 }
3608 }
3609
3610 /**
3611 * Emits an instruction before @inst to load the value named by @orig_src
3612 * from the pull constant buffer (surface) at @base_offset to @temp.
3613 */
3614 void
3615 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3616 dst_reg temp, src_reg orig_src,
3617 int base_offset)
3618 {
3619 int reg_offset = base_offset + orig_src.reg_offset;
3620 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3621 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3622 reg_offset);
3623
3624 emit_pull_constant_load_reg(temp,
3625 index,
3626 offset,
3627 block, inst);
3628 }
3629
3630 /**
3631 * Implements array access of uniforms by inserting a
3632 * PULL_CONSTANT_LOAD instruction.
3633 *
3634 * Unlike temporary GRF array access (where we don't support it due to
3635 * the difficulty of doing relative addressing on instruction
3636 * destinations), we could potentially do array access of uniforms
3637 * that were loaded in GRF space as push constants. In real-world
3638 * usage we've seen, though, the arrays being used are always larger
3639 * than we could load as push constants, so just always move all
3640 * uniform array access out to a pull constant buffer.
3641 */
3642 void
3643 vec4_visitor::move_uniform_array_access_to_pull_constants()
3644 {
3645 int pull_constant_loc[this->uniforms];
3646 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3647 bool nested_reladdr;
3648
3649 /* Walk through and find array access of uniforms. Put a copy of that
3650 * uniform in the pull constant buffer.
3651 *
3652 * Note that we don't move constant-indexed accesses to arrays. No
3653 * testing has been done of the performance impact of this choice.
3654 */
3655 do {
3656 nested_reladdr = false;
3657
3658 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3659 for (int i = 0 ; i < 3; i++) {
3660 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3661 continue;
3662
3663 int uniform = inst->src[i].reg;
3664
3665 if (inst->src[i].reladdr->reladdr)
3666 nested_reladdr = true; /* will need another pass */
3667
3668 /* If this array isn't already present in the pull constant buffer,
3669 * add it.
3670 */
3671 if (pull_constant_loc[uniform] == -1) {
3672 const gl_constant_value **values =
3673 &stage_prog_data->param[uniform * 4];
3674
3675 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3676
3677 assert(uniform < uniform_array_size);
3678 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3679 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3680 = values[j];
3681 }
3682 }
3683
3684 /* Set up the annotation tracking for newly generated instructions. */
3685 base_ir = inst->ir;
3686 current_annotation = inst->annotation;
3687
3688 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3689
3690 emit_pull_constant_load(block, inst, temp, inst->src[i],
3691 pull_constant_loc[uniform]);
3692
3693 inst->src[i].file = temp.file;
3694 inst->src[i].reg = temp.reg;
3695 inst->src[i].reg_offset = temp.reg_offset;
3696 inst->src[i].reladdr = NULL;
3697 }
3698 }
3699 } while (nested_reladdr);
3700
3701 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3702 * no need to track them as larger-than-vec4 objects. This will be
3703 * relied on in cutting out unused uniform vectors from push
3704 * constants.
3705 */
3706 split_uniform_registers();
3707 }
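
/* Sketch of the transformation above (illustrative only): a uniform array
 * access with a variable index such as
 *
 *    mov dst, uniform_array[a0]
 *
 * has the array copied into the pull constant buffer (once per array), and
 * the access becomes roughly
 *
 *    pull_constant_load tmp, pull_constant_loc[uniform_array], a0
 *    mov dst, tmp
 *
 * so no UNIFORM-file source with a reladdr survives this pass.
 */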
3708
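/**
 * Rewrite a negated unsigned (UD) source so its consumer sees no modifier.
 *
 * If @reg is a UD register with the negate flag set, emit a MOV of the
 * negated value into a fresh temporary and repoint @reg at that temporary.
 */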
3709 void
3710 vec4_visitor::resolve_ud_negate(src_reg *reg)
3711 {
3712 if (reg->type != BRW_REGISTER_TYPE_UD ||
3713 !reg->negate)
3714 return;
3715
3716 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3717 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3718 *reg = temp;
3719 }
3720
3721 /**
3722 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3723 *
3724 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3725 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
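 *
 * For example, a "true" result may come back with only the low bit known to
 * be set; AND-ing with 1 leaves just that bit (1), and the negated MOV turns
 * it into ~0 (-1), while a "false" result stays 0 throughout.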
3726 */
3727 void
3728 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3729 {
3730 assert(devinfo->gen <= 5);
3731
3732 if (!rvalue->type->is_boolean())
3733 return;
3734
3735 src_reg and_result = src_reg(this, rvalue->type);
3736 src_reg neg_result = src_reg(this, rvalue->type);
3737 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3738 emit(MOV(dst_reg(neg_result), negate(and_result)));
3739 *reg = neg_result;
3740 }
3741
3742 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3743 void *log_data,
3744 struct gl_program *prog,
3745 const struct brw_vue_prog_key *key,
3746 struct brw_vue_prog_data *prog_data,
3747 struct gl_shader_program *shader_prog,
3748 gl_shader_stage stage,
3749 void *mem_ctx,
3750 bool no_spills,
3751 int shader_time_index)
3752 : backend_shader(compiler, log_data, mem_ctx,
3753 shader_prog, prog, &prog_data->base, stage),
3754 key(key),
3755 prog_data(prog_data),
3756 sanity_param_count(0),
3757 fail_msg(NULL),
3758 first_non_payload_grf(0),
3759 need_all_constants_in_pull_buffer(false),
3760 no_spills(no_spills),
3761 shader_time_index(shader_time_index),
3762 last_scratch(0)
3763 {
3764 this->failed = false;
3765
3766 this->base_ir = NULL;
3767 this->current_annotation = NULL;
3768 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3769
3770 this->variable_ht = hash_table_ctor(0,
3771 hash_table_pointer_hash,
3772 hash_table_pointer_compare);
3773
3774 this->virtual_grf_start = NULL;
3775 this->virtual_grf_end = NULL;
3776 this->live_intervals = NULL;
3777
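/* Gen7+ has no MRF registers; the upper GRFs stand in for them (starting at
 * GEN7_MRF_HACK_START), so cap the allocatable GRF range below that point.
 */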
3778 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3779
3780 this->uniforms = 0;
3781
3782 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3783 * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3784 */
3785 this->uniform_array_size = 1;
3786 if (prog_data) {
3787 this->uniform_array_size =
3788 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3789 }
3790
3791 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3792 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3793 }
3794
3795 vec4_visitor::~vec4_visitor()
3796 {
3797 hash_table_dtor(this->variable_ht);
3798 }
3799
3800
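/**
 * Mark the compile as failed and record the failure message.
 *
 * Only the first failure is recorded; the message is prefixed with the
 * stage abbreviation and printed to stderr when debugging is enabled.
 */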
3801 void
3802 vec4_visitor::fail(const char *format, ...)
3803 {
3804 va_list va;
3805 char *msg;
3806
3807 if (failed)
3808 return;
3809
3810 failed = true;
3811
3812 va_start(va, format);
3813 msg = ralloc_vasprintf(mem_ctx, format, va);
3814 va_end(va);
3815 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3816
3817 this->fail_msg = msg;
3818
3819 if (debug_enabled) {
3820 fprintf(stderr, "%s", msg);
3821 }
3822 }
3823
3824 } /* namespace brw */