src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = 14;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = 13;
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 void
 280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 281 {
 282    static enum opcode dot_opcodes[] = {
 283       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 284    };
 285
 286    emit(dot_opcodes[elements - 2], dst, src0, src1);
 287 }
 288
 289 src_reg
 290 vec4_visitor::fix_3src_operand(const src_reg &src)
 291 {
 292    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 293     * able to use vertical stride of zero to replicate the vec4 uniform, like
 294     *
 295     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 296     *
 297     * But you can't, since vertical stride is always four in three-source
 298     * instructions. Instead, insert a MOV instruction to do the replication so
 299     * that the three-source instruction can consume it.
 300     */
 301
 302    /* The MOV is only needed if the source is a uniform or immediate. */
 303    if (src.file != UNIFORM && src.file != IMM)
 304       return src;
 305
 306    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 307       return src;
 308
 309    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 310    expanded.type = src.type;
 311    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 312    return src_reg(expanded);
 313 }
 314
 315 src_reg
 316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 317 {
 318    if (!src.abs && !src.negate)
 319       return src;
 320
 321    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 322    resolved.type = src.type;
 323    emit(MOV(resolved, src));
 324
 325    return src_reg(resolved);
 326 }
 327
 328 src_reg
 329 vec4_visitor::fix_math_operand(const src_reg &src)
 330 {
 331    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 332       return src;
 333
 334    /* The gen6 math instruction ignores the source modifiers --
 335     * swizzle, abs, negate, and at least some parts of the register
 336     * region description.
 337     *
 338     * Rather than trying to enumerate all these cases, *always* expand the
 339     * operand to a temp GRF for gen6.
 340     *
 341     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 342     * can't use.
 343     */
 344
 345    if (devinfo->gen == 7 && src.file != IMM)
 346       return src;
 347
 348    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 349    expanded.type = src.type;
 350    emit(MOV(expanded, src));
 351    return src_reg(expanded);
 352 }
 353
 354 vec4_instruction *
 355 vec4_visitor::emit_math(enum opcode opcode,
 356                         const dst_reg &dst,
 357                         const src_reg &src0, const src_reg &src1)
 358 {
 359    vec4_instruction *math =
 360       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 361
 362    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 363       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 364       math->dst = dst_reg(this, glsl_type::vec4_type);
 365       math->dst.type = dst.type;
 366       math = emit(MOV(dst, src_reg(math->dst)));
 367    } else if (devinfo->gen < 6) {
 368       math->base_mrf = 1;
 369       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 370    }
 371
 372    return math;
 373 }
 374
 375 void
 376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 377 {
 378    if (devinfo->gen < 7) {
 379       unreachable("ir_unop_pack_half_2x16 should be lowered");
 380    }
 381
 382    assert(dst.type == BRW_REGISTER_TYPE_UD);
 383    assert(src0.type == BRW_REGISTER_TYPE_F);
 384
 385    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 386     *
 387     *   Because this instruction does not have a 16-bit floating-point type,
 388     *   the destination data type must be Word (W).
 389     *
 390     *   The destination must be DWord-aligned and specify a horizontal stride
 391     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 392     *   each destination channel and the upper word is not modified.
 393     *
 394     * The above restriction implies that the f32to16 instruction must use
 395     * align1 mode, because only in align1 mode is it possible to specify
 396     * horizontal stride.  We choose here to defy the hardware docs and emit
 397     * align16 instructions.
 398     *
 399     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 400     * instructions. I was partially successful in that the code passed all
 401     * tests.  However, the code was dubiously correct and fragile, and the
 402     * tests were not harsh enough to probe that frailty. Not trusting the
 403     * code, I chose instead to remain in align16 mode in defiance of the hw
 404     * docs).
 405     *
 406     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 407     * simulator, emitting a f32to16 in align16 mode with UD as destination
 408     * data type is safe. The behavior differs from that specified in the PRM
 409     * in that the upper word of each destination channel is cleared to 0.
 410     */
 411
 412    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 413    src_reg tmp_src(tmp_dst);
 414
 415 #if 0
 416    /* Verify the undocumented behavior on which the following instructions
 417     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 418     * then the result of the bit-or instruction below will be incorrect.
 419     *
 420     * You should inspect the disasm output in order to verify that the MOV is
 421     * not optimized away.
 422     */
 423    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 424 #endif
 425
 426    /* Give tmp the form below, where "." means untouched.
 427     *
 428     *     w z          y          x w z          y          x
 429     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 430     *
 431     * That the upper word of each write-channel be 0 is required for the
 432     * following bit-shift and bit-or instructions to work. Note that this
 433     * relies on the undocumented hardware behavior mentioned above.
 434     */
 435    tmp_dst.writemask = WRITEMASK_XY;
 436    emit(F32TO16(tmp_dst, src0));
 437
 438    /* Give the write-channels of dst the form:
 439     *   0xhhhh0000
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 442    emit(SHL(dst, tmp_src, src_reg(16u)));
 443
 444    /* Finally, give the write-channels of dst the form of packHalf2x16's
 445     * output:
 446     *   0xhhhhllll
 447     */
 448    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 449    emit(OR(dst, src_reg(dst), tmp_src));
 450 }
 451
 452 void
 453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 454 {
 455    if (devinfo->gen < 7) {
 456       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 457    }
 458
 459    assert(dst.type == BRW_REGISTER_TYPE_F);
 460    assert(src0.type == BRW_REGISTER_TYPE_UD);
 461
 462    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 463     *
 464     *   Because this instruction does not have a 16-bit floating-point type,
 465     *   the source data type must be Word (W). The destination type must be
 466     *   F (Float).
 467     *
 468     * To use W as the source data type, we must adjust horizontal strides,
 469     * which is only possible in align1 mode. All my [chadv] attempts at
 470     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 471     * Piglit tests, so I gave up.
 472     *
 473     * I've verified that, on gen7 hardware and the simulator, it is safe to
 474     * emit f16to32 in align16 mode with UD as source data type.
 475     */
 476
 477    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 478    src_reg tmp_src(tmp_dst);
 479
 480    tmp_dst.writemask = WRITEMASK_X;
 481    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 482
 483    tmp_dst.writemask = WRITEMASK_Y;
 484    emit(SHR(tmp_dst, src0, src_reg(16u)));
 485
 486    dst.writemask = WRITEMASK_XY;
 487    emit(F16TO32(dst, tmp_src));
 488 }
 489
 490 void
 491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 492 {
 493    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 494     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 495     * is not suitable to generate the shift values, but we can use the packed
 496     * vector float and a type-converting MOV.
 497     */
 498    dst_reg shift(this, glsl_type::uvec4_type);
 499    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 500
 501    dst_reg shifted(this, glsl_type::uvec4_type);
 502    src0.swizzle = BRW_SWIZZLE_XXXX;
 503    emit(SHR(shifted, src0, src_reg(shift)));
 504
 505    shifted.type = BRW_REGISTER_TYPE_UB;
 506    dst_reg f(this, glsl_type::vec4_type);
 507    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 508
 509    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 510 }
 511
 512 void
 513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 514 {
 515    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 516     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 517     * is not suitable to generate the shift values, but we can use the packed
 518     * vector float and a type-converting MOV.
 519     */
 520    dst_reg shift(this, glsl_type::uvec4_type);
 521    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 522
 523    dst_reg shifted(this, glsl_type::uvec4_type);
 524    src0.swizzle = BRW_SWIZZLE_XXXX;
 525    emit(SHR(shifted, src0, src_reg(shift)));
 526
 527    shifted.type = BRW_REGISTER_TYPE_B;
 528    dst_reg f(this, glsl_type::vec4_type);
 529    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 530
 531    dst_reg scaled(this, glsl_type::vec4_type);
 532    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 533
 534    dst_reg max(this, glsl_type::vec4_type);
 535    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 536    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 537 }
 538
 539 void
 540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 541 {
 542    dst_reg saturated(this, glsl_type::vec4_type);
 543    vec4_instruction *inst = emit(MOV(saturated, src0));
 544    inst->saturate = true;
 545
 546    dst_reg scaled(this, glsl_type::vec4_type);
 547    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 548
 549    dst_reg rounded(this, glsl_type::vec4_type);
 550    emit(RNDE(rounded, src_reg(scaled)));
 551
 552    dst_reg u(this, glsl_type::uvec4_type);
 553    emit(MOV(u, src_reg(rounded)));
 554
 555    src_reg bytes(u);
 556    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 557 }
 558
 559 void
 560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 561 {
 562    dst_reg max(this, glsl_type::vec4_type);
 563    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 564
 565    dst_reg min(this, glsl_type::vec4_type);
 566    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 567
 568    dst_reg scaled(this, glsl_type::vec4_type);
 569    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 570
 571    dst_reg rounded(this, glsl_type::vec4_type);
 572    emit(RNDE(rounded, src_reg(scaled)));
 573
 574    dst_reg i(this, glsl_type::ivec4_type);
 575    emit(MOV(i, src_reg(rounded)));
 576
 577    src_reg bytes(i);
 578    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 579 }
 580
 581 void
 582 vec4_visitor::visit_instructions(const exec_list *list)
 583 {
 584    foreach_in_list(ir_instruction, ir, list) {
 585       base_ir = ir;
 586       ir->accept(this);
 587    }
 588 }
 589
 590 /**
 591  * Returns the minimum number of vec4 elements needed to pack a type.
 592  *
 593  * For simple types, it will return 1 (a single vec4); for matrices, the
 594  * number of columns; for array and struct, the sum of the vec4_size of
 595  * each of its elements; and for sampler and atomic, zero.
 596  *
 597  * This method is useful to calculate how much register space is needed to
 598  * store a particular type.
 599  */
 600 int
 601 vec4_visitor::type_size(const struct glsl_type *type)
 602 {
 603    unsigned int i;
 604    int size;
 605
 606    switch (type->base_type) {
 607    case GLSL_TYPE_UINT:
 608    case GLSL_TYPE_INT:
 609    case GLSL_TYPE_FLOAT:
 610    case GLSL_TYPE_BOOL:
 611       if (type->is_matrix()) {
 612          return type->matrix_columns;
 613       } else {
 614          /* Regardless of size of vector, it gets a vec4. This is bad
 615           * packing for things like floats, but otherwise arrays become a
 616           * mess.  Hopefully a later pass over the code can pack scalars
 617           * down if appropriate.
 618           */
 619          return 1;
 620       }
 621    case GLSL_TYPE_ARRAY:
 622       assert(type->length > 0);
 623       return type_size(type->fields.array) * type->length;
 624    case GLSL_TYPE_STRUCT:
 625       size = 0;
 626       for (i = 0; i < type->length; i++) {
 627          size += type_size(type->fields.structure[i].type);
 628       }
 629       return size;
 630    case GLSL_TYPE_SUBROUTINE:
 631       return 1;
 632
 633    case GLSL_TYPE_SAMPLER:
 634       /* Samplers take up no register space, since they're baked in at
 635        * link time.
 636        */
 637       return 0;
 638    case GLSL_TYPE_ATOMIC_UINT:
 639       return 0;
 640    case GLSL_TYPE_IMAGE:
 641       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 642    case GLSL_TYPE_VOID:
 643    case GLSL_TYPE_DOUBLE:
 644    case GLSL_TYPE_ERROR:
 645    case GLSL_TYPE_INTERFACE:
 646       unreachable("not reached");
 647    }
 648
 649    return 0;
 650 }
 651
 652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 653 {
 654    init();
 655
 656    this->file = GRF;
 657    this->reg = v->alloc.allocate(v->type_size(type));
 658
 659    if (type->is_array() || type->is_record()) {
 660       this->swizzle = BRW_SWIZZLE_NOOP;
 661    } else {
 662       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 663    }
 664
 665    this->type = brw_type_for_base_type(type);
 666 }
 667
 668 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 669 {
 670    assert(size > 0);
 671
 672    init();
 673
 674    this->file = GRF;
 675    this->reg = v->alloc.allocate(v->type_size(type) * size);
 676
 677    this->swizzle = BRW_SWIZZLE_NOOP;
 678
 679    this->type = brw_type_for_base_type(type);
 680 }
 681
 682 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 683 {
 684    init();
 685
 686    this->file = GRF;
 687    this->reg = v->alloc.allocate(v->type_size(type));
 688
 689    if (type->is_array() || type->is_record()) {
 690       this->writemask = WRITEMASK_XYZW;
 691    } else {
 692       this->writemask = (1 << type->vector_elements) - 1;
 693    }
 694
 695    this->type = brw_type_for_base_type(type);
 696 }
 697
 698 void
 699 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
 700                                           unsigned n)
 701 {
 702    static const gl_constant_value zero = { 0 };
 703
 704    for (unsigned i = 0; i < n; ++i)
 705       stage_prog_data->param[4 * uniforms + i] = &values[i];
 706
 707    for (unsigned i = n; i < 4; ++i)
 708       stage_prog_data->param[4 * uniforms + i] = &zero;
 709
 710    uniform_vector_size[uniforms++] = n;
 711 }
 712
 713 /* Our support for uniforms is piggy-backed on the struct
 714  * gl_fragment_program, because that's where the values actually
 715  * get stored, rather than in some global gl_shader_program uniform
 716  * store.
 717  */
 718 void
 719 vec4_visitor::setup_uniform_values(ir_variable *ir)
 720 {
 721    int namelen = strlen(ir->name);
 722
 723    /* The data for our (non-builtin) uniforms is stored in a series of
 724     * gl_uniform_driver_storage structs for each subcomponent that
 725     * glGetUniformLocation() could name.  We know it's been set up in the same
 726     * order we'd walk the type, so walk the list of storage and find anything
 727     * with our name, or the prefix of a component that starts with our name.
 728     */
 729    for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
 730       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 731
 732       if (storage->builtin)
 733          continue;
 734
 735       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 736           (storage->name[namelen] != 0 &&
 737            storage->name[namelen] != '.' &&
 738            storage->name[namelen] != '[')) {
 739          continue;
 740       }
 741
 742       const unsigned vector_count = (MAX2(storage->array_elements, 1) *
 743                                      storage->type->matrix_columns);
 744       const unsigned vector_size = storage->type->vector_elements;
 745
 746       for (unsigned s = 0; s < vector_count; s++)
 747          setup_vector_uniform_values(&storage->storage[s * vector_size],
 748                                      vector_size);
 749    }
 750 }
 751
 752 void
 753 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 754 {
 755    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 756       assert(this->uniforms < uniform_array_size);
 757       this->uniform_vector_size[this->uniforms] = 4;
 758       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 759       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 760       for (int j = 0; j < 4; ++j) {
 761          stage_prog_data->param[this->uniforms * 4 + j] =
 762             (gl_constant_value *) &clip_planes[i][j];
 763       }
 764       ++this->uniforms;
 765    }
 766 }
 767
 768 /* Our support for builtin uniforms is even scarier than non-builtin.
 769  * It sits on top of the PROG_STATE_VAR parameters that are
 770  * automatically updated from GL context state.
 771  */
 772 void
 773 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 774 {
 775    const ir_state_slot *const slots = ir->get_state_slots();
 776    assert(slots != NULL);
 777
 778    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
 779       /* This state reference has already been setup by ir_to_mesa,
 780        * but we'll get the same index back here.  We can reference
 781        * ParameterValues directly, since unlike brw_fs.cpp, we never
 782        * add new state references during compile.
 783        */
 784       int index = _mesa_add_state_reference(this->prog->Parameters,
 785                                             (gl_state_index *)slots[i].tokens);
 786       gl_constant_value *values =
 787          &this->prog->Parameters->ParameterValues[index][0];
 788
 789       assert(this->uniforms < uniform_array_size);
 790
 791       for (unsigned j = 0; j < 4; j++)
 792          stage_prog_data->param[this->uniforms * 4 + j] =
 793             &values[GET_SWZ(slots[i].swizzle, j)];
 794
 795       this->uniform_vector_size[this->uniforms] =
 796          (ir->type->is_scalar() || ir->type->is_vector() ||
 797           ir->type->is_matrix() ? ir->type->vector_elements : 4);
 798
 799       this->uniforms++;
 800    }
 801 }
 802
 803 dst_reg *
 804 vec4_visitor::variable_storage(ir_variable *var)
 805 {
 806    return (dst_reg *)hash_table_find(this->variable_ht, var);
 807 }
 808
 809 void
 810 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
 811                                      enum brw_predicate *predicate)
 812 {
 813    ir_expression *expr = ir->as_expression();
 814
 815    *predicate = BRW_PREDICATE_NORMAL;
 816
 817    if (expr && expr->operation != ir_binop_ubo_load) {
 818       src_reg op[3];
 819       vec4_instruction *inst;
 820
 821       assert(expr->get_num_operands() <= 3);
 822       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 823          expr->operands[i]->accept(this);
 824          op[i] = this->result;
 825
 826          resolve_ud_negate(&op[i]);
 827       }
 828
 829       switch (expr->operation) {
 830       case ir_unop_logic_not:
 831          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 832          inst->conditional_mod = BRW_CONDITIONAL_Z;
 833          break;
 834
 835       case ir_binop_logic_xor:
 836          if (devinfo->gen <= 5) {
 837             src_reg temp = src_reg(this, ir->type);
 838             emit(XOR(dst_reg(temp), op[0], op[1]));
 839             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 840          } else {
 841             inst = emit(XOR(dst_null_d(), op[0], op[1]));
 842          }
 843          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 844          break;
 845
 846       case ir_binop_logic_or:
 847          if (devinfo->gen <= 5) {
 848             src_reg temp = src_reg(this, ir->type);
 849             emit(OR(dst_reg(temp), op[0], op[1]));
 850             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 851          } else {
 852             inst = emit(OR(dst_null_d(), op[0], op[1]));
 853          }
 854          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 855          break;
 856
 857       case ir_binop_logic_and:
 858          if (devinfo->gen <= 5) {
 859             src_reg temp = src_reg(this, ir->type);
 860             emit(AND(dst_reg(temp), op[0], op[1]));
 861             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 862          } else {
 863             inst = emit(AND(dst_null_d(), op[0], op[1]));
 864          }
 865          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 866          break;
 867
 868       case ir_unop_f2b:
 869          if (devinfo->gen >= 6) {
 870             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 871          } else {
 872             inst = emit(MOV(dst_null_f(), op[0]));
 873             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 874          }
 875          break;
 876
 877       case ir_unop_i2b:
 878          if (devinfo->gen >= 6) {
 879             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 880          } else {
 881             inst = emit(MOV(dst_null_d(), op[0]));
 882             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 883          }
 884          break;
 885
 886       case ir_binop_all_equal:
 887          if (devinfo->gen <= 5) {
 888             resolve_bool_comparison(expr->operands[0], &op[0]);
 889             resolve_bool_comparison(expr->operands[1], &op[1]);
 890          }
 891          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 892          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 893          break;
 894
 895       case ir_binop_any_nequal:
 896          if (devinfo->gen <= 5) {
 897             resolve_bool_comparison(expr->operands[0], &op[0]);
 898             resolve_bool_comparison(expr->operands[1], &op[1]);
 899          }
 900          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 901          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 902          break;
 903
 904       case ir_unop_any:
 905          if (devinfo->gen <= 5) {
 906             resolve_bool_comparison(expr->operands[0], &op[0]);
 907          }
 908          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 909          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 910          break;
 911
 912       case ir_binop_greater:
 913       case ir_binop_gequal:
 914       case ir_binop_less:
 915       case ir_binop_lequal:
 916       case ir_binop_equal:
 917       case ir_binop_nequal:
 918          if (devinfo->gen <= 5) {
 919             resolve_bool_comparison(expr->operands[0], &op[0]);
 920             resolve_bool_comparison(expr->operands[1], &op[1]);
 921          }
 922          emit(CMP(dst_null_d(), op[0], op[1],
 923                   brw_conditional_for_comparison(expr->operation)));
 924          break;
 925
 926       case ir_triop_csel: {
 927          /* Expand the boolean condition into the flag register. */
 928          inst = emit(MOV(dst_null_d(), op[0]));
 929          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 930
 931          /* Select which boolean to return. */
 932          dst_reg temp(this, expr->operands[1]->type);
 933          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
 934          inst->predicate = BRW_PREDICATE_NORMAL;
 935
 936          /* Expand the result to a condition code. */
 937          inst = emit(MOV(dst_null_d(), src_reg(temp)));
 938          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 939          break;
 940       }
 941
 942       default:
 943          unreachable("not reached");
 944       }
 945       return;
 946    }
 947
 948    ir->accept(this);
 949
 950    resolve_ud_negate(&this->result);
 951
 952    vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
 953    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 954 }
 955
 956 /**
 957  * Emit a gen6 IF statement with the comparison folded into the IF
 958  * instruction.
 959  */
 960 void
 961 vec4_visitor::emit_if_gen6(ir_if *ir)
 962 {
 963    ir_expression *expr = ir->condition->as_expression();
 964
 965    if (expr && expr->operation != ir_binop_ubo_load) {
 966       src_reg op[3];
 967       dst_reg temp;
 968
 969       assert(expr->get_num_operands() <= 3);
 970       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 971          expr->operands[i]->accept(this);
 972          op[i] = this->result;
 973       }
 974
 975       switch (expr->operation) {
 976       case ir_unop_logic_not:
 977          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 978          return;
 979
 980       case ir_binop_logic_xor:
 981          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 982          return;
 983
 984       case ir_binop_logic_or:
 985          temp = dst_reg(this, glsl_type::bool_type);
 986          emit(OR(temp, op[0], op[1]));
 987          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 988          return;
 989
 990       case ir_binop_logic_and:
 991          temp = dst_reg(this, glsl_type::bool_type);
 992          emit(AND(temp, op[0], op[1]));
 993          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 994          return;
 995
 996       case ir_unop_f2b:
 997          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 998          return;
 999
1000       case ir_unop_i2b:
1001          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1002          return;
1003
1004       case ir_binop_greater:
1005       case ir_binop_gequal:
1006       case ir_binop_less:
1007       case ir_binop_lequal:
1008       case ir_binop_equal:
1009       case ir_binop_nequal:
1010          emit(IF(op[0], op[1],
1011                  brw_conditional_for_comparison(expr->operation)));
1012          return;
1013
1014       case ir_binop_all_equal:
1015          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1016          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1017          return;
1018
1019       case ir_binop_any_nequal:
1020          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1021          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1022          return;
1023
1024       case ir_unop_any:
1025          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1026          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1027          return;
1028
1029       case ir_triop_csel: {
1030          /* Expand the boolean condition into the flag register. */
1031          vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1032          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1033
1034          /* Select which boolean to return. */
1035          dst_reg temp(this, expr->operands[1]->type);
1036          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1037          inst->predicate = BRW_PREDICATE_NORMAL;
1038
1039          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1040          return;
1041       }
1042
1043       default:
1044          unreachable("not reached");
1045       }
1046       return;
1047    }
1048
1049    ir->condition->accept(this);
1050
1051    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_variable *ir)
1056 {
1057    dst_reg *reg = NULL;
1058
1059    if (variable_storage(ir))
1060       return;
1061
1062    switch (ir->data.mode) {
1063    case ir_var_shader_in:
1064       assert(ir->data.location != -1);
1065       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1066       break;
1067
1068    case ir_var_shader_out:
1069       assert(ir->data.location != -1);
1070       reg = new(mem_ctx) dst_reg(this, ir->type);
1071
1072       for (int i = 0; i < type_size(ir->type); i++) {
1073          output_reg[ir->data.location + i] = *reg;
1074          output_reg[ir->data.location + i].reg_offset = i;
1075          output_reg_annotation[ir->data.location + i] = ir->name;
1076       }
1077       break;
1078
1079    case ir_var_auto:
1080    case ir_var_temporary:
1081       reg = new(mem_ctx) dst_reg(this, ir->type);
1082       break;
1083
1084    case ir_var_uniform:
1085       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1086
1087       /* Thanks to the lower_ubo_reference pass, we will see only
1088        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1089        * variables, so no need for them to be in variable_ht.
1090        *
1091        * Some uniforms, such as samplers and atomic counters, have no actual
1092        * storage, so we should ignore them.
1093        */
1094       if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1095          return;
1096
1097       /* Track how big the whole uniform variable is, in case we need to put a
1098        * copy of its data into pull constants for array access.
1099        */
1100       assert(this->uniforms < uniform_array_size);
1101       this->uniform_size[this->uniforms] = type_size(ir->type);
1102
1103       if (!strncmp(ir->name, "gl_", 3)) {
1104          setup_builtin_uniform_values(ir);
1105       } else {
1106          setup_uniform_values(ir);
1107       }
1108       break;
1109
1110    case ir_var_system_value:
1111       reg = make_reg_for_system_value(ir->data.location, ir->type);
1112       break;
1113
1114    default:
1115       unreachable("not reached");
1116    }
1117
1118    reg->type = brw_type_for_base_type(ir->type);
1119    hash_table_insert(this->variable_ht, reg, ir);
1120 }
1121
1122 void
1123 vec4_visitor::visit(ir_loop *ir)
1124 {
1125    /* We don't want debugging output to print the whole body of the
1126     * loop as the annotation.
1127     */
1128    this->base_ir = NULL;
1129
1130    emit(BRW_OPCODE_DO);
1131
1132    visit_instructions(&ir->body_instructions);
1133
1134    emit(BRW_OPCODE_WHILE);
1135 }
1136
1137 void
1138 vec4_visitor::visit(ir_loop_jump *ir)
1139 {
1140    switch (ir->mode) {
1141    case ir_loop_jump::jump_break:
1142       emit(BRW_OPCODE_BREAK);
1143       break;
1144    case ir_loop_jump::jump_continue:
1145       emit(BRW_OPCODE_CONTINUE);
1146       break;
1147    }
1148 }
1149
1150
1151 void
1152 vec4_visitor::visit(ir_function_signature *)
1153 {
1154    unreachable("not reached");
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_function *ir)
1159 {
1160    /* Ignore function bodies other than main() -- we shouldn't see calls to
1161     * them since they should all be inlined.
1162     */
1163    if (strcmp(ir->name, "main") == 0) {
1164       const ir_function_signature *sig;
1165       exec_list empty;
1166
1167       sig = ir->matching_signature(NULL, &empty, false);
1168
1169       assert(sig);
1170
1171       visit_instructions(&sig->body);
1172    }
1173 }
1174
1175 bool
1176 vec4_visitor::try_emit_mad(ir_expression *ir)
1177 {
1178    /* 3-src instructions were introduced in gen6. */
1179    if (devinfo->gen < 6)
1180       return false;
1181
1182    /* MAD can only handle floating-point data. */
1183    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1184       return false;
1185
1186    ir_rvalue *nonmul;
1187    ir_expression *mul;
1188    bool mul_negate, mul_abs;
1189
1190    for (int i = 0; i < 2; i++) {
1191       mul_negate = false;
1192       mul_abs = false;
1193
1194       mul = ir->operands[i]->as_expression();
1195       nonmul = ir->operands[1 - i];
1196
1197       if (mul && mul->operation == ir_unop_abs) {
1198          mul = mul->operands[0]->as_expression();
1199          mul_abs = true;
1200       } else if (mul && mul->operation == ir_unop_neg) {
1201          mul = mul->operands[0]->as_expression();
1202          mul_negate = true;
1203       }
1204
1205       if (mul && mul->operation == ir_binop_mul)
1206          break;
1207    }
1208
1209    if (!mul || mul->operation != ir_binop_mul)
1210       return false;
1211
1212    nonmul->accept(this);
1213    src_reg src0 = fix_3src_operand(this->result);
1214
1215    mul->operands[0]->accept(this);
1216    src_reg src1 = fix_3src_operand(this->result);
1217    src1.negate ^= mul_negate;
1218    src1.abs = mul_abs;
1219    if (mul_abs)
1220       src1.negate = false;
1221
1222    mul->operands[1]->accept(this);
1223    src_reg src2 = fix_3src_operand(this->result);
1224    src2.abs = mul_abs;
1225    if (mul_abs)
1226       src2.negate = false;
1227
1228    this->result = src_reg(this, ir->type);
1229    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1230
1231    return true;
1232 }
1233
1234 bool
1235 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1236 {
1237    /* This optimization relies on CMP setting the destination to 0 when
1238     * false.  Early hardware only sets the least significant bit, and
1239     * leaves the other bits undefined.  So we can't use it.
1240     */
1241    if (devinfo->gen < 6)
1242       return false;
1243
1244    ir_expression *const cmp = ir->operands[0]->as_expression();
1245
1246    if (cmp == NULL)
1247       return false;
1248
1249    switch (cmp->operation) {
1250    case ir_binop_less:
1251    case ir_binop_greater:
1252    case ir_binop_lequal:
1253    case ir_binop_gequal:
1254    case ir_binop_equal:
1255    case ir_binop_nequal:
1256       break;
1257
1258    default:
1259       return false;
1260    }
1261
1262    cmp->operands[0]->accept(this);
1263    const src_reg cmp_src0 = this->result;
1264
1265    cmp->operands[1]->accept(this);
1266    const src_reg cmp_src1 = this->result;
1267
1268    this->result = src_reg(this, ir->type);
1269
1270    emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1271             brw_conditional_for_comparison(cmp->operation)));
1272
1273    /* If the comparison is false, this->result will just happen to be zero.
1274     */
1275    vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1276                                        this->result, src_reg(1.0f));
1277    inst->predicate = BRW_PREDICATE_NORMAL;
1278    inst->predicate_inverse = true;
1279
1280    return true;
1281 }
1282
1283 vec4_instruction *
1284 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1285                           src_reg src0, src_reg src1)
1286 {
1287    vec4_instruction *inst;
1288
1289    if (devinfo->gen >= 6) {
1290       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1291       inst->conditional_mod = conditionalmod;
1292    } else {
1293       emit(CMP(dst, src0, src1, conditionalmod));
1294
1295       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1296       inst->predicate = BRW_PREDICATE_NORMAL;
1297    }
1298
1299    return inst;
1300 }
1301
1302 vec4_instruction *
1303 vec4_visitor::emit_lrp(const dst_reg &dst,
1304                        const src_reg &x, const src_reg &y, const src_reg &a)
1305 {
1306    if (devinfo->gen >= 6) {
1307       /* Note that the instruction's argument order is reversed from GLSL
1308        * and the IR.
1309        */
1310      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1311                      fix_3src_operand(x)));
1312    } else {
1313       /* Earlier generations don't support three source operations, so we
1314        * need to emit x*(1-a) + y*a.
1315        */
1316       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
1317       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
1318       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1319       y_times_a.writemask           = dst.writemask;
1320       one_minus_a.writemask         = dst.writemask;
1321       x_times_one_minus_a.writemask = dst.writemask;
1322
1323       emit(MUL(y_times_a, y, a));
1324       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1325       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1326       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1327    }
1328 }
1329
1330 /**
1331  * Emits the instructions needed to perform a pull constant load. before_block
1332  * and before_inst can be NULL in which case the instruction will be appended
1333  * to the end of the instruction list.
1334  */
1335 void
1336 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1337                                           src_reg surf_index,
1338                                           src_reg offset_reg,
1339                                           bblock_t *before_block,
1340                                           vec4_instruction *before_inst)
1341 {
1342    assert((before_inst == NULL && before_block == NULL) ||
1343           (before_inst && before_block));
1344
1345    vec4_instruction *pull;
1346
1347    if (devinfo->gen >= 9) {
1348       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1349       src_reg header(this, glsl_type::uvec4_type, 2);
1350
1351       pull = new(mem_ctx)
1352          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1353                           dst_reg(header));
1354
1355       if (before_inst)
1356          emit_before(before_block, before_inst, pull);
1357       else
1358          emit(pull);
1359
1360       dst_reg index_reg = retype(offset(dst_reg(header), 1),
1361                                  offset_reg.type);
1362       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1363
1364       if (before_inst)
1365          emit_before(before_block, before_inst, pull);
1366       else
1367          emit(pull);
1368
1369       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1370                                            dst,
1371                                            surf_index,
1372                                            header);
1373       pull->mlen = 2;
1374       pull->header_size = 1;
1375    } else if (devinfo->gen >= 7) {
1376       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1377
1378       grf_offset.type = offset_reg.type;
1379
1380       pull = MOV(grf_offset, offset_reg);
1381
1382       if (before_inst)
1383          emit_before(before_block, before_inst, pull);
1384       else
1385          emit(pull);
1386
1387       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1388                                            dst,
1389                                            surf_index,
1390                                            src_reg(grf_offset));
1391       pull->mlen = 1;
1392    } else {
1393       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1394                                            dst,
1395                                            surf_index,
1396                                            offset_reg);
1397       pull->base_mrf = 14;
1398       pull->mlen = 1;
1399    }
1400
1401    if (before_inst)
1402       emit_before(before_block, before_inst, pull);
1403    else
1404       emit(pull);
1405 }
1406
1407 src_reg
1408 vec4_visitor::emit_uniformize(const src_reg &src)
1409 {
1410    const src_reg chan_index(this, glsl_type::uint_type);
1411    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1412                               src.type);
1413
1414    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1415       ->force_writemask_all = true;
1416    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1417       ->force_writemask_all = true;
1418
1419    return src_reg(dst);
1420 }
1421
1422 void
1423 vec4_visitor::visit(ir_expression *ir)
1424 {
1425    unsigned int operand;
1426    src_reg op[ARRAY_SIZE(ir->operands)];
1427    vec4_instruction *inst;
1428
1429    if (ir->operation == ir_binop_add) {
1430       if (try_emit_mad(ir))
1431          return;
1432    }
1433
1434    if (ir->operation == ir_unop_b2f) {
1435       if (try_emit_b2f_of_compare(ir))
1436          return;
1437    }
1438
1439    /* Storage for our result.  Ideally for an assignment we'd be using
1440     * the actual storage for the result here, instead.
1441     */
1442    dst_reg result_dst(this, ir->type);
1443    src_reg result_src(result_dst);
1444
1445    if (ir->operation == ir_triop_csel) {
1446       ir->operands[1]->accept(this);
1447       op[1] = this->result;
1448       ir->operands[2]->accept(this);
1449       op[2] = this->result;
1450
1451       enum brw_predicate predicate;
1452       emit_bool_to_cond_code(ir->operands[0], &predicate);
1453       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1454       inst->predicate = predicate;
1455       this->result = result_src;
1456       return;
1457    }
1458
1459    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1460       this->result.file = BAD_FILE;
1461       ir->operands[operand]->accept(this);
1462       if (this->result.file == BAD_FILE) {
1463          fprintf(stderr, "Failed to get tree for expression operand:\n");
1464          ir->operands[operand]->fprint(stderr);
1465          exit(1);
1466       }
1467       op[operand] = this->result;
1468
1469       /* Matrix expression operands should have been broken down to vector
1470        * operations already.
1471        */
1472       assert(!ir->operands[operand]->type->is_matrix());
1473    }
1474
1475    /* If nothing special happens, this is the result. */
1476    this->result = result_src;
1477
1478    switch (ir->operation) {
1479    case ir_unop_logic_not:
1480       emit(NOT(result_dst, op[0]));
1481       break;
1482    case ir_unop_neg:
1483       op[0].negate = !op[0].negate;
1484       emit(MOV(result_dst, op[0]));
1485       break;
1486    case ir_unop_abs:
1487       op[0].abs = true;
1488       op[0].negate = false;
1489       emit(MOV(result_dst, op[0]));
1490       break;
1491
1492    case ir_unop_sign:
1493       if (ir->type->is_float()) {
1494          /* AND(val, 0x80000000) gives the sign bit.
1495           *
1496           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1497           * zero.
1498           */
1499          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1500
1501          op[0].type = BRW_REGISTER_TYPE_UD;
1502          result_dst.type = BRW_REGISTER_TYPE_UD;
1503          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1504
1505          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1506          inst->predicate = BRW_PREDICATE_NORMAL;
1507
1508          this->result.type = BRW_REGISTER_TYPE_F;
1509       } else {
1510          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1511           *               -> non-negative val generates 0x00000000.
1512           *  Predicated OR sets 1 if val is positive.
1513           */
1514          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1515
1516          emit(ASR(result_dst, op[0], src_reg(31)));
1517
1518          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1519          inst->predicate = BRW_PREDICATE_NORMAL;
1520       }
1521       break;
1522
1523    case ir_unop_rcp:
1524       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1525       break;
1526
1527    case ir_unop_exp2:
1528       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1529       break;
1530    case ir_unop_log2:
1531       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1532       break;
1533    case ir_unop_exp:
1534    case ir_unop_log:
1535       unreachable("not reached: should be handled by ir_explog_to_explog2");
1536    case ir_unop_sin:
1537       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1538       break;
1539    case ir_unop_cos:
1540       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1541       break;
1542
1543    case ir_unop_dFdx:
1544    case ir_unop_dFdx_coarse:
1545    case ir_unop_dFdx_fine:
1546    case ir_unop_dFdy:
1547    case ir_unop_dFdy_coarse:
1548    case ir_unop_dFdy_fine:
1549       unreachable("derivatives not valid in vertex shader");
1550
1551    case ir_unop_bitfield_reverse:
1552       emit(BFREV(result_dst, op[0]));
1553       break;
1554    case ir_unop_bit_count:
1555       emit(CBIT(result_dst, op[0]));
1556       break;
1557    case ir_unop_find_msb: {
1558       src_reg temp = src_reg(this, glsl_type::uint_type);
1559
1560       inst = emit(FBH(dst_reg(temp), op[0]));
1561       inst->dst.writemask = WRITEMASK_XYZW;
1562
1563       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1564        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1565        * subtract the result from 31 to convert the MSB count into an LSB count.
1566        */
1567
1568       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1569       temp.swizzle = BRW_SWIZZLE_NOOP;
1570       emit(MOV(result_dst, temp));
1571
1572       src_reg src_tmp = src_reg(result_dst);
1573       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1574
1575       src_tmp.negate = true;
1576       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1577       inst->predicate = BRW_PREDICATE_NORMAL;
1578       break;
1579    }
1580    case ir_unop_find_lsb:
1581       emit(FBL(result_dst, op[0]));
1582       break;
1583    case ir_unop_saturate:
1584       inst = emit(MOV(result_dst, op[0]));
1585       inst->saturate = true;
1586       break;
1587
1588    case ir_unop_noise:
1589       unreachable("not reached: should be handled by lower_noise");
1590
1591    case ir_unop_subroutine_to_int:
1592       emit(MOV(result_dst, op[0]));
1593       break;
1594
1595    case ir_binop_add:
1596       emit(ADD(result_dst, op[0], op[1]));
1597       break;
1598    case ir_binop_sub:
1599       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1600
1601    case ir_binop_mul:
1602       if (devinfo->gen < 8 && ir->type->is_integer()) {
1603          /* For integer multiplication, the MUL uses the low 16 bits of one of
1604           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1605           * accumulates in the contribution of the upper 16 bits of that
1606           * operand.  If we can determine that one of the args is in the low
1607           * 16 bits, though, we can just emit a single MUL.
1608           */
1609          if (ir->operands[0]->is_uint16_constant()) {
1610             if (devinfo->gen < 7)
1611                emit(MUL(result_dst, op[0], op[1]));
1612             else
1613                emit(MUL(result_dst, op[1], op[0]));
1614          } else if (ir->operands[1]->is_uint16_constant()) {
1615             if (devinfo->gen < 7)
1616                emit(MUL(result_dst, op[1], op[0]));
1617             else
1618                emit(MUL(result_dst, op[0], op[1]));
1619          } else {
1620             struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1621
1622             emit(MUL(acc, op[0], op[1]));
1623             emit(MACH(dst_null_d(), op[0], op[1]));
1624             emit(MOV(result_dst, src_reg(acc)));
1625          }
1626       } else {
1627          emit(MUL(result_dst, op[0], op[1]));
1628       }
1629       break;
1630    case ir_binop_imul_high: {
1631       struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1632
1633       emit(MUL(acc, op[0], op[1]));
1634       emit(MACH(result_dst, op[0], op[1]));
1635       break;
1636    }
1637    case ir_binop_div:
1638       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1639       assert(ir->type->is_integer());
1640       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1641       break;
1642
1643    case ir_binop_carry:
1644       unreachable("Should have been lowered by carry_to_arith().");
1645
1646    case ir_binop_borrow:
1647       unreachable("Should have been lowered by borrow_to_arith().");
1648
1649    case ir_binop_mod:
1650       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1651       assert(ir->type->is_integer());
1652       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1653       break;
1654
1655    case ir_binop_less:
1656    case ir_binop_greater:
1657    case ir_binop_lequal:
1658    case ir_binop_gequal:
1659    case ir_binop_equal:
1660    case ir_binop_nequal: {
1661       if (devinfo->gen <= 5) {
1662          resolve_bool_comparison(ir->operands[0], &op[0]);
1663          resolve_bool_comparison(ir->operands[1], &op[1]);
1664       }
1665       emit(CMP(result_dst, op[0], op[1],
1666                brw_conditional_for_comparison(ir->operation)));
1667       break;
1668    }
1669
1670    case ir_binop_all_equal:
1671       if (devinfo->gen <= 5) {
1672          resolve_bool_comparison(ir->operands[0], &op[0]);
1673          resolve_bool_comparison(ir->operands[1], &op[1]);
1674       }
1675
1676       /* "==" operator producing a scalar boolean. */
1677       if (ir->operands[0]->type->is_vector() ||
1678           ir->operands[1]->type->is_vector()) {
1679          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1680          emit(MOV(result_dst, src_reg(0)));
1681          inst = emit(MOV(result_dst, src_reg(~0)));
1682          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1683       } else {
1684          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1685       }
1686       break;
1687    case ir_binop_any_nequal:
1688       if (devinfo->gen <= 5) {
1689          resolve_bool_comparison(ir->operands[0], &op[0]);
1690          resolve_bool_comparison(ir->operands[1], &op[1]);
1691       }
1692
1693       /* "!=" operator producing a scalar boolean. */
1694       if (ir->operands[0]->type->is_vector() ||
1695           ir->operands[1]->type->is_vector()) {
1696          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1697
1698          emit(MOV(result_dst, src_reg(0)));
1699          inst = emit(MOV(result_dst, src_reg(~0)));
1700          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701       } else {
1702          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1703       }
1704       break;
1705
1706    case ir_unop_any:
1707       if (devinfo->gen <= 5) {
1708          resolve_bool_comparison(ir->operands[0], &op[0]);
1709       }
1710       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1711       emit(MOV(result_dst, src_reg(0)));
1712
1713       inst = emit(MOV(result_dst, src_reg(~0)));
1714       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1715       break;
1716
1717    case ir_binop_logic_xor:
1718       emit(XOR(result_dst, op[0], op[1]));
1719       break;
1720
1721    case ir_binop_logic_or:
1722       emit(OR(result_dst, op[0], op[1]));
1723       break;
1724
1725    case ir_binop_logic_and:
1726       emit(AND(result_dst, op[0], op[1]));
1727       break;
1728
1729    case ir_binop_dot:
1730       assert(ir->operands[0]->type->is_vector());
1731       assert(ir->operands[0]->type == ir->operands[1]->type);
1732       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1733       break;
1734
1735    case ir_unop_sqrt:
1736       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1737       break;
1738    case ir_unop_rsq:
1739       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1740       break;
1741
1742    case ir_unop_bitcast_i2f:
1743    case ir_unop_bitcast_u2f:
1744       this->result = op[0];
1745       this->result.type = BRW_REGISTER_TYPE_F;
1746       break;
1747
1748    case ir_unop_bitcast_f2i:
1749       this->result = op[0];
1750       this->result.type = BRW_REGISTER_TYPE_D;
1751       break;
1752
1753    case ir_unop_bitcast_f2u:
1754       this->result = op[0];
1755       this->result.type = BRW_REGISTER_TYPE_UD;
1756       break;
1757
1758    case ir_unop_i2f:
1759    case ir_unop_i2u:
1760    case ir_unop_u2i:
1761    case ir_unop_u2f:
1762    case ir_unop_f2i:
1763    case ir_unop_f2u:
1764       emit(MOV(result_dst, op[0]));
1765       break;
1766    case ir_unop_b2i:
1767    case ir_unop_b2f:
1768       if (devinfo->gen <= 5) {
1769          resolve_bool_comparison(ir->operands[0], &op[0]);
1770       }
1771       emit(MOV(result_dst, negate(op[0])));
1772       break;
1773    case ir_unop_f2b:
1774       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1775       break;
1776    case ir_unop_i2b:
1777       emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1778       break;
1779
1780    case ir_unop_trunc:
1781       emit(RNDZ(result_dst, op[0]));
1782       break;
1783    case ir_unop_ceil: {
1784          src_reg tmp = src_reg(this, ir->type);
1785          op[0].negate = !op[0].negate;
1786          emit(RNDD(dst_reg(tmp), op[0]));
1787          tmp.negate = true;
1788          emit(MOV(result_dst, tmp));
1789       }
1790       break;
1791    case ir_unop_floor:
1792       inst = emit(RNDD(result_dst, op[0]));
1793       break;
1794    case ir_unop_fract:
1795       inst = emit(FRC(result_dst, op[0]));
1796       break;
1797    case ir_unop_round_even:
1798       emit(RNDE(result_dst, op[0]));
1799       break;
1800
1801    case ir_binop_min:
1802       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1803       break;
1804    case ir_binop_max:
1805       emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1806       break;
1807
1808    case ir_binop_pow:
1809       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1810       break;
1811
1812    case ir_unop_bit_not:
1813       inst = emit(NOT(result_dst, op[0]));
1814       break;
1815    case ir_binop_bit_and:
1816       inst = emit(AND(result_dst, op[0], op[1]));
1817       break;
1818    case ir_binop_bit_xor:
1819       inst = emit(XOR(result_dst, op[0], op[1]));
1820       break;
1821    case ir_binop_bit_or:
1822       inst = emit(OR(result_dst, op[0], op[1]));
1823       break;
1824
1825    case ir_binop_lshift:
1826       inst = emit(SHL(result_dst, op[0], op[1]));
1827       break;
1828
1829    case ir_binop_rshift:
1830       if (ir->type->base_type == GLSL_TYPE_INT)
1831          inst = emit(ASR(result_dst, op[0], op[1]));
1832       else
1833          inst = emit(SHR(result_dst, op[0], op[1]));
1834       break;
1835
1836    case ir_binop_bfm:
1837       emit(BFI1(result_dst, op[0], op[1]));
1838       break;
1839
1840    case ir_binop_ubo_load: {
1841       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1842       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1843       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1844       src_reg offset;
1845
1846       /* Now, load the vector from that offset. */
1847       assert(ir->type->is_vector() || ir->type->is_scalar());
1848
1849       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1850       packed_consts.type = result.type;
1851       src_reg surf_index;
1852
1853       if (const_uniform_block) {
1854          /* The block index is a constant, so just emit the binding table entry
1855           * as an immediate.
1856           */
1857          surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1858                               const_uniform_block->value.u[0]);
1859       } else {
1860          /* The block index is not a constant. Evaluate the index expression
1861           * per-channel and add the base UBO index; we have to select a value
1862           * from any live channel.
1863           */
1864          surf_index = src_reg(this, glsl_type::uint_type);
1865          emit(ADD(dst_reg(surf_index), op[0],
1866                   src_reg(prog_data->base.binding_table.ubo_start)));
1867          surf_index = emit_uniformize(surf_index);
1868
1869          /* Assume this may touch any UBO. It would be nice to provide
1870           * a tighter bound, but the array information is already lowered away.
1871           */
1872          brw_mark_surface_used(&prog_data->base,
1873                                prog_data->base.binding_table.ubo_start +
1874                                shader_prog->NumUniformBlocks - 1);
1875       }
1876
1877       if (const_offset_ir) {
1878          if (devinfo->gen >= 8) {
1879             /* Store the offset in a GRF so we can send-from-GRF. */
1880             offset = src_reg(this, glsl_type::int_type);
1881             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1882          } else {
1883             /* Immediates are fine on older generations since they'll be moved
1884              * to a (potentially fake) MRF at the generator level.
1885              */
1886             offset = src_reg(const_offset / 16);
1887          }
1888       } else {
1889          offset = src_reg(this, glsl_type::uint_type);
1890          emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1891       }
1892
1893       emit_pull_constant_load_reg(dst_reg(packed_consts),
1894                                   surf_index,
1895                                   offset,
1896                                   NULL, NULL /* before_block/inst */);
1897
1898       packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1899       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1900                                             const_offset % 16 / 4,
1901                                             const_offset % 16 / 4,
1902                                             const_offset % 16 / 4);
1903
1904       /* UBO bools are any nonzero int.  We need to convert them to 0/~0. */
1905       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1906          emit(CMP(result_dst, packed_consts, src_reg(0u),
1907                   BRW_CONDITIONAL_NZ));
1908       } else {
1909          emit(MOV(result_dst, packed_consts));
1910       }
1911       break;
1912    }
1913
1914    case ir_binop_vector_extract:
1915       unreachable("should have been lowered by vec_index_to_cond_assign");
1916
1917    case ir_triop_fma:
1918       op[0] = fix_3src_operand(op[0]);
1919       op[1] = fix_3src_operand(op[1]);
1920       op[2] = fix_3src_operand(op[2]);
1921       /* Note that the instruction's argument order is reversed from GLSL
1922        * and the IR.
1923        */
1924       emit(MAD(result_dst, op[2], op[1], op[0]));
1925       break;
1926
1927    case ir_triop_lrp:
1928       emit_lrp(result_dst, op[0], op[1], op[2]);
1929       break;
1930
1931    case ir_triop_csel:
1932       unreachable("already handled above");
1933       break;
1934
1935    case ir_triop_bfi:
1936       op[0] = fix_3src_operand(op[0]);
1937       op[1] = fix_3src_operand(op[1]);
1938       op[2] = fix_3src_operand(op[2]);
1939       emit(BFI2(result_dst, op[0], op[1], op[2]));
1940       break;
1941
1942    case ir_triop_bitfield_extract:
1943       op[0] = fix_3src_operand(op[0]);
1944       op[1] = fix_3src_operand(op[1]);
1945       op[2] = fix_3src_operand(op[2]);
1946       /* Note that the instruction's argument order is reversed from GLSL
1947        * and the IR.
1948        */
1949       emit(BFE(result_dst, op[2], op[1], op[0]));
1950       break;
1951
1952    case ir_triop_vector_insert:
1953       unreachable("should have been lowered by lower_vector_insert");
1954
1955    case ir_quadop_bitfield_insert:
1956       unreachable("not reached: should be handled by "
1957               "bitfield_insert_to_bfm_bfi\n");
1958
1959    case ir_quadop_vector:
1960       unreachable("not reached: should be handled by lower_quadop_vector");
1961
1962    case ir_unop_pack_half_2x16:
1963       emit_pack_half_2x16(result_dst, op[0]);
1964       break;
1965    case ir_unop_unpack_half_2x16:
1966       emit_unpack_half_2x16(result_dst, op[0]);
1967       break;
1968    case ir_unop_unpack_unorm_4x8:
1969       emit_unpack_unorm_4x8(result_dst, op[0]);
1970       break;
1971    case ir_unop_unpack_snorm_4x8:
1972       emit_unpack_snorm_4x8(result_dst, op[0]);
1973       break;
1974    case ir_unop_pack_unorm_4x8:
1975       emit_pack_unorm_4x8(result_dst, op[0]);
1976       break;
1977    case ir_unop_pack_snorm_4x8:
1978       emit_pack_snorm_4x8(result_dst, op[0]);
1979       break;
1980    case ir_unop_pack_snorm_2x16:
1981    case ir_unop_pack_unorm_2x16:
1982    case ir_unop_unpack_snorm_2x16:
1983    case ir_unop_unpack_unorm_2x16:
1984       unreachable("not reached: should be handled by lower_packing_builtins");
1985    case ir_unop_unpack_half_2x16_split_x:
1986    case ir_unop_unpack_half_2x16_split_y:
1987    case ir_binop_pack_half_2x16_split:
1988    case ir_unop_interpolate_at_centroid:
1989    case ir_binop_interpolate_at_sample:
1990    case ir_binop_interpolate_at_offset:
1991       unreachable("not reached: should not occur in vertex shader");
1992    case ir_binop_ldexp:
1993       unreachable("not reached: should be handled by ldexp_to_arith()");
1994    case ir_unop_d2f:
1995    case ir_unop_f2d:
1996    case ir_unop_d2i:
1997    case ir_unop_i2d:
1998    case ir_unop_d2u:
1999    case ir_unop_u2d:
2000    case ir_unop_d2b:
2001    case ir_unop_pack_double_2x32:
2002    case ir_unop_unpack_double_2x32:
2003    case ir_unop_frexp_sig:
2004    case ir_unop_frexp_exp:
2005       unreachable("fp64 todo");
2006    }
2007 }
2008
2009
2010 void
2011 vec4_visitor::visit(ir_swizzle *ir)
2012 {
2013    /* Note that this is only swizzles in expressions, not those on the left
2014     * hand side of an assignment, which do write masking.  See ir_assignment
2015     * for that.
2016     */
2017    const unsigned swz = brw_compose_swizzle(
2018       brw_swizzle_for_size(ir->type->vector_elements),
2019       BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2020
2021    ir->val->accept(this);
2022    this->result = swizzle(this->result, swz);
2023 }
2024
2025 void
2026 vec4_visitor::visit(ir_dereference_variable *ir)
2027 {
2028    const struct glsl_type *type = ir->type;
2029    dst_reg *reg = variable_storage(ir->var);
2030
2031    if (!reg) {
2032       fail("Failed to find variable storage for %s\n", ir->var->name);
2033       this->result = src_reg(brw_null_reg());
2034       return;
2035    }
2036
2037    this->result = src_reg(*reg);
2038
2039    /* System values get their swizzle from the dst_reg writemask */
2040    if (ir->var->data.mode == ir_var_system_value)
2041       return;
2042
2043    if (type->is_scalar() || type->is_vector() || type->is_matrix())
2044       this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2045 }
2046
2047
2048 int
2049 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2050 {
2051    /* Under normal circumstances array elements are stored consecutively, so
2052     * the stride is equal to the size of the array element.
2053     */
2054    return type_size(ir->type);
2055 }
2056
2057
2058 void
2059 vec4_visitor::visit(ir_dereference_array *ir)
2060 {
2061    ir_constant *constant_index;
2062    src_reg src;
2063    int array_stride = compute_array_stride(ir);
2064
2065    constant_index = ir->array_index->constant_expression_value();
2066
2067    ir->array->accept(this);
2068    src = this->result;
2069
2070    if (constant_index) {
2071       src.reg_offset += constant_index->value.i[0] * array_stride;
2072    } else {
2073       /* Variable index array dereference.  It eats the "vec4" of the
2074        * base of the array and an index that offsets the Mesa register
2075        * index.
2076        */
2077       ir->array_index->accept(this);
2078
2079       src_reg index_reg;
2080
2081       if (array_stride == 1) {
2082          index_reg = this->result;
2083       } else {
2084          index_reg = src_reg(this, glsl_type::int_type);
2085
2086          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2087       }
2088
2089       if (src.reladdr) {
2090          src_reg temp = src_reg(this, glsl_type::int_type);
2091
2092          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2093
2094          index_reg = temp;
2095       }
2096
2097       src.reladdr = ralloc(mem_ctx, src_reg);
2098       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2099    }
2100
2101    /* If the type is smaller than a vec4, replicate the last channel out. */
2102    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2103       src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2104    else
2105       src.swizzle = BRW_SWIZZLE_NOOP;
2106    src.type = brw_type_for_base_type(ir->type);
2107
2108    this->result = src;
2109 }
2110
2111 void
2112 vec4_visitor::visit(ir_dereference_record *ir)
2113 {
2114    unsigned int i;
2115    const glsl_type *struct_type = ir->record->type;
2116    int offset = 0;
2117
2118    ir->record->accept(this);
2119
2120    for (i = 0; i < struct_type->length; i++) {
2121       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2122          break;
2123       offset += type_size(struct_type->fields.structure[i].type);
2124    }
2125
2126    /* If the type is smaller than a vec4, replicate the last channel out. */
2127    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2128       this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2129    else
2130       this->result.swizzle = BRW_SWIZZLE_NOOP;
2131    this->result.type = brw_type_for_base_type(ir->type);
2132
2133    this->result.reg_offset += offset;
2134 }
2135
2136 /**
2137  * We want to be careful in assignment setup to hit the actual storage
2138  * instead of potentially using a temporary like we might with the
2139  * ir_dereference handler.
2140  */
2141 static dst_reg
2142 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2143 {
2144    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2145     * access of a vector, it must be separated into a series conditional moves
2146     * before reaching this point (see ir_vec_index_to_cond_assign).
2147     */
2148    assert(ir->as_dereference());
2149    ir_dereference_array *deref_array = ir->as_dereference_array();
2150    if (deref_array) {
2151       assert(!deref_array->array->type->is_vector());
2152    }
2153
2154    /* Use the rvalue deref handler for the most part.  We'll ignore
2155     * swizzles in it and write swizzles using writemask, though.
2156     */
2157    ir->accept(v);
2158    return dst_reg(v->result);
2159 }
2160
2161 void
2162 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2163                               const struct glsl_type *type,
2164                               enum brw_predicate predicate)
2165 {
2166    if (type->base_type == GLSL_TYPE_STRUCT) {
2167       for (unsigned int i = 0; i < type->length; i++) {
2168          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2169       }
2170       return;
2171    }
2172
2173    if (type->is_array()) {
2174       for (unsigned int i = 0; i < type->length; i++) {
2175          emit_block_move(dst, src, type->fields.array, predicate);
2176       }
2177       return;
2178    }
2179
2180    if (type->is_matrix()) {
2181       const struct glsl_type *vec_type;
2182
2183       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2184                                          type->vector_elements, 1);
2185
2186       for (int i = 0; i < type->matrix_columns; i++) {
2187          emit_block_move(dst, src, vec_type, predicate);
2188       }
2189       return;
2190    }
2191
2192    assert(type->is_scalar() || type->is_vector());
2193
2194    dst->type = brw_type_for_base_type(type);
2195    src->type = dst->type;
2196
2197    dst->writemask = (1 << type->vector_elements) - 1;
2198
2199    src->swizzle = brw_swizzle_for_size(type->vector_elements);
2200
2201    vec4_instruction *inst = emit(MOV(*dst, *src));
2202    inst->predicate = predicate;
2203
2204    dst->reg_offset++;
2205    src->reg_offset++;
2206 }
2207
2208
2209 /* If the RHS processing resulted in an instruction generating a
2210  * temporary value, and it would be easy to rewrite the instruction to
2211  * generate its result right into the LHS instead, do so.  This ends
2212  * up reliably removing instructions where it can be tricky to do so
2213  * later without real UD chain information.
2214  */
2215 bool
2216 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2217                                      dst_reg dst,
2218                                      src_reg src,
2219                                      vec4_instruction *pre_rhs_inst,
2220                                      vec4_instruction *last_rhs_inst)
2221 {
2222    /* This could be supported, but it would take more smarts. */
2223    if (ir->condition)
2224       return false;
2225
2226    if (pre_rhs_inst == last_rhs_inst)
2227       return false; /* No instructions generated to work with. */
2228
2229    /* Make sure the last instruction generated our source reg. */
2230    if (src.file != GRF ||
2231        src.file != last_rhs_inst->dst.file ||
2232        src.reg != last_rhs_inst->dst.reg ||
2233        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2234        src.reladdr ||
2235        src.abs ||
2236        src.negate ||
2237        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2238       return false;
2239
2240    /* Check that that last instruction fully initialized the channels
2241     * we want to use, in the order we want to use them.  We could
2242     * potentially reswizzle the operands of many instructions so that
2243     * we could handle out of order channels, but don't yet.
2244     */
2245
2246    for (unsigned i = 0; i < 4; i++) {
2247       if (dst.writemask & (1 << i)) {
2248          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2249             return false;
2250
2251          if (BRW_GET_SWZ(src.swizzle, i) != i)
2252             return false;
2253       }
2254    }
2255
2256    /* Success!  Rewrite the instruction. */
2257    last_rhs_inst->dst.file = dst.file;
2258    last_rhs_inst->dst.reg = dst.reg;
2259    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2260    last_rhs_inst->dst.reladdr = dst.reladdr;
2261    last_rhs_inst->dst.writemask &= dst.writemask;
2262
2263    return true;
2264 }
2265
2266 void
2267 vec4_visitor::visit(ir_assignment *ir)
2268 {
2269    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2270    enum brw_predicate predicate = BRW_PREDICATE_NONE;
2271
2272    if (!ir->lhs->type->is_scalar() &&
2273        !ir->lhs->type->is_vector()) {
2274       ir->rhs->accept(this);
2275       src_reg src = this->result;
2276
2277       if (ir->condition) {
2278          emit_bool_to_cond_code(ir->condition, &predicate);
2279       }
2280
2281       /* emit_block_move doesn't account for swizzles in the source register.
2282        * This should be ok, since the source register is a structure or an
2283        * array, and those can't be swizzled.  But double-check to be sure.
2284        */
2285       assert(src.swizzle ==
2286              (ir->rhs->type->is_matrix()
2287               ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2288               : BRW_SWIZZLE_NOOP));
2289
2290       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2291       return;
2292    }
2293
2294    /* Now we're down to just a scalar/vector with writemasks. */
2295    int i;
2296
2297    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2298    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2299
2300    ir->rhs->accept(this);
2301
2302    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2303
2304    int swizzles[4];
2305    int src_chan = 0;
2306
2307    assert(ir->lhs->type->is_vector() ||
2308           ir->lhs->type->is_scalar());
2309    dst.writemask = ir->write_mask;
2310
2311    /* Swizzle a small RHS vector into the channels being written.
2312     *
2313     * glsl ir treats write_mask as dictating how many channels are
2314     * present on the RHS while in our instructions we need to make
2315     * those channels appear in the slots of the vec4 they're written to.
2316     */
2317    for (int i = 0; i < 4; i++)
2318       swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2319
2320    src_reg src = swizzle(this->result,
2321                          BRW_SWIZZLE4(swizzles[0], swizzles[1],
2322                                       swizzles[2], swizzles[3]));
2323
2324    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2325       return;
2326    }
2327
2328    if (ir->condition) {
2329       emit_bool_to_cond_code(ir->condition, &predicate);
2330    }
2331
2332    for (i = 0; i < type_size(ir->lhs->type); i++) {
2333       vec4_instruction *inst = emit(MOV(dst, src));
2334       inst->predicate = predicate;
2335
2336       dst.reg_offset++;
2337       src.reg_offset++;
2338    }
2339 }
2340
2341 void
2342 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2343 {
2344    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2345       foreach_in_list(ir_constant, field_value, &ir->components) {
2346          emit_constant_values(dst, field_value);
2347       }
2348       return;
2349    }
2350
2351    if (ir->type->is_array()) {
2352       for (unsigned int i = 0; i < ir->type->length; i++) {
2353          emit_constant_values(dst, ir->array_elements[i]);
2354       }
2355       return;
2356    }
2357
2358    if (ir->type->is_matrix()) {
2359       for (int i = 0; i < ir->type->matrix_columns; i++) {
2360          float *vec = &ir->value.f[i * ir->type->vector_elements];
2361
2362          for (int j = 0; j < ir->type->vector_elements; j++) {
2363             dst->writemask = 1 << j;
2364             dst->type = BRW_REGISTER_TYPE_F;
2365
2366             emit(MOV(*dst, src_reg(vec[j])));
2367          }
2368          dst->reg_offset++;
2369       }
2370       return;
2371    }
2372
2373    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2374
2375    for (int i = 0; i < ir->type->vector_elements; i++) {
2376       if (!(remaining_writemask & (1 << i)))
2377          continue;
2378
2379       dst->writemask = 1 << i;
2380       dst->type = brw_type_for_base_type(ir->type);
2381
2382       /* Find other components that match the one we're about to
2383        * write.  Emits fewer instructions for things like vec4(0.5,
2384        * 1.5, 1.5, 1.5).
2385        */
2386       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2387          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2388             if (ir->value.b[i] == ir->value.b[j])
2389                dst->writemask |= (1 << j);
2390          } else {
2391             /* u, i, and f storage all line up, so no need for a
2392              * switch case for comparing each type.
2393              */
2394             if (ir->value.u[i] == ir->value.u[j])
2395                dst->writemask |= (1 << j);
2396          }
2397       }
2398
2399       switch (ir->type->base_type) {
2400       case GLSL_TYPE_FLOAT:
2401          emit(MOV(*dst, src_reg(ir->value.f[i])));
2402          break;
2403       case GLSL_TYPE_INT:
2404          emit(MOV(*dst, src_reg(ir->value.i[i])));
2405          break;
2406       case GLSL_TYPE_UINT:
2407          emit(MOV(*dst, src_reg(ir->value.u[i])));
2408          break;
2409       case GLSL_TYPE_BOOL:
2410          emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2411          break;
2412       default:
2413          unreachable("Non-float/uint/int/bool constant");
2414       }
2415
2416       remaining_writemask &= ~dst->writemask;
2417    }
2418    dst->reg_offset++;
2419 }
2420
2421 void
2422 vec4_visitor::visit(ir_constant *ir)
2423 {
2424    dst_reg dst = dst_reg(this, ir->type);
2425    this->result = src_reg(dst);
2426
2427    emit_constant_values(&dst, ir);
2428 }
2429
2430 void
2431 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2432 {
2433    ir_dereference *deref = static_cast<ir_dereference *>(
2434       ir->actual_parameters.get_head());
2435    ir_variable *location = deref->variable_referenced();
2436    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2437                           location->data.binding);
2438
2439    /* Calculate the surface offset */
2440    src_reg offset(this, glsl_type::uint_type);
2441    ir_dereference_array *deref_array = deref->as_dereference_array();
2442    if (deref_array) {
2443       deref_array->array_index->accept(this);
2444
2445       src_reg tmp(this, glsl_type::uint_type);
2446       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2447       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2448    } else {
2449       offset = location->data.atomic.offset;
2450    }
2451
2452    /* Emit the appropriate machine instruction */
2453    const char *callee = ir->callee->function_name();
2454    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2455
2456    if (!strcmp("__intrinsic_atomic_read", callee)) {
2457       emit_untyped_surface_read(surf_index, dst, offset);
2458
2459    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2460       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2461                           src_reg(), src_reg());
2462
2463    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2464       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2465                           src_reg(), src_reg());
2466    }
2467
2468    brw_mark_surface_used(stage_prog_data, surf_index);
2469 }
2470
2471 void
2472 vec4_visitor::visit(ir_call *ir)
2473 {
2474    const char *callee = ir->callee->function_name();
2475
2476    if (!strcmp("__intrinsic_atomic_read", callee) ||
2477        !strcmp("__intrinsic_atomic_increment", callee) ||
2478        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2479       visit_atomic_counter_intrinsic(ir);
2480    } else {
2481       unreachable("Unsupported intrinsic.");
2482    }
2483 }
2484
2485 src_reg
2486 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2487                              src_reg coordinate, src_reg sampler)
2488 {
2489    vec4_instruction *inst =
2490       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2491                                     dst_reg(this, glsl_type::uvec4_type));
2492    inst->base_mrf = 2;
2493    inst->src[1] = sampler;
2494
2495    int param_base;
2496
2497    if (devinfo->gen >= 9) {
2498       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2499       vec4_instruction *header_inst = new(mem_ctx)
2500          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2501                           dst_reg(MRF, inst->base_mrf));
2502
2503       emit(header_inst);
2504
2505       inst->mlen = 2;
2506       inst->header_size = 1;
2507       param_base = inst->base_mrf + 1;
2508    } else {
2509       inst->mlen = 1;
2510       param_base = inst->base_mrf;
2511    }
2512
2513    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2514    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2515    int zero_mask = 0xf & ~coord_mask;
2516
2517    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2518             coordinate));
2519
2520    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2521             src_reg(0)));
2522
2523    emit(inst);
2524    return src_reg(inst->dst);
2525 }
2526
2527 bool
2528 vec4_visitor::is_high_sampler(src_reg sampler)
2529 {
2530    if (devinfo->gen < 8 && !devinfo->is_haswell)
2531       return false;
2532
2533    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2534 }
2535
2536 void
2537 vec4_visitor::emit_texture(ir_texture_opcode op,
2538                            dst_reg dest,
2539                            const glsl_type *dest_type,
2540                            src_reg coordinate,
2541                            int coord_components,
2542                            src_reg shadow_comparitor,
2543                            src_reg lod, src_reg lod2,
2544                            src_reg sample_index,
2545                            uint32_t constant_offset,
2546                            src_reg offset_value,
2547                            src_reg mcs,
2548                            bool is_cube_array,
2549                            uint32_t sampler,
2550                            src_reg sampler_reg)
2551 {
2552    enum opcode opcode;
2553    switch (op) {
2554    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2555    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2556    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2557    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2558    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2559    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2560    case ir_tg4: opcode = offset_value.file != BAD_FILE
2561                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2562    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2563    case ir_txb:
2564       unreachable("TXB is not valid for vertex shaders.");
2565    case ir_lod:
2566       unreachable("LOD is not valid for vertex shaders.");
2567    default:
2568       unreachable("Unrecognized tex op");
2569    }
2570
2571    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2572       opcode, dst_reg(this, dest_type));
2573
2574    inst->offset = constant_offset;
2575
2576    /* The message header is necessary for:
2577     * - Gen4 (always)
2578     * - Gen9+ for selecting SIMD4x2
2579     * - Texel offsets
2580     * - Gather channel selection
2581     * - Sampler indices too large to fit in a 4-bit value.
2582     */
2583    inst->header_size =
2584       (devinfo->gen < 5 || devinfo->gen >= 9 ||
2585        inst->offset != 0 || op == ir_tg4 ||
2586        is_high_sampler(sampler_reg)) ? 1 : 0;
2587    inst->base_mrf = 2;
2588    inst->mlen = inst->header_size + 1; /* always at least one */
2589    inst->dst.writemask = WRITEMASK_XYZW;
2590    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2591
2592    inst->src[1] = sampler_reg;
2593
2594    /* MRF for the first parameter */
2595    int param_base = inst->base_mrf + inst->header_size;
2596
2597    if (op == ir_txs || op == ir_query_levels) {
2598       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2599       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2600    } else {
2601       /* Load the coordinate */
2602       /* FINISHME: gl_clamp_mask and saturate */
2603       int coord_mask = (1 << coord_components) - 1;
2604       int zero_mask = 0xf & ~coord_mask;
2605
2606       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2607                coordinate));
2608
2609       if (zero_mask != 0) {
2610          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2611                   src_reg(0)));
2612       }
2613       /* Load the shadow comparitor */
2614       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2615          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2616                           WRITEMASK_X),
2617                   shadow_comparitor));
2618          inst->mlen++;
2619       }
2620
2621       /* Load the LOD info */
2622       if (op == ir_tex || op == ir_txl) {
2623          int mrf, writemask;
2624          if (devinfo->gen >= 5) {
2625             mrf = param_base + 1;
2626             if (shadow_comparitor.file != BAD_FILE) {
2627                writemask = WRITEMASK_Y;
2628                /* mlen already incremented */
2629             } else {
2630                writemask = WRITEMASK_X;
2631                inst->mlen++;
2632             }
2633          } else /* devinfo->gen == 4 */ {
2634             mrf = param_base;
2635             writemask = WRITEMASK_W;
2636          }
2637          lod.swizzle = BRW_SWIZZLE_XXXX;
2638          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2639       } else if (op == ir_txf) {
2640          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2641       } else if (op == ir_txf_ms) {
2642          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2643                   sample_index));
2644          if (devinfo->gen >= 7) {
2645             /* MCS data is in the first channel of `mcs`, but we need to get it into
2646              * the .y channel of the second vec4 of params, so replicate .x across
2647              * the whole vec4 and then mask off everything except .y
2648              */
2649             mcs.swizzle = BRW_SWIZZLE_XXXX;
2650             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2651                      mcs));
2652          }
2653          inst->mlen++;
2654       } else if (op == ir_txd) {
2655          const brw_reg_type type = lod.type;
2656
2657          if (devinfo->gen >= 5) {
2658             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2659             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2661             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2662             inst->mlen++;
2663
2664             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2665                lod.swizzle = BRW_SWIZZLE_ZZZZ;
2666                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2667                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2668                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2669                inst->mlen++;
2670
2671                if (shadow_comparitor.file != BAD_FILE) {
2672                   emit(MOV(dst_reg(MRF, param_base + 2,
2673                                    shadow_comparitor.type, WRITEMASK_Z),
2674                            shadow_comparitor));
2675                }
2676             }
2677          } else /* devinfo->gen == 4 */ {
2678             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2679             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2680             inst->mlen += 2;
2681          }
2682       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2683          if (shadow_comparitor.file != BAD_FILE) {
2684             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2685                      shadow_comparitor));
2686          }
2687
2688          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2689                   offset_value));
2690          inst->mlen++;
2691       }
2692    }
2693
2694    emit(inst);
2695
2696    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2697     * spec requires layers.
2698     */
2699    if (op == ir_txs && is_cube_array) {
2700       emit_math(SHADER_OPCODE_INT_QUOTIENT,
2701                 writemask(inst->dst, WRITEMASK_Z),
2702                 src_reg(inst->dst), src_reg(6));
2703    }
2704
2705    if (devinfo->gen == 6 && op == ir_tg4) {
2706       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2707    }
2708
2709    swizzle_result(op, dest,
2710                   src_reg(inst->dst), sampler, dest_type);
2711 }
2712
2713 void
2714 vec4_visitor::visit(ir_texture *ir)
2715 {
2716    uint32_t sampler =
2717       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2718
2719    ir_rvalue *nonconst_sampler_index =
2720       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2721
2722    /* Handle non-constant sampler array indexing */
2723    src_reg sampler_reg;
2724    if (nonconst_sampler_index) {
2725       /* The highest sampler which may be used by this operation is
2726        * the last element of the array. Mark it here, because the generator
2727        * doesn't have enough information to determine the bound.
2728        */
2729       uint32_t array_size = ir->sampler->as_dereference_array()
2730          ->array->type->array_size();
2731
2732       uint32_t max_used = sampler + array_size - 1;
2733       if (ir->op == ir_tg4 && devinfo->gen < 8) {
2734          max_used += prog_data->base.binding_table.gather_texture_start;
2735       } else {
2736          max_used += prog_data->base.binding_table.texture_start;
2737       }
2738
2739       brw_mark_surface_used(&prog_data->base, max_used);
2740
2741       /* Emit code to evaluate the actual indexing expression */
2742       nonconst_sampler_index->accept(this);
2743       src_reg temp(this, glsl_type::uint_type);
2744       emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2745       sampler_reg = emit_uniformize(temp);
2746    } else {
2747       /* Single sampler, or constant array index; the indexing expression
2748        * is just an immediate.
2749        */
2750       sampler_reg = src_reg(sampler);
2751    }
2752
2753    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2754     * emitting anything other than setting up the constant result.
2755     */
2756    if (ir->op == ir_tg4) {
2757       ir_constant *chan = ir->lod_info.component->as_constant();
2758       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2759       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2760          dst_reg result(this, ir->type);
2761          this->result = src_reg(result);
2762          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2763          return;
2764       }
2765    }
2766
2767    /* Should be lowered by do_lower_texture_projection */
2768    assert(!ir->projector);
2769
2770    /* Should be lowered */
2771    assert(!ir->offset || !ir->offset->type->is_array());
2772
2773    /* Generate code to compute all the subexpression trees.  This has to be
2774     * done before loading any values into MRFs for the sampler message since
2775     * generating these values may involve SEND messages that need the MRFs.
2776     */
2777    src_reg coordinate;
2778    int coord_components = 0;
2779    if (ir->coordinate) {
2780       coord_components = ir->coordinate->type->vector_elements;
2781       ir->coordinate->accept(this);
2782       coordinate = this->result;
2783    }
2784
2785    src_reg shadow_comparitor;
2786    if (ir->shadow_comparitor) {
2787       ir->shadow_comparitor->accept(this);
2788       shadow_comparitor = this->result;
2789    }
2790
2791    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2792    src_reg offset_value;
2793    if (has_nonconstant_offset) {
2794       ir->offset->accept(this);
2795       offset_value = src_reg(this->result);
2796    }
2797
2798    src_reg lod, lod2, sample_index, mcs;
2799    switch (ir->op) {
2800    case ir_tex:
2801       lod = src_reg(0.0f);
2802       break;
2803    case ir_txf:
2804    case ir_txl:
2805    case ir_txs:
2806       ir->lod_info.lod->accept(this);
2807       lod = this->result;
2808       break;
2809    case ir_query_levels:
2810       lod = src_reg(0);
2811       break;
2812    case ir_txf_ms:
2813       ir->lod_info.sample_index->accept(this);
2814       sample_index = this->result;
2815
2816       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2817          mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2818       else
2819          mcs = src_reg(0u);
2820       break;
2821    case ir_txd:
2822       ir->lod_info.grad.dPdx->accept(this);
2823       lod = this->result;
2824
2825       ir->lod_info.grad.dPdy->accept(this);
2826       lod2 = this->result;
2827       break;
2828    case ir_txb:
2829    case ir_lod:
2830    case ir_tg4:
2831       break;
2832    }
2833
2834    uint32_t constant_offset = 0;
2835    if (ir->offset != NULL && !has_nonconstant_offset) {
2836       constant_offset  =
2837          brw_texture_offset(ir->offset->as_constant()->value.i,
2838                             ir->offset->type->vector_elements);
2839    }
2840
2841    /* Stuff the channel select bits in the top of the texture offset */
2842    if (ir->op == ir_tg4)
2843       constant_offset |=
2844          gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2845                          sampler) << 16;
2846
2847    glsl_type const *type = ir->sampler->type;
2848    bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2849       type->sampler_array;
2850
2851    this->result = src_reg(this, ir->type);
2852    dst_reg dest = dst_reg(this->result);
2853
2854    emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2855                 shadow_comparitor,
2856                 lod, lod2, sample_index,
2857                 constant_offset, offset_value,
2858                 mcs, is_cube_array, sampler, sampler_reg);
2859 }
2860
2861 /**
2862  * Apply workarounds for Gen6 gather with UINT/SINT
2863  */
2864 void
2865 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2866 {
2867    if (!wa)
2868       return;
2869
2870    int width = (wa & WA_8BIT) ? 8 : 16;
2871    dst_reg dst_f = dst;
2872    dst_f.type = BRW_REGISTER_TYPE_F;
2873
2874    /* Convert from UNORM to UINT */
2875    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2876    emit(MOV(dst, src_reg(dst_f)));
2877
2878    if (wa & WA_SIGN) {
2879       /* Reinterpret the UINT value as a signed INT value by
2880        * shifting the sign bit into place, then shifting back
2881        * preserving sign.
2882        */
2883       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2884       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2885    }
2886 }
2887
2888 /**
2889  * Set up the gather channel based on the swizzle, for gather4.
2890  */
2891 uint32_t
2892 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2893 {
2894    int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2895    switch (swiz) {
2896       case SWIZZLE_X: return 0;
2897       case SWIZZLE_Y:
2898          /* gather4 sampler is broken for green channel on RG32F --
2899           * we must ask for blue instead.
2900           */
2901          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2902             return 2;
2903          return 1;
2904       case SWIZZLE_Z: return 2;
2905       case SWIZZLE_W: return 3;
2906       default:
2907          unreachable("Not reached"); /* zero, one swizzles handled already */
2908    }
2909 }
2910
2911 void
2912 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2913                              src_reg orig_val, uint32_t sampler,
2914                              const glsl_type *dest_type)
2915 {
2916    int s = key->tex.swizzles[sampler];
2917
2918    dst_reg swizzled_result = dest;
2919
2920    if (op == ir_query_levels) {
2921       /* # levels is in .w */
2922       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2923       emit(MOV(swizzled_result, orig_val));
2924       return;
2925    }
2926
2927    if (op == ir_txs || dest_type == glsl_type::float_type
2928                         || s == SWIZZLE_NOOP || op == ir_tg4) {
2929       emit(MOV(swizzled_result, orig_val));
2930       return;
2931    }
2932
2933
2934    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2935    int swizzle[4] = {0};
2936
2937    for (int i = 0; i < 4; i++) {
2938       switch (GET_SWZ(s, i)) {
2939       case SWIZZLE_ZERO:
2940          zero_mask |= (1 << i);
2941          break;
2942       case SWIZZLE_ONE:
2943          one_mask |= (1 << i);
2944          break;
2945       default:
2946          copy_mask |= (1 << i);
2947          swizzle[i] = GET_SWZ(s, i);
2948          break;
2949       }
2950    }
2951
2952    if (copy_mask) {
2953       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2954       swizzled_result.writemask = copy_mask;
2955       emit(MOV(swizzled_result, orig_val));
2956    }
2957
2958    if (zero_mask) {
2959       swizzled_result.writemask = zero_mask;
2960       emit(MOV(swizzled_result, src_reg(0.0f)));
2961    }
2962
2963    if (one_mask) {
2964       swizzled_result.writemask = one_mask;
2965       emit(MOV(swizzled_result, src_reg(1.0f)));
2966    }
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_return *)
2971 {
2972    unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_discard *)
2977 {
2978    unreachable("not reached");
2979 }
2980
2981 void
2982 vec4_visitor::visit(ir_if *ir)
2983 {
2984    /* Don't point the annotation at the if statement, because then it plus
2985     * the then and else blocks get printed.
2986     */
2987    this->base_ir = ir->condition;
2988
2989    if (devinfo->gen == 6) {
2990       emit_if_gen6(ir);
2991    } else {
2992       enum brw_predicate predicate;
2993       emit_bool_to_cond_code(ir->condition, &predicate);
2994       emit(IF(predicate));
2995    }
2996
2997    visit_instructions(&ir->then_instructions);
2998
2999    if (!ir->else_instructions.is_empty()) {
3000       this->base_ir = ir->condition;
3001       emit(BRW_OPCODE_ELSE);
3002
3003       visit_instructions(&ir->else_instructions);
3004    }
3005
3006    this->base_ir = ir->condition;
3007    emit(BRW_OPCODE_ENDIF);
3008 }
3009
3010 void
3011 vec4_visitor::gs_emit_vertex(int stream_id)
3012 {
3013    unreachable("not reached");
3014 }
3015
3016 void
3017 vec4_visitor::visit(ir_emit_vertex *)
3018 {
3019    unreachable("not reached");
3020 }
3021
3022 void
3023 vec4_visitor::gs_end_primitive()
3024 {
3025    unreachable("not reached");
3026 }
3027
3028
3029 void
3030 vec4_visitor::visit(ir_end_primitive *)
3031 {
3032    unreachable("not reached");
3033 }
3034
3035 void
3036 vec4_visitor::visit(ir_barrier *)
3037 {
3038    unreachable("not reached");
3039 }
3040
3041 void
3042 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3043                                   dst_reg dst, src_reg offset,
3044                                   src_reg src0, src_reg src1)
3045 {
3046    unsigned mlen = 0;
3047
3048    /* Set the atomic operation offset. */
3049    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3050    mlen++;
3051
3052    /* Set the atomic operation arguments. */
3053    if (src0.file != BAD_FILE) {
3054       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3055       mlen++;
3056    }
3057
3058    if (src1.file != BAD_FILE) {
3059       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3060       mlen++;
3061    }
3062
3063    /* Emit the instruction.  Note that this maps to the normal SIMD8
3064     * untyped atomic message on Ivy Bridge, but that's OK because
3065     * unused channels will be masked out.
3066     */
3067    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3068                                  brw_message_reg(0),
3069                                  src_reg(surf_index), src_reg(atomic_op));
3070    inst->mlen = mlen;
3071 }
3072
3073 void
3074 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3075                                         src_reg offset)
3076 {
3077    /* Set the surface read offset. */
3078    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3079
3080    /* Emit the instruction.  Note that this maps to the normal SIMD8
3081     * untyped surface read message, but that's OK because unused
3082     * channels will be masked out.
3083     */
3084    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3085                                  brw_message_reg(0),
3086                                  src_reg(surf_index), src_reg(1));
3087    inst->mlen = 1;
3088 }
3089
3090 void
3091 vec4_visitor::emit_ndc_computation()
3092 {
3093    /* Get the position */
3094    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3095
3096    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3097    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3098    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3099
3100    current_annotation = "NDC";
3101    dst_reg ndc_w = ndc;
3102    ndc_w.writemask = WRITEMASK_W;
3103    src_reg pos_w = pos;
3104    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3105    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3106
3107    dst_reg ndc_xyz = ndc;
3108    ndc_xyz.writemask = WRITEMASK_XYZ;
3109
3110    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3111 }
3112
3113 void
3114 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3115 {
3116    if (devinfo->gen < 6 &&
3117        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3118         key->userclip_active || devinfo->has_negative_rhw_bug)) {
3119       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3120       dst_reg header1_w = header1;
3121       header1_w.writemask = WRITEMASK_W;
3122
3123       emit(MOV(header1, 0u));
3124
3125       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3126          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3127
3128          current_annotation = "Point size";
3129          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3130          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3131       }
3132
3133       if (key->userclip_active) {
3134          current_annotation = "Clipping flags";
3135          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3136          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3137
3138          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3139          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3140          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3141
3142          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3143          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3144          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3145          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3146       }
3147
3148       /* i965 clipping workaround:
3149        * 1) Test for -ve rhw
3150        * 2) If set,
3151        *      set ndc = (0,0,0,0)
3152        *      set ucp[6] = 1
3153        *
3154        * Later, clipping will detect ucp[6] and ensure the primitive is
3155        * clipped against all fixed planes.
3156        */
3157       if (devinfo->has_negative_rhw_bug) {
3158          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3159          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3160          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3161          vec4_instruction *inst;
3162          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3163          inst->predicate = BRW_PREDICATE_NORMAL;
3164          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3165          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3166          inst->predicate = BRW_PREDICATE_NORMAL;
3167       }
3168
3169       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3170    } else if (devinfo->gen < 6) {
3171       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3172    } else {
3173       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3174       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3175          dst_reg reg_w = reg;
3176          reg_w.writemask = WRITEMASK_W;
3177          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3178          reg_as_src.type = reg_w.type;
3179          reg_as_src.swizzle = brw_swizzle_for_size(1);
3180          emit(MOV(reg_w, reg_as_src));
3181       }
3182       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3183          dst_reg reg_y = reg;
3184          reg_y.writemask = WRITEMASK_Y;
3185          reg_y.type = BRW_REGISTER_TYPE_D;
3186          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3187          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3188       }
3189       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3190          dst_reg reg_z = reg;
3191          reg_z.writemask = WRITEMASK_Z;
3192          reg_z.type = BRW_REGISTER_TYPE_D;
3193          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3194          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3195       }
3196    }
3197 }
3198
3199 void
3200 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3201 {
3202    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3203     *
3204     *     "If a linked set of shaders forming the vertex stage contains no
3205     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3206     *     application has requested clipping against user clip planes through
3207     *     the API, then the coordinate written to gl_Position is used for
3208     *     comparison against the user clip planes."
3209     *
3210     * This function is only called if the shader didn't write to
3211     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3212     * if the user wrote to it; otherwise we use gl_Position.
3213     */
3214    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3215    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3216       clip_vertex = VARYING_SLOT_POS;
3217    }
3218
3219    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3220         ++i) {
3221       reg.writemask = 1 << i;
3222       emit(DP4(reg,
3223                src_reg(output_reg[clip_vertex]),
3224                src_reg(this->userplane[i + offset])));
3225    }
3226 }
3227
3228 vec4_instruction *
3229 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3230 {
3231    assert(varying < VARYING_SLOT_MAX);
3232    assert(output_reg[varying].type == reg.type);
3233    current_annotation = output_reg_annotation[varying];
3234    /* Copy the register, saturating if necessary */
3235    return emit(MOV(reg, src_reg(output_reg[varying])));
3236 }
3237
3238 void
3239 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3240 {
3241    reg.type = BRW_REGISTER_TYPE_F;
3242    output_reg[varying].type = reg.type;
3243
3244    switch (varying) {
3245    case VARYING_SLOT_PSIZ:
3246    {
3247       /* PSIZ is always in slot 0, and is coupled with other flags. */
3248       current_annotation = "indices, point width, clip flags";
3249       emit_psiz_and_flags(reg);
3250       break;
3251    }
3252    case BRW_VARYING_SLOT_NDC:
3253       current_annotation = "NDC";
3254       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3255       break;
3256    case VARYING_SLOT_POS:
3257       current_annotation = "gl_Position";
3258       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3259       break;
3260    case VARYING_SLOT_EDGE:
3261       /* This is present when doing unfilled polygons.  We're supposed to copy
3262        * the edge flag from the user-provided vertex array
3263        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3264        * of that attribute (starts as 1.0f).  This is then used in clipping to
3265        * determine which edges should be drawn as wireframe.
3266        */
3267       current_annotation = "edge flag";
3268       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3269                                     glsl_type::float_type, WRITEMASK_XYZW))));
3270       break;
3271    case BRW_VARYING_SLOT_PAD:
3272       /* No need to write to this slot */
3273       break;
3274    case VARYING_SLOT_COL0:
3275    case VARYING_SLOT_COL1:
3276    case VARYING_SLOT_BFC0:
3277    case VARYING_SLOT_BFC1: {
3278       /* These built-in varyings are only supported in compatibility mode,
3279        * and we only support GS in core profile.  So, this must be a vertex
3280        * shader.
3281        */
3282       assert(stage == MESA_SHADER_VERTEX);
3283       vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3284       if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3285          inst->saturate = true;
3286       break;
3287    }
3288
3289    default:
3290       emit_generic_urb_slot(reg, varying);
3291       break;
3292    }
3293 }
3294
3295 static int
3296 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3297 {
3298    if (devinfo->gen >= 6) {
3299       /* URB data written (does not include the message header reg) must
3300        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
3301        * section 5.4.3.2.2: URB_INTERLEAVED.
3302        *
3303        * URB entries are allocated on a multiple of 1024 bits, so an
3304        * extra 128 bits written here to make the end align to 256 is
3305        * no problem.
3306        */
3307       if ((mlen % 2) != 1)
3308          mlen++;
3309    }
3310
3311    return mlen;
3312 }
3313
3314
3315 /**
3316  * Generates the VUE payload plus the necessary URB write instructions to
3317  * output it.
3318  *
3319  * The VUE layout is documented in Volume 2a.
3320  */
3321 void
3322 vec4_visitor::emit_vertex()
3323 {
3324    /* MRF 0 is reserved for the debugger, so start with message header
3325     * in MRF 1.
3326     */
3327    int base_mrf = 1;
3328    int mrf = base_mrf;
3329    /* In the process of generating our URB write message contents, we
3330     * may need to unspill a register or load from an array.  Those
3331     * reads would use MRFs 14-15.
3332     */
3333    int max_usable_mrf = 13;
3334
3335    /* The following assertion verifies that max_usable_mrf causes an
3336     * even-numbered amount of URB write data, which will meet gen6's
3337     * requirements for length alignment.
3338     */
3339    assert ((max_usable_mrf - base_mrf) % 2 == 0);
3340
3341    /* First mrf is the g0-based message header containing URB handles and
3342     * such.
3343     */
3344    emit_urb_write_header(mrf++);
3345
3346    if (devinfo->gen < 6) {
3347       emit_ndc_computation();
3348    }
3349
3350    /* Lower legacy ff and ClipVertex clipping to clip distances */
3351    if (key->userclip_active && !prog->UsesClipDistanceOut) {
3352       current_annotation = "user clip distances";
3353
3354       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3355       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3356
3357       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3358       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3359    }
3360
3361    /* We may need to split this up into several URB writes, so do them in a
3362     * loop.
3363     */
3364    int slot = 0;
3365    bool complete = false;
3366    do {
3367       /* URB offset is in URB row increments, and each of our MRFs is half of
3368        * one of those, since we're doing interleaved writes.
3369        */
3370       int offset = slot / 2;
3371
3372       mrf = base_mrf + 1;
3373       for (; slot < prog_data->vue_map.num_slots; ++slot) {
3374          emit_urb_slot(dst_reg(MRF, mrf++),
3375                        prog_data->vue_map.slot_to_varying[slot]);
3376
3377          /* If this was max_usable_mrf, we can't fit anything more into this
3378           * URB WRITE.
3379           */
3380          if (mrf > max_usable_mrf) {
3381             slot++;
3382             break;
3383          }
3384       }
3385
3386       complete = slot >= prog_data->vue_map.num_slots;
3387       current_annotation = "URB write";
3388       vec4_instruction *inst = emit_urb_write_opcode(complete);
3389       inst->base_mrf = base_mrf;
3390       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3391       inst->offset += offset;
3392    } while(!complete);
3393 }
3394
3395
3396 src_reg
3397 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3398                                  src_reg *reladdr, int reg_offset)
3399 {
3400    /* Because we store the values to scratch interleaved like our
3401     * vertex data, we need to scale the vec4 index by 2.
3402     */
3403    int message_header_scale = 2;
3404
3405    /* Pre-gen6, the message header uses byte offsets instead of vec4
3406     * (16-byte) offset units.
3407     */
3408    if (devinfo->gen < 6)
3409       message_header_scale *= 16;
3410
3411    if (reladdr) {
3412       src_reg index = src_reg(this, glsl_type::int_type);
3413
3414       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3415                                    src_reg(reg_offset)));
3416       emit_before(block, inst, MUL(dst_reg(index), index,
3417                                    src_reg(message_header_scale)));
3418
3419       return index;
3420    } else {
3421       return src_reg(reg_offset * message_header_scale);
3422    }
3423 }
3424
3425 src_reg
3426 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3427                                        src_reg *reladdr, int reg_offset)
3428 {
3429    if (reladdr) {
3430       src_reg index = src_reg(this, glsl_type::int_type);
3431
3432       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3433                                    src_reg(reg_offset)));
3434
3435       /* Pre-gen6, the message header uses byte offsets instead of vec4
3436        * (16-byte) offset units.
3437        */
3438       if (devinfo->gen < 6) {
3439          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3440       }
3441
3442       return index;
3443    } else if (devinfo->gen >= 8) {
3444       /* Store the offset in a GRF so we can send-from-GRF. */
3445       src_reg offset = src_reg(this, glsl_type::int_type);
3446       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3447       return offset;
3448    } else {
3449       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3450       return src_reg(reg_offset * message_header_scale);
3451    }
3452 }
3453
3454 /**
3455  * Emits an instruction before @inst to load the value named by @orig_src
3456  * from scratch space at @base_offset to @temp.
3457  *
3458  * @base_offset is measured in 32-byte units (the size of a register).
3459  */
3460 void
3461 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3462                                 dst_reg temp, src_reg orig_src,
3463                                 int base_offset)
3464 {
3465    int reg_offset = base_offset + orig_src.reg_offset;
3466    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3467                                       reg_offset);
3468
3469    emit_before(block, inst, SCRATCH_READ(temp, index));
3470 }
3471
3472 /**
3473  * Emits an instruction after @inst to store the value to be written
3474  * to @orig_dst to scratch space at @base_offset, from @temp.
3475  *
3476  * @base_offset is measured in 32-byte units (the size of a register).
3477  */
3478 void
3479 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3480                                  int base_offset)
3481 {
3482    int reg_offset = base_offset + inst->dst.reg_offset;
3483    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3484                                       reg_offset);
3485
3486    /* Create a temporary register to store *inst's result in.
3487     *
3488     * We have to be careful in MOVing from our temporary result register in
3489     * the scratch write.  If we swizzle from channels of the temporary that
3490     * weren't initialized, it will confuse live interval analysis, which will
3491     * make spilling fail to make progress.
3492     */
3493    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3494                                        inst->dst.type),
3495                                 brw_swizzle_for_mask(inst->dst.writemask));
3496    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3497                                        inst->dst.writemask));
3498    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3499    if (inst->opcode != BRW_OPCODE_SEL)
3500       write->predicate = inst->predicate;
3501    write->ir = inst->ir;
3502    write->annotation = inst->annotation;
3503    inst->insert_after(block, write);
3504
3505    inst->dst.file = temp.file;
3506    inst->dst.reg = temp.reg;
3507    inst->dst.reg_offset = temp.reg_offset;
3508    inst->dst.reladdr = NULL;
3509 }
3510
3511 /**
3512  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3513  * adds the scratch read(s) before \p inst. The function also checks for
3514  * recursive reladdr scratch accesses, issuing the corresponding scratch
3515  * loads and rewriting reladdr references accordingly.
3516  *
3517  * \return \p src if it did not require a scratch load, otherwise, the
3518  * register holding the result of the scratch load that the caller should
3519  * use to rewrite src.
3520  */
3521 src_reg
3522 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3523                                    vec4_instruction *inst, src_reg src)
3524 {
3525    /* Resolve recursive reladdr scratch access by calling ourselves
3526     * with src.reladdr
3527     */
3528    if (src.reladdr)
3529       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3530                                           *src.reladdr);
3531
3532    /* Now handle scratch access on src */
3533    if (src.file == GRF && scratch_loc[src.reg] != -1) {
3534       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3535       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3536       src.reg = temp.reg;
3537       src.reg_offset = temp.reg_offset;
3538       src.reladdr = NULL;
3539    }
3540
3541    return src;
3542 }
3543
3544 /**
3545  * We can't generally support array access in GRF space, because a
3546  * single instruction's destination can only span 2 contiguous
3547  * registers.  So, we send all GRF arrays that get variable index
3548  * access to scratch space.
3549  */
3550 void
3551 vec4_visitor::move_grf_array_access_to_scratch()
3552 {
3553    int scratch_loc[this->alloc.count];
3554    memset(scratch_loc, -1, sizeof(scratch_loc));
3555
3556    /* First, calculate the set of virtual GRFs that need to be punted
3557     * to scratch due to having any array access on them, and where in
3558     * scratch.
3559     */
3560    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3561       if (inst->dst.file == GRF && inst->dst.reladdr) {
3562          if (scratch_loc[inst->dst.reg] == -1) {
3563             scratch_loc[inst->dst.reg] = last_scratch;
3564             last_scratch += this->alloc.sizes[inst->dst.reg];
3565          }
3566
3567          for (src_reg *iter = inst->dst.reladdr;
3568               iter->reladdr;
3569               iter = iter->reladdr) {
3570             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3571                scratch_loc[iter->reg] = last_scratch;
3572                last_scratch += this->alloc.sizes[iter->reg];
3573             }
3574          }
3575       }
3576
3577       for (int i = 0 ; i < 3; i++) {
3578          for (src_reg *iter = &inst->src[i];
3579               iter->reladdr;
3580               iter = iter->reladdr) {
3581             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3582                scratch_loc[iter->reg] = last_scratch;
3583                last_scratch += this->alloc.sizes[iter->reg];
3584             }
3585          }
3586       }
3587    }
3588
3589    /* Now, for anything that will be accessed through scratch, rewrite
3590     * it to load/store.  Note that this is a _safe list walk, because
3591     * we may generate a new scratch_write instruction after the one
3592     * we're processing.
3593     */
3594    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3595       /* Set up the annotation tracking for new generated instructions. */
3596       base_ir = inst->ir;
3597       current_annotation = inst->annotation;
3598
3599       /* First handle scratch access on the dst. Notice we have to handle
3600        * the case where the dst's reladdr also points to scratch space.
3601        */
3602       if (inst->dst.reladdr)
3603          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3604                                                    *inst->dst.reladdr);
3605
3606       /* Now that we have handled any (possibly recursive) reladdr scratch
3607        * accesses for dst we can safely do the scratch write for dst itself
3608        */
3609       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3610          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3611
3612       /* Now handle scratch access on any src. In this case, since inst->src[i]
3613        * already is a src_reg, we can just call emit_resolve_reladdr with
3614        * inst->src[i] and it will take care of handling scratch loads for
3615        * both src and src.reladdr (recursively).
3616        */
3617       for (int i = 0 ; i < 3; i++) {
3618          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3619                                              inst->src[i]);
3620       }
3621    }
3622 }
3623
3624 /**
3625  * Emits an instruction before @inst to load the value named by @orig_src
3626  * from the pull constant buffer (surface) at @base_offset to @temp.
3627  */
3628 void
3629 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3630                                       dst_reg temp, src_reg orig_src,
3631                                       int base_offset)
3632 {
3633    int reg_offset = base_offset + orig_src.reg_offset;
3634    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3635    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3636                                              reg_offset);
3637
3638    emit_pull_constant_load_reg(temp,
3639                                index,
3640                                offset,
3641                                block, inst);
3642 }
3643
3644 /**
3645  * Implements array access of uniforms by inserting a
3646  * PULL_CONSTANT_LOAD instruction.
3647  *
3648  * Unlike temporary GRF array access (where we don't support it due to
3649  * the difficulty of doing relative addressing on instruction
3650  * destinations), we could potentially do array access of uniforms
3651  * that were loaded in GRF space as push constants.  In real-world
3652  * usage we've seen, though, the arrays being used are always larger
3653  * than we could load as push constants, so just always move all
3654  * uniform array access out to a pull constant buffer.
3655  */
3656 void
3657 vec4_visitor::move_uniform_array_access_to_pull_constants()
3658 {
3659    int pull_constant_loc[this->uniforms];
3660    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3661    bool nested_reladdr;
3662
3663    /* Walk through and find array access of uniforms.  Put a copy of that
3664     * uniform in the pull constant buffer.
3665     *
3666     * Note that we don't move constant-indexed accesses to arrays.  No
3667     * testing has been done of the performance impact of this choice.
3668     */
3669    do {
3670       nested_reladdr = false;
3671
3672       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3673          for (int i = 0 ; i < 3; i++) {
3674             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3675                continue;
3676
3677             int uniform = inst->src[i].reg;
3678
3679             if (inst->src[i].reladdr->reladdr)
3680                nested_reladdr = true;  /* will need another pass */
3681
3682             /* If this array isn't already present in the pull constant buffer,
3683              * add it.
3684              */
3685             if (pull_constant_loc[uniform] == -1) {
3686                const gl_constant_value **values =
3687                   &stage_prog_data->param[uniform * 4];
3688
3689                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3690
3691                assert(uniform < uniform_array_size);
3692                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3693                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3694                      = values[j];
3695                }
3696             }
3697
3698             /* Set up the annotation tracking for new generated instructions. */
3699             base_ir = inst->ir;
3700             current_annotation = inst->annotation;
3701
3702             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3703
3704             emit_pull_constant_load(block, inst, temp, inst->src[i],
3705                                     pull_constant_loc[uniform]);
3706
3707             inst->src[i].file = temp.file;
3708             inst->src[i].reg = temp.reg;
3709             inst->src[i].reg_offset = temp.reg_offset;
3710             inst->src[i].reladdr = NULL;
3711          }
3712       }
3713    } while (nested_reladdr);
3714
3715    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3716     * no need to track them as larger-than-vec4 objects.  This will be
3717     * relied on in cutting out unused uniform vectors from push
3718     * constants.
3719     */
3720    split_uniform_registers();
3721 }
3722
3723 void
3724 vec4_visitor::resolve_ud_negate(src_reg *reg)
3725 {
3726    if (reg->type != BRW_REGISTER_TYPE_UD ||
3727        !reg->negate)
3728       return;
3729
3730    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3731    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3732    *reg = temp;
3733 }
3734
3735 /**
3736  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3737  *
3738  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3739  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3740  */
3741 void
3742 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3743 {
3744    assert(devinfo->gen <= 5);
3745
3746    if (!rvalue->type->is_boolean())
3747       return;
3748
3749    src_reg and_result = src_reg(this, rvalue->type);
3750    src_reg neg_result = src_reg(this, rvalue->type);
3751    emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3752    emit(MOV(dst_reg(neg_result), negate(and_result)));
3753    *reg = neg_result;
3754 }
3755
3756 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3757                            void *log_data,
3758                            struct gl_program *prog,
3759                            const struct brw_vue_prog_key *key,
3760                            struct brw_vue_prog_data *prog_data,
3761                            struct gl_shader_program *shader_prog,
3762                            gl_shader_stage stage,
3763                            void *mem_ctx,
3764                            bool no_spills,
3765                            int shader_time_index)
3766    : backend_shader(compiler, log_data, mem_ctx,
3767                     shader_prog, prog, &prog_data->base, stage),
3768      key(key),
3769      prog_data(prog_data),
3770      sanity_param_count(0),
3771      fail_msg(NULL),
3772      first_non_payload_grf(0),
3773      need_all_constants_in_pull_buffer(false),
3774      no_spills(no_spills),
3775      shader_time_index(shader_time_index),
3776      last_scratch(0)
3777 {
3778    this->failed = false;
3779
3780    this->base_ir = NULL;
3781    this->current_annotation = NULL;
3782    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3783
3784    this->variable_ht = hash_table_ctor(0,
3785                                        hash_table_pointer_hash,
3786                                        hash_table_pointer_compare);
3787
3788    this->virtual_grf_start = NULL;
3789    this->virtual_grf_end = NULL;
3790    this->live_intervals = NULL;
3791
3792    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3793
3794    this->uniforms = 0;
3795
3796    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3797     * at least one. See setup_uniforms() in brw_vec4.cpp.
3798     */
3799    this->uniform_array_size = 1;
3800    if (prog_data) {
3801       this->uniform_array_size =
3802          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3803    }
3804
3805    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3806    this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3807 }
3808
3809 vec4_visitor::~vec4_visitor()
3810 {
3811    hash_table_dtor(this->variable_ht);
3812 }
3813
3814
3815 void
3816 vec4_visitor::fail(const char *format, ...)
3817 {
3818    va_list va;
3819    char *msg;
3820
3821    if (failed)
3822       return;
3823
3824    failed = true;
3825
3826    va_start(va, format);
3827    msg = ralloc_vasprintf(mem_ctx, format, va);
3828    va_end(va);
3829    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3830
3831    this->fail_msg = msg;
3832
3833    if (debug_enabled) {
3834       fprintf(stderr, "%s",  msg);
3835    }
3836 }
3837
3838 } /* namespace brw */