src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = 14;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = 13;
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 void
 280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 281 {
 282    static enum opcode dot_opcodes[] = {
 283       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 284    };
 285
 286    emit(dot_opcodes[elements - 2], dst, src0, src1);
 287 }
 288
 289 src_reg
 290 vec4_visitor::fix_3src_operand(const src_reg &src)
 291 {
 292    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 293     * able to use vertical stride of zero to replicate the vec4 uniform, like
 294     *
 295     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 296     *
 297     * But you can't, since vertical stride is always four in three-source
 298     * instructions. Instead, insert a MOV instruction to do the replication so
 299     * that the three-source instruction can consume it.
 300     */
 301
 302    /* The MOV is only needed if the source is a uniform or immediate. */
 303    if (src.file != UNIFORM && src.file != IMM)
 304       return src;
 305
 306    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 307       return src;
 308
 309    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 310    expanded.type = src.type;
 311    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 312    return src_reg(expanded);
 313 }
 314
 315 src_reg
 316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 317 {
 318    if (!src.abs && !src.negate)
 319       return src;
 320
 321    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 322    resolved.type = src.type;
 323    emit(MOV(resolved, src));
 324
 325    return src_reg(resolved);
 326 }
 327
 328 src_reg
 329 vec4_visitor::fix_math_operand(const src_reg &src)
 330 {
 331    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 332       return src;
 333
 334    /* The gen6 math instruction ignores the source modifiers --
 335     * swizzle, abs, negate, and at least some parts of the register
 336     * region description.
 337     *
 338     * Rather than trying to enumerate all these cases, *always* expand the
 339     * operand to a temp GRF for gen6.
 340     *
 341     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 342     * can't use.
 343     */
 344
 345    if (devinfo->gen == 7 && src.file != IMM)
 346       return src;
 347
 348    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 349    expanded.type = src.type;
 350    emit(MOV(expanded, src));
 351    return src_reg(expanded);
 352 }
 353
 354 vec4_instruction *
 355 vec4_visitor::emit_math(enum opcode opcode,
 356                         const dst_reg &dst,
 357                         const src_reg &src0, const src_reg &src1)
 358 {
 359    vec4_instruction *math =
 360       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 361
 362    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 363       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 364       math->dst = dst_reg(this, glsl_type::vec4_type);
 365       math->dst.type = dst.type;
 366       math = emit(MOV(dst, src_reg(math->dst)));
 367    } else if (devinfo->gen < 6) {
 368       math->base_mrf = 1;
 369       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 370    }
 371
 372    return math;
 373 }
 374
 375 void
 376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 377 {
 378    if (devinfo->gen < 7) {
 379       unreachable("ir_unop_pack_half_2x16 should be lowered");
 380    }
 381
 382    assert(dst.type == BRW_REGISTER_TYPE_UD);
 383    assert(src0.type == BRW_REGISTER_TYPE_F);
 384
 385    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 386     *
 387     *   Because this instruction does not have a 16-bit floating-point type,
 388     *   the destination data type must be Word (W).
 389     *
 390     *   The destination must be DWord-aligned and specify a horizontal stride
 391     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 392     *   each destination channel and the upper word is not modified.
 393     *
 394     * The above restriction implies that the f32to16 instruction must use
 395     * align1 mode, because only in align1 mode is it possible to specify
 396     * horizontal stride.  We choose here to defy the hardware docs and emit
 397     * align16 instructions.
 398     *
 399     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 400     * instructions. I was partially successful in that the code passed all
 401     * tests.  However, the code was dubiously correct and fragile, and the
 402     * tests were not harsh enough to probe that frailty. Not trusting the
 403     * code, I chose instead to remain in align16 mode in defiance of the hw
 404     * docs).
 405     *
 406     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 407     * simulator, emitting a f32to16 in align16 mode with UD as destination
 408     * data type is safe. The behavior differs from that specified in the PRM
 409     * in that the upper word of each destination channel is cleared to 0.
 410     */
 411
 412    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 413    src_reg tmp_src(tmp_dst);
 414
 415 #if 0
 416    /* Verify the undocumented behavior on which the following instructions
 417     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 418     * then the result of the bit-or instruction below will be incorrect.
 419     *
 420     * You should inspect the disasm output in order to verify that the MOV is
 421     * not optimized away.
 422     */
 423    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 424 #endif
 425
 426    /* Give tmp the form below, where "." means untouched.
 427     *
 428     *     w z          y          x w z          y          x
 429     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 430     *
 431     * That the upper word of each write-channel be 0 is required for the
 432     * following bit-shift and bit-or instructions to work. Note that this
 433     * relies on the undocumented hardware behavior mentioned above.
 434     */
 435    tmp_dst.writemask = WRITEMASK_XY;
 436    emit(F32TO16(tmp_dst, src0));
 437
 438    /* Give the write-channels of dst the form:
 439     *   0xhhhh0000
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 442    emit(SHL(dst, tmp_src, src_reg(16u)));
 443
 444    /* Finally, give the write-channels of dst the form of packHalf2x16's
 445     * output:
 446     *   0xhhhhllll
 447     */
 448    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 449    emit(OR(dst, src_reg(dst), tmp_src));
 450 }
 451
 452 void
 453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 454 {
 455    if (devinfo->gen < 7) {
 456       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 457    }
 458
 459    assert(dst.type == BRW_REGISTER_TYPE_F);
 460    assert(src0.type == BRW_REGISTER_TYPE_UD);
 461
 462    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 463     *
 464     *   Because this instruction does not have a 16-bit floating-point type,
 465     *   the source data type must be Word (W). The destination type must be
 466     *   F (Float).
 467     *
 468     * To use W as the source data type, we must adjust horizontal strides,
 469     * which is only possible in align1 mode. All my [chadv] attempts at
 470     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 471     * Piglit tests, so I gave up.
 472     *
 473     * I've verified that, on gen7 hardware and the simulator, it is safe to
 474     * emit f16to32 in align16 mode with UD as source data type.
 475     */
 476
 477    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 478    src_reg tmp_src(tmp_dst);
 479
 480    tmp_dst.writemask = WRITEMASK_X;
 481    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 482
 483    tmp_dst.writemask = WRITEMASK_Y;
 484    emit(SHR(tmp_dst, src0, src_reg(16u)));
 485
 486    dst.writemask = WRITEMASK_XY;
 487    emit(F16TO32(dst, tmp_src));
 488 }
 489
 490 void
 491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 492 {
 493    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 494     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 495     * is not suitable to generate the shift values, but we can use the packed
 496     * vector float and a type-converting MOV.
 497     */
 498    dst_reg shift(this, glsl_type::uvec4_type);
 499    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 500
 501    dst_reg shifted(this, glsl_type::uvec4_type);
 502    src0.swizzle = BRW_SWIZZLE_XXXX;
 503    emit(SHR(shifted, src0, src_reg(shift)));
 504
 505    shifted.type = BRW_REGISTER_TYPE_UB;
 506    dst_reg f(this, glsl_type::vec4_type);
 507    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 508
 509    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 510 }
 511
 512 void
 513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 514 {
 515    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 516     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 517     * is not suitable to generate the shift values, but we can use the packed
 518     * vector float and a type-converting MOV.
 519     */
 520    dst_reg shift(this, glsl_type::uvec4_type);
 521    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 522
 523    dst_reg shifted(this, glsl_type::uvec4_type);
 524    src0.swizzle = BRW_SWIZZLE_XXXX;
 525    emit(SHR(shifted, src0, src_reg(shift)));
 526
 527    shifted.type = BRW_REGISTER_TYPE_B;
 528    dst_reg f(this, glsl_type::vec4_type);
 529    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 530
 531    dst_reg scaled(this, glsl_type::vec4_type);
 532    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 533
 534    dst_reg max(this, glsl_type::vec4_type);
 535    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 536    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 537 }
 538
 539 void
 540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 541 {
 542    dst_reg saturated(this, glsl_type::vec4_type);
 543    vec4_instruction *inst = emit(MOV(saturated, src0));
 544    inst->saturate = true;
 545
 546    dst_reg scaled(this, glsl_type::vec4_type);
 547    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 548
 549    dst_reg rounded(this, glsl_type::vec4_type);
 550    emit(RNDE(rounded, src_reg(scaled)));
 551
 552    dst_reg u(this, glsl_type::uvec4_type);
 553    emit(MOV(u, src_reg(rounded)));
 554
 555    src_reg bytes(u);
 556    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 557 }
 558
 559 void
 560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 561 {
 562    dst_reg max(this, glsl_type::vec4_type);
 563    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 564
 565    dst_reg min(this, glsl_type::vec4_type);
 566    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 567
 568    dst_reg scaled(this, glsl_type::vec4_type);
 569    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 570
 571    dst_reg rounded(this, glsl_type::vec4_type);
 572    emit(RNDE(rounded, src_reg(scaled)));
 573
 574    dst_reg i(this, glsl_type::ivec4_type);
 575    emit(MOV(i, src_reg(rounded)));
 576
 577    src_reg bytes(i);
 578    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 579 }
 580
 581 void
 582 vec4_visitor::visit_instructions(const exec_list *list)
 583 {
 584    foreach_in_list(ir_instruction, ir, list) {
 585       base_ir = ir;
 586       ir->accept(this);
 587    }
 588 }
 589
 590 /**
 591  * Returns the minimum number of vec4 elements needed to pack a type.
 592  *
 593  * For simple types, it will return 1 (a single vec4); for matrices, the
 594  * number of columns; for array and struct, the sum of the vec4_size of
 595  * each of its elements; and for sampler and atomic, zero.
 596  *
 597  * This method is useful to calculate how much register space is needed to
 598  * store a particular type.
 599  */
 600 int
 601 vec4_visitor::type_size(const struct glsl_type *type)
 602 {
 603    unsigned int i;
 604    int size;
 605
 606    switch (type->base_type) {
 607    case GLSL_TYPE_UINT:
 608    case GLSL_TYPE_INT:
 609    case GLSL_TYPE_FLOAT:
 610    case GLSL_TYPE_BOOL:
 611       if (type->is_matrix()) {
 612          return type->matrix_columns;
 613       } else {
 614          /* Regardless of size of vector, it gets a vec4. This is bad
 615           * packing for things like floats, but otherwise arrays become a
 616           * mess.  Hopefully a later pass over the code can pack scalars
 617           * down if appropriate.
 618           */
 619          return 1;
 620       }
 621    case GLSL_TYPE_ARRAY:
 622       assert(type->length > 0);
 623       return type_size(type->fields.array) * type->length;
 624    case GLSL_TYPE_STRUCT:
 625       size = 0;
 626       for (i = 0; i < type->length; i++) {
 627          size += type_size(type->fields.structure[i].type);
 628       }
 629       return size;
 630    case GLSL_TYPE_SUBROUTINE:
 631       return 1;
 632
 633    case GLSL_TYPE_SAMPLER:
 634       /* Samplers take up no register space, since they're baked in at
 635        * link time.
 636        */
 637       return 0;
 638    case GLSL_TYPE_ATOMIC_UINT:
 639       return 0;
 640    case GLSL_TYPE_IMAGE:
 641       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 642    case GLSL_TYPE_VOID:
 643    case GLSL_TYPE_DOUBLE:
 644    case GLSL_TYPE_ERROR:
 645    case GLSL_TYPE_INTERFACE:
 646    case GLSL_TYPE_FUNCTION:
 647       unreachable("not reached");
 648    }
 649
 650    return 0;
 651 }
 652
 653 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 654 {
 655    init();
 656
 657    this->file = GRF;
 658    this->reg = v->alloc.allocate(v->type_size(type));
 659
 660    if (type->is_array() || type->is_record()) {
 661       this->swizzle = BRW_SWIZZLE_NOOP;
 662    } else {
 663       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 664    }
 665
 666    this->type = brw_type_for_base_type(type);
 667 }
 668
 669 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 670 {
 671    assert(size > 0);
 672
 673    init();
 674
 675    this->file = GRF;
 676    this->reg = v->alloc.allocate(v->type_size(type) * size);
 677
 678    this->swizzle = BRW_SWIZZLE_NOOP;
 679
 680    this->type = brw_type_for_base_type(type);
 681 }
 682
 683 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 684 {
 685    init();
 686
 687    this->file = GRF;
 688    this->reg = v->alloc.allocate(v->type_size(type));
 689
 690    if (type->is_array() || type->is_record()) {
 691       this->writemask = WRITEMASK_XYZW;
 692    } else {
 693       this->writemask = (1 << type->vector_elements) - 1;
 694    }
 695
 696    this->type = brw_type_for_base_type(type);
 697 }
 698
 699 void
 700 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
 701                                           unsigned n)
 702 {
 703    static const gl_constant_value zero = { 0 };
 704
 705    for (unsigned i = 0; i < n; ++i)
 706       stage_prog_data->param[4 * uniforms + i] = &values[i];
 707
 708    for (unsigned i = n; i < 4; ++i)
 709       stage_prog_data->param[4 * uniforms + i] = &zero;
 710
 711    uniform_vector_size[uniforms++] = n;
 712 }
 713
 714 /* Our support for uniforms is piggy-backed on the struct
 715  * gl_fragment_program, because that's where the values actually
 716  * get stored, rather than in some global gl_shader_program uniform
 717  * store.
 718  */
 719 void
 720 vec4_visitor::setup_uniform_values(ir_variable *ir)
 721 {
 722    int namelen = strlen(ir->name);
 723
 724    /* The data for our (non-builtin) uniforms is stored in a series of
 725     * gl_uniform_driver_storage structs for each subcomponent that
 726     * glGetUniformLocation() could name.  We know it's been set up in the same
 727     * order we'd walk the type, so walk the list of storage and find anything
 728     * with our name, or the prefix of a component that starts with our name.
 729     */
 730    for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
 731       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 732
 733       if (storage->builtin)
 734          continue;
 735
 736       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 737           (storage->name[namelen] != 0 &&
 738            storage->name[namelen] != '.' &&
 739            storage->name[namelen] != '[')) {
 740          continue;
 741       }
 742
 743       const unsigned vector_count = (MAX2(storage->array_elements, 1) *
 744                                      storage->type->matrix_columns);
 745       const unsigned vector_size = storage->type->vector_elements;
 746
 747       for (unsigned s = 0; s < vector_count; s++)
 748          setup_vector_uniform_values(&storage->storage[s * vector_size],
 749                                      vector_size);
 750    }
 751 }
 752
 753 void
 754 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 755 {
 756    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 757       assert(this->uniforms < uniform_array_size);
 758       this->uniform_vector_size[this->uniforms] = 4;
 759       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 760       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 761       for (int j = 0; j < 4; ++j) {
 762          stage_prog_data->param[this->uniforms * 4 + j] =
 763             (gl_constant_value *) &clip_planes[i][j];
 764       }
 765       ++this->uniforms;
 766    }
 767 }
 768
 769 /* Our support for builtin uniforms is even scarier than non-builtin.
 770  * It sits on top of the PROG_STATE_VAR parameters that are
 771  * automatically updated from GL context state.
 772  */
 773 void
 774 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 775 {
 776    const ir_state_slot *const slots = ir->get_state_slots();
 777    assert(slots != NULL);
 778
 779    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
 780       /* This state reference has already been setup by ir_to_mesa,
 781        * but we'll get the same index back here.  We can reference
 782        * ParameterValues directly, since unlike brw_fs.cpp, we never
 783        * add new state references during compile.
 784        */
 785       int index = _mesa_add_state_reference(this->prog->Parameters,
 786                                             (gl_state_index *)slots[i].tokens);
 787       gl_constant_value *values =
 788          &this->prog->Parameters->ParameterValues[index][0];
 789
 790       assert(this->uniforms < uniform_array_size);
 791
 792       for (unsigned j = 0; j < 4; j++)
 793          stage_prog_data->param[this->uniforms * 4 + j] =
 794             &values[GET_SWZ(slots[i].swizzle, j)];
 795
 796       this->uniform_vector_size[this->uniforms] =
 797          (ir->type->is_scalar() || ir->type->is_vector() ||
 798           ir->type->is_matrix() ? ir->type->vector_elements : 4);
 799
 800       this->uniforms++;
 801    }
 802 }
 803
 804 dst_reg *
 805 vec4_visitor::variable_storage(ir_variable *var)
 806 {
 807    return (dst_reg *)hash_table_find(this->variable_ht, var);
 808 }
 809
 810 void
 811 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
 812                                      enum brw_predicate *predicate)
 813 {
 814    ir_expression *expr = ir->as_expression();
 815
 816    *predicate = BRW_PREDICATE_NORMAL;
 817
 818    if (expr && expr->operation != ir_binop_ubo_load) {
 819       src_reg op[3];
 820       vec4_instruction *inst;
 821
 822       assert(expr->get_num_operands() <= 3);
 823       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 824          expr->operands[i]->accept(this);
 825          op[i] = this->result;
 826
 827          resolve_ud_negate(&op[i]);
 828       }
 829
 830       switch (expr->operation) {
 831       case ir_unop_logic_not:
 832          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 833          inst->conditional_mod = BRW_CONDITIONAL_Z;
 834          break;
 835
 836       case ir_binop_logic_xor:
 837          if (devinfo->gen <= 5) {
 838             src_reg temp = src_reg(this, ir->type);
 839             emit(XOR(dst_reg(temp), op[0], op[1]));
 840             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 841          } else {
 842             inst = emit(XOR(dst_null_d(), op[0], op[1]));
 843          }
 844          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 845          break;
 846
 847       case ir_binop_logic_or:
 848          if (devinfo->gen <= 5) {
 849             src_reg temp = src_reg(this, ir->type);
 850             emit(OR(dst_reg(temp), op[0], op[1]));
 851             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 852          } else {
 853             inst = emit(OR(dst_null_d(), op[0], op[1]));
 854          }
 855          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 856          break;
 857
 858       case ir_binop_logic_and:
 859          if (devinfo->gen <= 5) {
 860             src_reg temp = src_reg(this, ir->type);
 861             emit(AND(dst_reg(temp), op[0], op[1]));
 862             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 863          } else {
 864             inst = emit(AND(dst_null_d(), op[0], op[1]));
 865          }
 866          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 867          break;
 868
 869       case ir_unop_f2b:
 870          if (devinfo->gen >= 6) {
 871             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 872          } else {
 873             inst = emit(MOV(dst_null_f(), op[0]));
 874             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 875          }
 876          break;
 877
 878       case ir_unop_i2b:
 879          if (devinfo->gen >= 6) {
 880             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 881          } else {
 882             inst = emit(MOV(dst_null_d(), op[0]));
 883             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 884          }
 885          break;
 886
 887       case ir_binop_all_equal:
 888          if (devinfo->gen <= 5) {
 889             resolve_bool_comparison(expr->operands[0], &op[0]);
 890             resolve_bool_comparison(expr->operands[1], &op[1]);
 891          }
 892          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 893          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 894          break;
 895
 896       case ir_binop_any_nequal:
 897          if (devinfo->gen <= 5) {
 898             resolve_bool_comparison(expr->operands[0], &op[0]);
 899             resolve_bool_comparison(expr->operands[1], &op[1]);
 900          }
 901          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 902          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 903          break;
 904
 905       case ir_unop_any:
 906          if (devinfo->gen <= 5) {
 907             resolve_bool_comparison(expr->operands[0], &op[0]);
 908          }
 909          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 910          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 911          break;
 912
 913       case ir_binop_greater:
 914       case ir_binop_gequal:
 915       case ir_binop_less:
 916       case ir_binop_lequal:
 917       case ir_binop_equal:
 918       case ir_binop_nequal:
 919          if (devinfo->gen <= 5) {
 920             resolve_bool_comparison(expr->operands[0], &op[0]);
 921             resolve_bool_comparison(expr->operands[1], &op[1]);
 922          }
 923          emit(CMP(dst_null_d(), op[0], op[1],
 924                   brw_conditional_for_comparison(expr->operation)));
 925          break;
 926
 927       case ir_triop_csel: {
 928          /* Expand the boolean condition into the flag register. */
 929          inst = emit(MOV(dst_null_d(), op[0]));
 930          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 931
 932          /* Select which boolean to return. */
 933          dst_reg temp(this, expr->operands[1]->type);
 934          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
 935          inst->predicate = BRW_PREDICATE_NORMAL;
 936
 937          /* Expand the result to a condition code. */
 938          inst = emit(MOV(dst_null_d(), src_reg(temp)));
 939          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 940          break;
 941       }
 942
 943       default:
 944          unreachable("not reached");
 945       }
 946       return;
 947    }
 948
 949    ir->accept(this);
 950
 951    resolve_ud_negate(&this->result);
 952
 953    vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
 954    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 955 }
 956
 957 /**
 958  * Emit a gen6 IF statement with the comparison folded into the IF
 959  * instruction.
 960  */
 961 void
 962 vec4_visitor::emit_if_gen6(ir_if *ir)
 963 {
 964    ir_expression *expr = ir->condition->as_expression();
 965
 966    if (expr && expr->operation != ir_binop_ubo_load) {
 967       src_reg op[3];
 968       dst_reg temp;
 969
 970       assert(expr->get_num_operands() <= 3);
 971       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 972          expr->operands[i]->accept(this);
 973          op[i] = this->result;
 974       }
 975
 976       switch (expr->operation) {
 977       case ir_unop_logic_not:
 978          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 979          return;
 980
 981       case ir_binop_logic_xor:
 982          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 983          return;
 984
 985       case ir_binop_logic_or:
 986          temp = dst_reg(this, glsl_type::bool_type);
 987          emit(OR(temp, op[0], op[1]));
 988          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 989          return;
 990
 991       case ir_binop_logic_and:
 992          temp = dst_reg(this, glsl_type::bool_type);
 993          emit(AND(temp, op[0], op[1]));
 994          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 995          return;
 996
 997       case ir_unop_f2b:
 998          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 999          return;
1000
1001       case ir_unop_i2b:
1002          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1003          return;
1004
1005       case ir_binop_greater:
1006       case ir_binop_gequal:
1007       case ir_binop_less:
1008       case ir_binop_lequal:
1009       case ir_binop_equal:
1010       case ir_binop_nequal:
1011          emit(IF(op[0], op[1],
1012                  brw_conditional_for_comparison(expr->operation)));
1013          return;
1014
1015       case ir_binop_all_equal:
1016          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1017          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1018          return;
1019
1020       case ir_binop_any_nequal:
1021          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1022          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1023          return;
1024
1025       case ir_unop_any:
1026          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1027          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1028          return;
1029
1030       case ir_triop_csel: {
1031          /* Expand the boolean condition into the flag register. */
1032          vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1033          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1034
1035          /* Select which boolean to return. */
1036          dst_reg temp(this, expr->operands[1]->type);
1037          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1038          inst->predicate = BRW_PREDICATE_NORMAL;
1039
1040          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1041          return;
1042       }
1043
1044       default:
1045          unreachable("not reached");
1046       }
1047       return;
1048    }
1049
1050    ir->condition->accept(this);
1051
1052    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1053 }
1054
1055 void
1056 vec4_visitor::visit(ir_variable *ir)
1057 {
1058    dst_reg *reg = NULL;
1059
1060    if (variable_storage(ir))
1061       return;
1062
1063    switch (ir->data.mode) {
1064    case ir_var_shader_in:
1065       assert(ir->data.location != -1);
1066       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1067       break;
1068
1069    case ir_var_shader_out:
1070       assert(ir->data.location != -1);
1071       reg = new(mem_ctx) dst_reg(this, ir->type);
1072
1073       for (int i = 0; i < type_size(ir->type); i++) {
1074          output_reg[ir->data.location + i] = *reg;
1075          output_reg[ir->data.location + i].reg_offset = i;
1076          output_reg_annotation[ir->data.location + i] = ir->name;
1077       }
1078       break;
1079
1080    case ir_var_auto:
1081    case ir_var_temporary:
1082       reg = new(mem_ctx) dst_reg(this, ir->type);
1083       break;
1084
1085    case ir_var_uniform:
1086       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1087
1088       /* Thanks to the lower_ubo_reference pass, we will see only
1089        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1090        * variables, so no need for them to be in variable_ht.
1091        *
1092        * Some uniforms, such as samplers and atomic counters, have no actual
1093        * storage, so we should ignore them.
1094        */
1095       if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1096          return;
1097
1098       /* Track how big the whole uniform variable is, in case we need to put a
1099        * copy of its data into pull constants for array access.
1100        */
1101       assert(this->uniforms < uniform_array_size);
1102       this->uniform_size[this->uniforms] = type_size(ir->type);
1103
1104       if (!strncmp(ir->name, "gl_", 3)) {
1105          setup_builtin_uniform_values(ir);
1106       } else {
1107          setup_uniform_values(ir);
1108       }
1109       break;
1110
1111    case ir_var_system_value:
1112       reg = make_reg_for_system_value(ir->data.location, ir->type);
1113       break;
1114
1115    default:
1116       unreachable("not reached");
1117    }
1118
1119    reg->type = brw_type_for_base_type(ir->type);
1120    hash_table_insert(this->variable_ht, reg, ir);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop *ir)
1125 {
1126    /* We don't want debugging output to print the whole body of the
1127     * loop as the annotation.
1128     */
1129    this->base_ir = NULL;
1130
1131    emit(BRW_OPCODE_DO);
1132
1133    visit_instructions(&ir->body_instructions);
1134
1135    emit(BRW_OPCODE_WHILE);
1136 }
1137
1138 void
1139 vec4_visitor::visit(ir_loop_jump *ir)
1140 {
1141    switch (ir->mode) {
1142    case ir_loop_jump::jump_break:
1143       emit(BRW_OPCODE_BREAK);
1144       break;
1145    case ir_loop_jump::jump_continue:
1146       emit(BRW_OPCODE_CONTINUE);
1147       break;
1148    }
1149 }
1150
1151
1152 void
1153 vec4_visitor::visit(ir_function_signature *)
1154 {
1155    unreachable("not reached");
1156 }
1157
1158 void
1159 vec4_visitor::visit(ir_function *ir)
1160 {
1161    /* Ignore function bodies other than main() -- we shouldn't see calls to
1162     * them since they should all be inlined.
1163     */
1164    if (strcmp(ir->name, "main") == 0) {
1165       const ir_function_signature *sig;
1166       exec_list empty;
1167
1168       sig = ir->matching_signature(NULL, &empty, false);
1169
1170       assert(sig);
1171
1172       visit_instructions(&sig->body);
1173    }
1174 }
1175
1176 bool
1177 vec4_visitor::try_emit_mad(ir_expression *ir)
1178 {
1179    /* 3-src instructions were introduced in gen6. */
1180    if (devinfo->gen < 6)
1181       return false;
1182
1183    /* MAD can only handle floating-point data. */
1184    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1185       return false;
1186
1187    ir_rvalue *nonmul;
1188    ir_expression *mul;
1189    bool mul_negate, mul_abs;
1190
1191    for (int i = 0; i < 2; i++) {
1192       mul_negate = false;
1193       mul_abs = false;
1194
1195       mul = ir->operands[i]->as_expression();
1196       nonmul = ir->operands[1 - i];
1197
1198       if (mul && mul->operation == ir_unop_abs) {
1199          mul = mul->operands[0]->as_expression();
1200          mul_abs = true;
1201       } else if (mul && mul->operation == ir_unop_neg) {
1202          mul = mul->operands[0]->as_expression();
1203          mul_negate = true;
1204       }
1205
1206       if (mul && mul->operation == ir_binop_mul)
1207          break;
1208    }
1209
1210    if (!mul || mul->operation != ir_binop_mul)
1211       return false;
1212
1213    nonmul->accept(this);
1214    src_reg src0 = fix_3src_operand(this->result);
1215
1216    mul->operands[0]->accept(this);
1217    src_reg src1 = fix_3src_operand(this->result);
1218    src1.negate ^= mul_negate;
1219    src1.abs = mul_abs;
1220    if (mul_abs)
1221       src1.negate = false;
1222
1223    mul->operands[1]->accept(this);
1224    src_reg src2 = fix_3src_operand(this->result);
1225    src2.abs = mul_abs;
1226    if (mul_abs)
1227       src2.negate = false;
1228
1229    this->result = src_reg(this, ir->type);
1230    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1231
1232    return true;
1233 }
1234
1235 bool
1236 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1237 {
1238    /* This optimization relies on CMP setting the destination to 0 when
1239     * false.  Early hardware only sets the least significant bit, and
1240     * leaves the other bits undefined.  So we can't use it.
1241     */
1242    if (devinfo->gen < 6)
1243       return false;
1244
1245    ir_expression *const cmp = ir->operands[0]->as_expression();
1246
1247    if (cmp == NULL)
1248       return false;
1249
1250    switch (cmp->operation) {
1251    case ir_binop_less:
1252    case ir_binop_greater:
1253    case ir_binop_lequal:
1254    case ir_binop_gequal:
1255    case ir_binop_equal:
1256    case ir_binop_nequal:
1257       break;
1258
1259    default:
1260       return false;
1261    }
1262
1263    cmp->operands[0]->accept(this);
1264    const src_reg cmp_src0 = this->result;
1265
1266    cmp->operands[1]->accept(this);
1267    const src_reg cmp_src1 = this->result;
1268
1269    this->result = src_reg(this, ir->type);
1270
1271    emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1272             brw_conditional_for_comparison(cmp->operation)));
1273
1274    /* If the comparison is false, this->result will just happen to be zero.
1275     */
1276    vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1277                                        this->result, src_reg(1.0f));
1278    inst->predicate = BRW_PREDICATE_NORMAL;
1279    inst->predicate_inverse = true;
1280
1281    return true;
1282 }
1283
1284 vec4_instruction *
1285 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1286                           src_reg src0, src_reg src1)
1287 {
1288    vec4_instruction *inst;
1289
1290    if (devinfo->gen >= 6) {
1291       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1292       inst->conditional_mod = conditionalmod;
1293    } else {
1294       emit(CMP(dst, src0, src1, conditionalmod));
1295
1296       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1297       inst->predicate = BRW_PREDICATE_NORMAL;
1298    }
1299
1300    return inst;
1301 }
1302
1303 vec4_instruction *
1304 vec4_visitor::emit_lrp(const dst_reg &dst,
1305                        const src_reg &x, const src_reg &y, const src_reg &a)
1306 {
1307    if (devinfo->gen >= 6) {
1308       /* Note that the instruction's argument order is reversed from GLSL
1309        * and the IR.
1310        */
1311      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1312                      fix_3src_operand(x)));
1313    } else {
1314       /* Earlier generations don't support three source operations, so we
1315        * need to emit x*(1-a) + y*a.
1316        */
1317       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
1318       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
1319       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1320       y_times_a.writemask           = dst.writemask;
1321       one_minus_a.writemask         = dst.writemask;
1322       x_times_one_minus_a.writemask = dst.writemask;
1323
1324       emit(MUL(y_times_a, y, a));
1325       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1326       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1327       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1328    }
1329 }
1330
1331 /**
1332  * Emits the instructions needed to perform a pull constant load. before_block
1333  * and before_inst can be NULL in which case the instruction will be appended
1334  * to the end of the instruction list.
1335  */
1336 void
1337 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1338                                           src_reg surf_index,
1339                                           src_reg offset_reg,
1340                                           bblock_t *before_block,
1341                                           vec4_instruction *before_inst)
1342 {
1343    assert((before_inst == NULL && before_block == NULL) ||
1344           (before_inst && before_block));
1345
1346    vec4_instruction *pull;
1347
1348    if (devinfo->gen >= 9) {
1349       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1350       src_reg header(this, glsl_type::uvec4_type, 2);
1351
1352       pull = new(mem_ctx)
1353          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1354                           dst_reg(header));
1355
1356       if (before_inst)
1357          emit_before(before_block, before_inst, pull);
1358       else
1359          emit(pull);
1360
1361       dst_reg index_reg = retype(offset(dst_reg(header), 1),
1362                                  offset_reg.type);
1363       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1364
1365       if (before_inst)
1366          emit_before(before_block, before_inst, pull);
1367       else
1368          emit(pull);
1369
1370       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1371                                            dst,
1372                                            surf_index,
1373                                            header);
1374       pull->mlen = 2;
1375       pull->header_size = 1;
1376    } else if (devinfo->gen >= 7) {
1377       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1378
1379       grf_offset.type = offset_reg.type;
1380
1381       pull = MOV(grf_offset, offset_reg);
1382
1383       if (before_inst)
1384          emit_before(before_block, before_inst, pull);
1385       else
1386          emit(pull);
1387
1388       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1389                                            dst,
1390                                            surf_index,
1391                                            src_reg(grf_offset));
1392       pull->mlen = 1;
1393    } else {
1394       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1395                                            dst,
1396                                            surf_index,
1397                                            offset_reg);
1398       pull->base_mrf = 14;
1399       pull->mlen = 1;
1400    }
1401
1402    if (before_inst)
1403       emit_before(before_block, before_inst, pull);
1404    else
1405       emit(pull);
1406 }
1407
1408 src_reg
1409 vec4_visitor::emit_uniformize(const src_reg &src)
1410 {
1411    const src_reg chan_index(this, glsl_type::uint_type);
1412    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1413                               src.type);
1414
1415    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1416       ->force_writemask_all = true;
1417    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1418       ->force_writemask_all = true;
1419
1420    return src_reg(dst);
1421 }
1422
1423 void
1424 vec4_visitor::visit(ir_expression *ir)
1425 {
1426    unsigned int operand;
1427    src_reg op[ARRAY_SIZE(ir->operands)];
1428    vec4_instruction *inst;
1429
1430    if (ir->operation == ir_binop_add) {
1431       if (try_emit_mad(ir))
1432          return;
1433    }
1434
1435    if (ir->operation == ir_unop_b2f) {
1436       if (try_emit_b2f_of_compare(ir))
1437          return;
1438    }
1439
1440    /* Storage for our result.  Ideally for an assignment we'd be using
1441     * the actual storage for the result here, instead.
1442     */
1443    dst_reg result_dst(this, ir->type);
1444    src_reg result_src(result_dst);
1445
1446    if (ir->operation == ir_triop_csel) {
1447       ir->operands[1]->accept(this);
1448       op[1] = this->result;
1449       ir->operands[2]->accept(this);
1450       op[2] = this->result;
1451
1452       enum brw_predicate predicate;
1453       emit_bool_to_cond_code(ir->operands[0], &predicate);
1454       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1455       inst->predicate = predicate;
1456       this->result = result_src;
1457       return;
1458    }
1459
1460    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1461       this->result.file = BAD_FILE;
1462       ir->operands[operand]->accept(this);
1463       if (this->result.file == BAD_FILE) {
1464          fprintf(stderr, "Failed to get tree for expression operand:\n");
1465          ir->operands[operand]->fprint(stderr);
1466          exit(1);
1467       }
1468       op[operand] = this->result;
1469
1470       /* Matrix expression operands should have been broken down to vector
1471        * operations already.
1472        */
1473       assert(!ir->operands[operand]->type->is_matrix());
1474    }
1475
1476    /* If nothing special happens, this is the result. */
1477    this->result = result_src;
1478
1479    switch (ir->operation) {
1480    case ir_unop_logic_not:
1481       emit(NOT(result_dst, op[0]));
1482       break;
1483    case ir_unop_neg:
1484       op[0].negate = !op[0].negate;
1485       emit(MOV(result_dst, op[0]));
1486       break;
1487    case ir_unop_abs:
1488       op[0].abs = true;
1489       op[0].negate = false;
1490       emit(MOV(result_dst, op[0]));
1491       break;
1492
1493    case ir_unop_sign:
1494       if (ir->type->is_float()) {
1495          /* AND(val, 0x80000000) gives the sign bit.
1496           *
1497           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1498           * zero.
1499           */
1500          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1501
1502          op[0].type = BRW_REGISTER_TYPE_UD;
1503          result_dst.type = BRW_REGISTER_TYPE_UD;
1504          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1505
1506          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1507          inst->predicate = BRW_PREDICATE_NORMAL;
1508
1509          this->result.type = BRW_REGISTER_TYPE_F;
1510       } else {
1511          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1512           *               -> non-negative val generates 0x00000000.
1513           *  Predicated OR sets 1 if val is positive.
1514           */
1515          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1516
1517          emit(ASR(result_dst, op[0], src_reg(31)));
1518
1519          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1520          inst->predicate = BRW_PREDICATE_NORMAL;
1521       }
1522       break;
1523
1524    case ir_unop_rcp:
1525       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1526       break;
1527
1528    case ir_unop_exp2:
1529       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1530       break;
1531    case ir_unop_log2:
1532       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1533       break;
1534    case ir_unop_exp:
1535    case ir_unop_log:
1536       unreachable("not reached: should be handled by ir_explog_to_explog2");
1537    case ir_unop_sin:
1538       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1539       break;
1540    case ir_unop_cos:
1541       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1542       break;
1543
1544    case ir_unop_dFdx:
1545    case ir_unop_dFdx_coarse:
1546    case ir_unop_dFdx_fine:
1547    case ir_unop_dFdy:
1548    case ir_unop_dFdy_coarse:
1549    case ir_unop_dFdy_fine:
1550       unreachable("derivatives not valid in vertex shader");
1551
1552    case ir_unop_bitfield_reverse:
1553       emit(BFREV(result_dst, op[0]));
1554       break;
1555    case ir_unop_bit_count:
1556       emit(CBIT(result_dst, op[0]));
1557       break;
1558    case ir_unop_find_msb: {
1559       src_reg temp = src_reg(this, glsl_type::uint_type);
1560
1561       inst = emit(FBH(dst_reg(temp), op[0]));
1562       inst->dst.writemask = WRITEMASK_XYZW;
1563
1564       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1565        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1566        * subtract the result from 31 to convert the MSB count into an LSB count.
1567        */
1568
1569       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1570       temp.swizzle = BRW_SWIZZLE_NOOP;
1571       emit(MOV(result_dst, temp));
1572
1573       src_reg src_tmp = src_reg(result_dst);
1574       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1575
1576       src_tmp.negate = true;
1577       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1578       inst->predicate = BRW_PREDICATE_NORMAL;
1579       break;
1580    }
1581    case ir_unop_find_lsb:
1582       emit(FBL(result_dst, op[0]));
1583       break;
1584    case ir_unop_saturate:
1585       inst = emit(MOV(result_dst, op[0]));
1586       inst->saturate = true;
1587       break;
1588
1589    case ir_unop_noise:
1590       unreachable("not reached: should be handled by lower_noise");
1591
1592    case ir_unop_subroutine_to_int:
1593       emit(MOV(result_dst, op[0]));
1594       break;
1595
1596    case ir_binop_add:
1597       emit(ADD(result_dst, op[0], op[1]));
1598       break;
1599    case ir_binop_sub:
1600       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1601
1602    case ir_binop_mul:
1603       if (devinfo->gen < 8 && ir->type->is_integer()) {
1604          /* For integer multiplication, the MUL uses the low 16 bits of one of
1605           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1606           * accumulates in the contribution of the upper 16 bits of that
1607           * operand.  If we can determine that one of the args is in the low
1608           * 16 bits, though, we can just emit a single MUL.
1609           */
1610          if (ir->operands[0]->is_uint16_constant()) {
1611             if (devinfo->gen < 7)
1612                emit(MUL(result_dst, op[0], op[1]));
1613             else
1614                emit(MUL(result_dst, op[1], op[0]));
1615          } else if (ir->operands[1]->is_uint16_constant()) {
1616             if (devinfo->gen < 7)
1617                emit(MUL(result_dst, op[1], op[0]));
1618             else
1619                emit(MUL(result_dst, op[0], op[1]));
1620          } else {
1621             struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1622
1623             emit(MUL(acc, op[0], op[1]));
1624             emit(MACH(dst_null_d(), op[0], op[1]));
1625             emit(MOV(result_dst, src_reg(acc)));
1626          }
1627       } else {
1628          emit(MUL(result_dst, op[0], op[1]));
1629       }
1630       break;
1631    case ir_binop_imul_high: {
1632       struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1633
1634       emit(MUL(acc, op[0], op[1]));
1635       emit(MACH(result_dst, op[0], op[1]));
1636       break;
1637    }
1638    case ir_binop_div:
1639       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1640       assert(ir->type->is_integer());
1641       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1642       break;
1643
1644    case ir_binop_carry:
1645       unreachable("Should have been lowered by carry_to_arith().");
1646
1647    case ir_binop_borrow:
1648       unreachable("Should have been lowered by borrow_to_arith().");
1649
1650    case ir_binop_mod:
1651       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1652       assert(ir->type->is_integer());
1653       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1654       break;
1655
1656    case ir_binop_less:
1657    case ir_binop_greater:
1658    case ir_binop_lequal:
1659    case ir_binop_gequal:
1660    case ir_binop_equal:
1661    case ir_binop_nequal: {
1662       if (devinfo->gen <= 5) {
1663          resolve_bool_comparison(ir->operands[0], &op[0]);
1664          resolve_bool_comparison(ir->operands[1], &op[1]);
1665       }
1666       emit(CMP(result_dst, op[0], op[1],
1667                brw_conditional_for_comparison(ir->operation)));
1668       break;
1669    }
1670
1671    case ir_binop_all_equal:
1672       if (devinfo->gen <= 5) {
1673          resolve_bool_comparison(ir->operands[0], &op[0]);
1674          resolve_bool_comparison(ir->operands[1], &op[1]);
1675       }
1676
1677       /* "==" operator producing a scalar boolean. */
1678       if (ir->operands[0]->type->is_vector() ||
1679           ir->operands[1]->type->is_vector()) {
1680          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1681          emit(MOV(result_dst, src_reg(0)));
1682          inst = emit(MOV(result_dst, src_reg(~0)));
1683          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1684       } else {
1685          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1686       }
1687       break;
1688    case ir_binop_any_nequal:
1689       if (devinfo->gen <= 5) {
1690          resolve_bool_comparison(ir->operands[0], &op[0]);
1691          resolve_bool_comparison(ir->operands[1], &op[1]);
1692       }
1693
1694       /* "!=" operator producing a scalar boolean. */
1695       if (ir->operands[0]->type->is_vector() ||
1696           ir->operands[1]->type->is_vector()) {
1697          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1698
1699          emit(MOV(result_dst, src_reg(0)));
1700          inst = emit(MOV(result_dst, src_reg(~0)));
1701          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1702       } else {
1703          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1704       }
1705       break;
1706
1707    case ir_unop_any:
1708       if (devinfo->gen <= 5) {
1709          resolve_bool_comparison(ir->operands[0], &op[0]);
1710       }
1711       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1712       emit(MOV(result_dst, src_reg(0)));
1713
1714       inst = emit(MOV(result_dst, src_reg(~0)));
1715       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1716       break;
1717
1718    case ir_binop_logic_xor:
1719       emit(XOR(result_dst, op[0], op[1]));
1720       break;
1721
1722    case ir_binop_logic_or:
1723       emit(OR(result_dst, op[0], op[1]));
1724       break;
1725
1726    case ir_binop_logic_and:
1727       emit(AND(result_dst, op[0], op[1]));
1728       break;
1729
1730    case ir_binop_dot:
1731       assert(ir->operands[0]->type->is_vector());
1732       assert(ir->operands[0]->type == ir->operands[1]->type);
1733       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1734       break;
1735
1736    case ir_unop_sqrt:
1737       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1738       break;
1739    case ir_unop_rsq:
1740       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1741       break;
1742
1743    case ir_unop_bitcast_i2f:
1744    case ir_unop_bitcast_u2f:
1745       this->result = op[0];
1746       this->result.type = BRW_REGISTER_TYPE_F;
1747       break;
1748
1749    case ir_unop_bitcast_f2i:
1750       this->result = op[0];
1751       this->result.type = BRW_REGISTER_TYPE_D;
1752       break;
1753
1754    case ir_unop_bitcast_f2u:
1755       this->result = op[0];
1756       this->result.type = BRW_REGISTER_TYPE_UD;
1757       break;
1758
1759    case ir_unop_i2f:
1760    case ir_unop_i2u:
1761    case ir_unop_u2i:
1762    case ir_unop_u2f:
1763    case ir_unop_f2i:
1764    case ir_unop_f2u:
1765       emit(MOV(result_dst, op[0]));
1766       break;
1767    case ir_unop_b2i:
1768    case ir_unop_b2f:
1769       if (devinfo->gen <= 5) {
1770          resolve_bool_comparison(ir->operands[0], &op[0]);
1771       }
1772       emit(MOV(result_dst, negate(op[0])));
1773       break;
1774    case ir_unop_f2b:
1775       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1776       break;
1777    case ir_unop_i2b:
1778       emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1779       break;
1780
1781    case ir_unop_trunc:
1782       emit(RNDZ(result_dst, op[0]));
1783       break;
1784    case ir_unop_ceil: {
1785          src_reg tmp = src_reg(this, ir->type);
1786          op[0].negate = !op[0].negate;
1787          emit(RNDD(dst_reg(tmp), op[0]));
1788          tmp.negate = true;
1789          emit(MOV(result_dst, tmp));
1790       }
1791       break;
1792    case ir_unop_floor:
1793       inst = emit(RNDD(result_dst, op[0]));
1794       break;
1795    case ir_unop_fract:
1796       inst = emit(FRC(result_dst, op[0]));
1797       break;
1798    case ir_unop_round_even:
1799       emit(RNDE(result_dst, op[0]));
1800       break;
1801
1802    case ir_binop_min:
1803       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1804       break;
1805    case ir_binop_max:
1806       emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1807       break;
1808
1809    case ir_binop_pow:
1810       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1811       break;
1812
1813    case ir_unop_bit_not:
1814       inst = emit(NOT(result_dst, op[0]));
1815       break;
1816    case ir_binop_bit_and:
1817       inst = emit(AND(result_dst, op[0], op[1]));
1818       break;
1819    case ir_binop_bit_xor:
1820       inst = emit(XOR(result_dst, op[0], op[1]));
1821       break;
1822    case ir_binop_bit_or:
1823       inst = emit(OR(result_dst, op[0], op[1]));
1824       break;
1825
1826    case ir_binop_lshift:
1827       inst = emit(SHL(result_dst, op[0], op[1]));
1828       break;
1829
1830    case ir_binop_rshift:
1831       if (ir->type->base_type == GLSL_TYPE_INT)
1832          inst = emit(ASR(result_dst, op[0], op[1]));
1833       else
1834          inst = emit(SHR(result_dst, op[0], op[1]));
1835       break;
1836
1837    case ir_binop_bfm:
1838       emit(BFI1(result_dst, op[0], op[1]));
1839       break;
1840
1841    case ir_binop_ubo_load: {
1842       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1843       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1844       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1845       src_reg offset;
1846
1847       /* Now, load the vector from that offset. */
1848       assert(ir->type->is_vector() || ir->type->is_scalar());
1849
1850       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1851       packed_consts.type = result.type;
1852       src_reg surf_index;
1853
1854       if (const_uniform_block) {
1855          /* The block index is a constant, so just emit the binding table entry
1856           * as an immediate.
1857           */
1858          surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1859                               const_uniform_block->value.u[0]);
1860       } else {
1861          /* The block index is not a constant. Evaluate the index expression
1862           * per-channel and add the base UBO index; we have to select a value
1863           * from any live channel.
1864           */
1865          surf_index = src_reg(this, glsl_type::uint_type);
1866          emit(ADD(dst_reg(surf_index), op[0],
1867                   src_reg(prog_data->base.binding_table.ubo_start)));
1868          surf_index = emit_uniformize(surf_index);
1869
1870          /* Assume this may touch any UBO. It would be nice to provide
1871           * a tighter bound, but the array information is already lowered away.
1872           */
1873          brw_mark_surface_used(&prog_data->base,
1874                                prog_data->base.binding_table.ubo_start +
1875                                shader_prog->NumUniformBlocks - 1);
1876       }
1877
1878       if (const_offset_ir) {
1879          if (devinfo->gen >= 8) {
1880             /* Store the offset in a GRF so we can send-from-GRF. */
1881             offset = src_reg(this, glsl_type::int_type);
1882             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1883          } else {
1884             /* Immediates are fine on older generations since they'll be moved
1885              * to a (potentially fake) MRF at the generator level.
1886              */
1887             offset = src_reg(const_offset / 16);
1888          }
1889       } else {
1890          offset = src_reg(this, glsl_type::uint_type);
1891          emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1892       }
1893
1894       emit_pull_constant_load_reg(dst_reg(packed_consts),
1895                                   surf_index,
1896                                   offset,
1897                                   NULL, NULL /* before_block/inst */);
1898
1899       packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1900       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1901                                             const_offset % 16 / 4,
1902                                             const_offset % 16 / 4,
1903                                             const_offset % 16 / 4);
1904
1905       /* UBO bools are any nonzero int.  We need to convert them to 0/~0. */
1906       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1907          emit(CMP(result_dst, packed_consts, src_reg(0u),
1908                   BRW_CONDITIONAL_NZ));
1909       } else {
1910          emit(MOV(result_dst, packed_consts));
1911       }
1912       break;
1913    }
1914
1915    case ir_binop_vector_extract:
1916       unreachable("should have been lowered by vec_index_to_cond_assign");
1917
1918    case ir_triop_fma:
1919       op[0] = fix_3src_operand(op[0]);
1920       op[1] = fix_3src_operand(op[1]);
1921       op[2] = fix_3src_operand(op[2]);
1922       /* Note that the instruction's argument order is reversed from GLSL
1923        * and the IR.
1924        */
1925       emit(MAD(result_dst, op[2], op[1], op[0]));
1926       break;
1927
1928    case ir_triop_lrp:
1929       emit_lrp(result_dst, op[0], op[1], op[2]);
1930       break;
1931
1932    case ir_triop_csel:
1933       unreachable("already handled above");
1934       break;
1935
1936    case ir_triop_bfi:
1937       op[0] = fix_3src_operand(op[0]);
1938       op[1] = fix_3src_operand(op[1]);
1939       op[2] = fix_3src_operand(op[2]);
1940       emit(BFI2(result_dst, op[0], op[1], op[2]));
1941       break;
1942
1943    case ir_triop_bitfield_extract:
1944       op[0] = fix_3src_operand(op[0]);
1945       op[1] = fix_3src_operand(op[1]);
1946       op[2] = fix_3src_operand(op[2]);
1947       /* Note that the instruction's argument order is reversed from GLSL
1948        * and the IR.
1949        */
1950       emit(BFE(result_dst, op[2], op[1], op[0]));
1951       break;
1952
1953    case ir_triop_vector_insert:
1954       unreachable("should have been lowered by lower_vector_insert");
1955
1956    case ir_quadop_bitfield_insert:
1957       unreachable("not reached: should be handled by "
1958               "bitfield_insert_to_bfm_bfi\n");
1959
1960    case ir_quadop_vector:
1961       unreachable("not reached: should be handled by lower_quadop_vector");
1962
1963    case ir_unop_pack_half_2x16:
1964       emit_pack_half_2x16(result_dst, op[0]);
1965       break;
1966    case ir_unop_unpack_half_2x16:
1967       emit_unpack_half_2x16(result_dst, op[0]);
1968       break;
1969    case ir_unop_unpack_unorm_4x8:
1970       emit_unpack_unorm_4x8(result_dst, op[0]);
1971       break;
1972    case ir_unop_unpack_snorm_4x8:
1973       emit_unpack_snorm_4x8(result_dst, op[0]);
1974       break;
1975    case ir_unop_pack_unorm_4x8:
1976       emit_pack_unorm_4x8(result_dst, op[0]);
1977       break;
1978    case ir_unop_pack_snorm_4x8:
1979       emit_pack_snorm_4x8(result_dst, op[0]);
1980       break;
1981    case ir_unop_pack_snorm_2x16:
1982    case ir_unop_pack_unorm_2x16:
1983    case ir_unop_unpack_snorm_2x16:
1984    case ir_unop_unpack_unorm_2x16:
1985       unreachable("not reached: should be handled by lower_packing_builtins");
1986    case ir_unop_unpack_half_2x16_split_x:
1987    case ir_unop_unpack_half_2x16_split_y:
1988    case ir_binop_pack_half_2x16_split:
1989    case ir_unop_interpolate_at_centroid:
1990    case ir_binop_interpolate_at_sample:
1991    case ir_binop_interpolate_at_offset:
1992       unreachable("not reached: should not occur in vertex shader");
1993    case ir_binop_ldexp:
1994       unreachable("not reached: should be handled by ldexp_to_arith()");
1995    case ir_unop_d2f:
1996    case ir_unop_f2d:
1997    case ir_unop_d2i:
1998    case ir_unop_i2d:
1999    case ir_unop_d2u:
2000    case ir_unop_u2d:
2001    case ir_unop_d2b:
2002    case ir_unop_pack_double_2x32:
2003    case ir_unop_unpack_double_2x32:
2004    case ir_unop_frexp_sig:
2005    case ir_unop_frexp_exp:
2006       unreachable("fp64 todo");
2007    }
2008 }
2009
2010
2011 void
2012 vec4_visitor::visit(ir_swizzle *ir)
2013 {
2014    /* Note that this is only swizzles in expressions, not those on the left
2015     * hand side of an assignment, which do write masking.  See ir_assignment
2016     * for that.
2017     */
2018    const unsigned swz = brw_compose_swizzle(
2019       brw_swizzle_for_size(ir->type->vector_elements),
2020       BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2021
2022    ir->val->accept(this);
2023    this->result = swizzle(this->result, swz);
2024 }
2025
2026 void
2027 vec4_visitor::visit(ir_dereference_variable *ir)
2028 {
2029    const struct glsl_type *type = ir->type;
2030    dst_reg *reg = variable_storage(ir->var);
2031
2032    if (!reg) {
2033       fail("Failed to find variable storage for %s\n", ir->var->name);
2034       this->result = src_reg(brw_null_reg());
2035       return;
2036    }
2037
2038    this->result = src_reg(*reg);
2039
2040    /* System values get their swizzle from the dst_reg writemask */
2041    if (ir->var->data.mode == ir_var_system_value)
2042       return;
2043
2044    if (type->is_scalar() || type->is_vector() || type->is_matrix())
2045       this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2046 }
2047
2048
2049 int
2050 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2051 {
2052    /* Under normal circumstances array elements are stored consecutively, so
2053     * the stride is equal to the size of the array element.
2054     */
2055    return type_size(ir->type);
2056 }
2057
2058
2059 void
2060 vec4_visitor::visit(ir_dereference_array *ir)
2061 {
2062    ir_constant *constant_index;
2063    src_reg src;
2064    int array_stride = compute_array_stride(ir);
2065
2066    constant_index = ir->array_index->constant_expression_value();
2067
2068    ir->array->accept(this);
2069    src = this->result;
2070
2071    if (constant_index) {
2072       src.reg_offset += constant_index->value.i[0] * array_stride;
2073    } else {
2074       /* Variable index array dereference.  It eats the "vec4" of the
2075        * base of the array and an index that offsets the Mesa register
2076        * index.
2077        */
2078       ir->array_index->accept(this);
2079
2080       src_reg index_reg;
2081
2082       if (array_stride == 1) {
2083          index_reg = this->result;
2084       } else {
2085          index_reg = src_reg(this, glsl_type::int_type);
2086
2087          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2088       }
2089
2090       if (src.reladdr) {
2091          src_reg temp = src_reg(this, glsl_type::int_type);
2092
2093          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2094
2095          index_reg = temp;
2096       }
2097
2098       src.reladdr = ralloc(mem_ctx, src_reg);
2099       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2100    }
2101
2102    /* If the type is smaller than a vec4, replicate the last channel out. */
2103    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104       src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105    else
2106       src.swizzle = BRW_SWIZZLE_NOOP;
2107    src.type = brw_type_for_base_type(ir->type);
2108
2109    this->result = src;
2110 }
2111
2112 void
2113 vec4_visitor::visit(ir_dereference_record *ir)
2114 {
2115    unsigned int i;
2116    const glsl_type *struct_type = ir->record->type;
2117    int offset = 0;
2118
2119    ir->record->accept(this);
2120
2121    for (i = 0; i < struct_type->length; i++) {
2122       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2123          break;
2124       offset += type_size(struct_type->fields.structure[i].type);
2125    }
2126
2127    /* If the type is smaller than a vec4, replicate the last channel out. */
2128    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2129       this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2130    else
2131       this->result.swizzle = BRW_SWIZZLE_NOOP;
2132    this->result.type = brw_type_for_base_type(ir->type);
2133
2134    this->result.reg_offset += offset;
2135 }
2136
2137 /**
2138  * We want to be careful in assignment setup to hit the actual storage
2139  * instead of potentially using a temporary like we might with the
2140  * ir_dereference handler.
2141  */
2142 static dst_reg
2143 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2144 {
2145    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2146     * access of a vector, it must be separated into a series conditional moves
2147     * before reaching this point (see ir_vec_index_to_cond_assign).
2148     */
2149    assert(ir->as_dereference());
2150    ir_dereference_array *deref_array = ir->as_dereference_array();
2151    if (deref_array) {
2152       assert(!deref_array->array->type->is_vector());
2153    }
2154
2155    /* Use the rvalue deref handler for the most part.  We'll ignore
2156     * swizzles in it and write swizzles using writemask, though.
2157     */
2158    ir->accept(v);
2159    return dst_reg(v->result);
2160 }
2161
2162 void
2163 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2164                               const struct glsl_type *type,
2165                               enum brw_predicate predicate)
2166 {
2167    if (type->base_type == GLSL_TYPE_STRUCT) {
2168       for (unsigned int i = 0; i < type->length; i++) {
2169          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2170       }
2171       return;
2172    }
2173
2174    if (type->is_array()) {
2175       for (unsigned int i = 0; i < type->length; i++) {
2176          emit_block_move(dst, src, type->fields.array, predicate);
2177       }
2178       return;
2179    }
2180
2181    if (type->is_matrix()) {
2182       const struct glsl_type *vec_type;
2183
2184       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2185                                          type->vector_elements, 1);
2186
2187       for (int i = 0; i < type->matrix_columns; i++) {
2188          emit_block_move(dst, src, vec_type, predicate);
2189       }
2190       return;
2191    }
2192
2193    assert(type->is_scalar() || type->is_vector());
2194
2195    dst->type = brw_type_for_base_type(type);
2196    src->type = dst->type;
2197
2198    dst->writemask = (1 << type->vector_elements) - 1;
2199
2200    src->swizzle = brw_swizzle_for_size(type->vector_elements);
2201
2202    vec4_instruction *inst = emit(MOV(*dst, *src));
2203    inst->predicate = predicate;
2204
2205    dst->reg_offset++;
2206    src->reg_offset++;
2207 }
2208
2209
2210 /* If the RHS processing resulted in an instruction generating a
2211  * temporary value, and it would be easy to rewrite the instruction to
2212  * generate its result right into the LHS instead, do so.  This ends
2213  * up reliably removing instructions where it can be tricky to do so
2214  * later without real UD chain information.
2215  */
2216 bool
2217 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2218                                      dst_reg dst,
2219                                      src_reg src,
2220                                      vec4_instruction *pre_rhs_inst,
2221                                      vec4_instruction *last_rhs_inst)
2222 {
2223    /* This could be supported, but it would take more smarts. */
2224    if (ir->condition)
2225       return false;
2226
2227    if (pre_rhs_inst == last_rhs_inst)
2228       return false; /* No instructions generated to work with. */
2229
2230    /* Make sure the last instruction generated our source reg. */
2231    if (src.file != GRF ||
2232        src.file != last_rhs_inst->dst.file ||
2233        src.reg != last_rhs_inst->dst.reg ||
2234        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2235        src.reladdr ||
2236        src.abs ||
2237        src.negate ||
2238        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2239       return false;
2240
2241    /* Check that that last instruction fully initialized the channels
2242     * we want to use, in the order we want to use them.  We could
2243     * potentially reswizzle the operands of many instructions so that
2244     * we could handle out of order channels, but don't yet.
2245     */
2246
2247    for (unsigned i = 0; i < 4; i++) {
2248       if (dst.writemask & (1 << i)) {
2249          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2250             return false;
2251
2252          if (BRW_GET_SWZ(src.swizzle, i) != i)
2253             return false;
2254       }
2255    }
2256
2257    /* Success!  Rewrite the instruction. */
2258    last_rhs_inst->dst.file = dst.file;
2259    last_rhs_inst->dst.reg = dst.reg;
2260    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2261    last_rhs_inst->dst.reladdr = dst.reladdr;
2262    last_rhs_inst->dst.writemask &= dst.writemask;
2263
2264    return true;
2265 }
2266
2267 void
2268 vec4_visitor::visit(ir_assignment *ir)
2269 {
2270    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2271    enum brw_predicate predicate = BRW_PREDICATE_NONE;
2272
2273    if (!ir->lhs->type->is_scalar() &&
2274        !ir->lhs->type->is_vector()) {
2275       ir->rhs->accept(this);
2276       src_reg src = this->result;
2277
2278       if (ir->condition) {
2279          emit_bool_to_cond_code(ir->condition, &predicate);
2280       }
2281
2282       /* emit_block_move doesn't account for swizzles in the source register.
2283        * This should be ok, since the source register is a structure or an
2284        * array, and those can't be swizzled.  But double-check to be sure.
2285        */
2286       assert(src.swizzle ==
2287              (ir->rhs->type->is_matrix()
2288               ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2289               : BRW_SWIZZLE_NOOP));
2290
2291       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2292       return;
2293    }
2294
2295    /* Now we're down to just a scalar/vector with writemasks. */
2296    int i;
2297
2298    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2299    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2300
2301    ir->rhs->accept(this);
2302
2303    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2304
2305    int swizzles[4];
2306    int src_chan = 0;
2307
2308    assert(ir->lhs->type->is_vector() ||
2309           ir->lhs->type->is_scalar());
2310    dst.writemask = ir->write_mask;
2311
2312    /* Swizzle a small RHS vector into the channels being written.
2313     *
2314     * glsl ir treats write_mask as dictating how many channels are
2315     * present on the RHS while in our instructions we need to make
2316     * those channels appear in the slots of the vec4 they're written to.
2317     */
2318    for (int i = 0; i < 4; i++)
2319       swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2320
2321    src_reg src = swizzle(this->result,
2322                          BRW_SWIZZLE4(swizzles[0], swizzles[1],
2323                                       swizzles[2], swizzles[3]));
2324
2325    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2326       return;
2327    }
2328
2329    if (ir->condition) {
2330       emit_bool_to_cond_code(ir->condition, &predicate);
2331    }
2332
2333    for (i = 0; i < type_size(ir->lhs->type); i++) {
2334       vec4_instruction *inst = emit(MOV(dst, src));
2335       inst->predicate = predicate;
2336
2337       dst.reg_offset++;
2338       src.reg_offset++;
2339    }
2340 }
2341
2342 void
2343 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2344 {
2345    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2346       foreach_in_list(ir_constant, field_value, &ir->components) {
2347          emit_constant_values(dst, field_value);
2348       }
2349       return;
2350    }
2351
2352    if (ir->type->is_array()) {
2353       for (unsigned int i = 0; i < ir->type->length; i++) {
2354          emit_constant_values(dst, ir->array_elements[i]);
2355       }
2356       return;
2357    }
2358
2359    if (ir->type->is_matrix()) {
2360       for (int i = 0; i < ir->type->matrix_columns; i++) {
2361          float *vec = &ir->value.f[i * ir->type->vector_elements];
2362
2363          for (int j = 0; j < ir->type->vector_elements; j++) {
2364             dst->writemask = 1 << j;
2365             dst->type = BRW_REGISTER_TYPE_F;
2366
2367             emit(MOV(*dst, src_reg(vec[j])));
2368          }
2369          dst->reg_offset++;
2370       }
2371       return;
2372    }
2373
2374    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2375
2376    for (int i = 0; i < ir->type->vector_elements; i++) {
2377       if (!(remaining_writemask & (1 << i)))
2378          continue;
2379
2380       dst->writemask = 1 << i;
2381       dst->type = brw_type_for_base_type(ir->type);
2382
2383       /* Find other components that match the one we're about to
2384        * write.  Emits fewer instructions for things like vec4(0.5,
2385        * 1.5, 1.5, 1.5).
2386        */
2387       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2388          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2389             if (ir->value.b[i] == ir->value.b[j])
2390                dst->writemask |= (1 << j);
2391          } else {
2392             /* u, i, and f storage all line up, so no need for a
2393              * switch case for comparing each type.
2394              */
2395             if (ir->value.u[i] == ir->value.u[j])
2396                dst->writemask |= (1 << j);
2397          }
2398       }
2399
2400       switch (ir->type->base_type) {
2401       case GLSL_TYPE_FLOAT:
2402          emit(MOV(*dst, src_reg(ir->value.f[i])));
2403          break;
2404       case GLSL_TYPE_INT:
2405          emit(MOV(*dst, src_reg(ir->value.i[i])));
2406          break;
2407       case GLSL_TYPE_UINT:
2408          emit(MOV(*dst, src_reg(ir->value.u[i])));
2409          break;
2410       case GLSL_TYPE_BOOL:
2411          emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2412          break;
2413       default:
2414          unreachable("Non-float/uint/int/bool constant");
2415       }
2416
2417       remaining_writemask &= ~dst->writemask;
2418    }
2419    dst->reg_offset++;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_constant *ir)
2424 {
2425    dst_reg dst = dst_reg(this, ir->type);
2426    this->result = src_reg(dst);
2427
2428    emit_constant_values(&dst, ir);
2429 }
2430
2431 void
2432 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2433 {
2434    ir_dereference *deref = static_cast<ir_dereference *>(
2435       ir->actual_parameters.get_head());
2436    ir_variable *location = deref->variable_referenced();
2437    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2438                           location->data.binding);
2439
2440    /* Calculate the surface offset */
2441    src_reg offset(this, glsl_type::uint_type);
2442    ir_dereference_array *deref_array = deref->as_dereference_array();
2443    if (deref_array) {
2444       deref_array->array_index->accept(this);
2445
2446       src_reg tmp(this, glsl_type::uint_type);
2447       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2448       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2449    } else {
2450       offset = location->data.atomic.offset;
2451    }
2452
2453    /* Emit the appropriate machine instruction */
2454    const char *callee = ir->callee->function_name();
2455    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2456
2457    if (!strcmp("__intrinsic_atomic_read", callee)) {
2458       emit_untyped_surface_read(surf_index, dst, offset);
2459
2460    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2461       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2462                           src_reg(), src_reg());
2463
2464    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2465       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2466                           src_reg(), src_reg());
2467    }
2468
2469    brw_mark_surface_used(stage_prog_data, surf_index);
2470 }
2471
2472 void
2473 vec4_visitor::visit(ir_call *ir)
2474 {
2475    const char *callee = ir->callee->function_name();
2476
2477    if (!strcmp("__intrinsic_atomic_read", callee) ||
2478        !strcmp("__intrinsic_atomic_increment", callee) ||
2479        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2480       visit_atomic_counter_intrinsic(ir);
2481    } else {
2482       unreachable("Unsupported intrinsic.");
2483    }
2484 }
2485
2486 src_reg
2487 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2488                              src_reg coordinate, src_reg sampler)
2489 {
2490    vec4_instruction *inst =
2491       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2492                                     dst_reg(this, glsl_type::uvec4_type));
2493    inst->base_mrf = 2;
2494    inst->src[1] = sampler;
2495
2496    int param_base;
2497
2498    if (devinfo->gen >= 9) {
2499       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2500       vec4_instruction *header_inst = new(mem_ctx)
2501          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2502                           dst_reg(MRF, inst->base_mrf));
2503
2504       emit(header_inst);
2505
2506       inst->mlen = 2;
2507       inst->header_size = 1;
2508       param_base = inst->base_mrf + 1;
2509    } else {
2510       inst->mlen = 1;
2511       param_base = inst->base_mrf;
2512    }
2513
2514    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2515    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2516    int zero_mask = 0xf & ~coord_mask;
2517
2518    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2519             coordinate));
2520
2521    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2522             src_reg(0)));
2523
2524    emit(inst);
2525    return src_reg(inst->dst);
2526 }
2527
2528 bool
2529 vec4_visitor::is_high_sampler(src_reg sampler)
2530 {
2531    if (devinfo->gen < 8 && !devinfo->is_haswell)
2532       return false;
2533
2534    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2535 }
2536
/**
 * Build and emit a sampler message for texture operation \p op.
 *
 * The message payload is assembled in MRFs starting at base_mrf 2 (after an
 * optional one-register header), the sampler instruction is emitted into a
 * freshly allocated temporary of \p dest_type, and the result is handed to
 * swizzle_result() to produce \p dest.
 *
 * \param op                 GLSL IR texture opcode; selects the SHADER_OPCODE.
 * \param dest               Final destination, passed to swizzle_result().
 * \param dest_type          Type of the temporary the message writes.
 * \param coordinate         Texture coordinate (unused for txs/query_levels).
 * \param coord_components   Number of coordinate channels to copy; the
 *                           remaining channels are zero-filled.
 * \param shadow_comparitor  Shadow reference value, or BAD_FILE if none.
 * \param lod, lod2          LOD for tex/txl/txf/txs/query_levels; the two
 *                           gradient operands for txd.
 * \param sample_index       Sample index for txf_ms.
 * \param constant_offset    Stored in inst->offset; nonzero forces a header.
 * \param offset_value       Offset operand for tg4, or BAD_FILE if none.
 * \param mcs                MCS value for txf_ms (used on gen >= 7).
 * \param is_cube_array      Enables the txs layer-count fixup.
 * \param sampler            Sampler index used for the gen6 gather
 *                           workaround lookup and by swizzle_result().
 * \param sampler_reg        Sampler operand stored in src[1] of the message.
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparitor,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           bool is_cube_array,
                           uint32_t sampler,
                           src_reg sampler_reg)
{
   /* Translate the IR opcode.  Note that ir_tex is sent as TXL, and that
    * ir_query_levels shares TXS with ir_txs.
    */
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   default:
      unreachable("Unrecognized tex op");
   }

   /* The instruction is created now but only emitted after all of the
    * payload MOVs below, so that they precede it in the instruction stream.
    */
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
      opcode, dst_reg(this, dest_type));

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size + 1; /* always at least one */
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;

   inst->src[1] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      /* Size queries only take an LOD: in .w on gen4, in .x otherwise. */
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));

      /* Zero whatever channels of the first parameter register the
       * coordinate did not cover.
       */
      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor.  txd and tg4-with-offset place it
       * elsewhere in the payload and handle it in their own branches below.
       */
      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparitor.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         lod.swizzle = BRW_SWIZZLE_XXXX;
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         /* txf carries the LOD in .w of the coordinate register. */
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            /* Interleave the two gradients: lod.x/lod.y go to .x/.z and
             * lod2.x/lod2.y go to .y/.w of the second parameter register.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            /* NOTE(review): this tests dest_type->vector_elements, not
             * coord_components, to decide whether the .z gradients are
             * sent -- confirm that is the intended condition.
             */
            if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparitor.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparitor.type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         /* Gather with an offset operand: the comparitor (if any) goes in
          * .w of the coordinate register, the offset in the next register.
          */
         if (shadow_comparitor.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && is_cube_array) {
      emit_math(SHADER_OPCODE_INT_QUOTIENT,
                writemask(inst->dst, WRITEMASK_Z),
                src_reg(inst->dst), src_reg(6));
   }

   /* Gen6 gathers get a per-sampler workaround selected by the program key. */
   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
   }

   swizzle_result(op, dest,
                  src_reg(inst->dst), sampler, dest_type);
}
2713
2714 void
2715 vec4_visitor::visit(ir_texture *ir)
2716 {
2717    uint32_t sampler =
2718       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2719
2720    ir_rvalue *nonconst_sampler_index =
2721       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2722
2723    /* Handle non-constant sampler array indexing */
2724    src_reg sampler_reg;
2725    if (nonconst_sampler_index) {
2726       /* The highest sampler which may be used by this operation is
2727        * the last element of the array. Mark it here, because the generator
2728        * doesn't have enough information to determine the bound.
2729        */
2730       uint32_t array_size = ir->sampler->as_dereference_array()
2731          ->array->type->array_size();
2732
2733       uint32_t max_used = sampler + array_size - 1;
2734       if (ir->op == ir_tg4 && devinfo->gen < 8) {
2735          max_used += prog_data->base.binding_table.gather_texture_start;
2736       } else {
2737          max_used += prog_data->base.binding_table.texture_start;
2738       }
2739
2740       brw_mark_surface_used(&prog_data->base, max_used);
2741
2742       /* Emit code to evaluate the actual indexing expression */
2743       nonconst_sampler_index->accept(this);
2744       src_reg temp(this, glsl_type::uint_type);
2745       emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2746       sampler_reg = emit_uniformize(temp);
2747    } else {
2748       /* Single sampler, or constant array index; the indexing expression
2749        * is just an immediate.
2750        */
2751       sampler_reg = src_reg(sampler);
2752    }
2753
2754    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2755     * emitting anything other than setting up the constant result.
2756     */
2757    if (ir->op == ir_tg4) {
2758       ir_constant *chan = ir->lod_info.component->as_constant();
2759       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2760       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2761          dst_reg result(this, ir->type);
2762          this->result = src_reg(result);
2763          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2764          return;
2765       }
2766    }
2767
2768    /* Should be lowered by do_lower_texture_projection */
2769    assert(!ir->projector);
2770
2771    /* Should be lowered */
2772    assert(!ir->offset || !ir->offset->type->is_array());
2773
2774    /* Generate code to compute all the subexpression trees.  This has to be
2775     * done before loading any values into MRFs for the sampler message since
2776     * generating these values may involve SEND messages that need the MRFs.
2777     */
2778    src_reg coordinate;
2779    int coord_components = 0;
2780    if (ir->coordinate) {
2781       coord_components = ir->coordinate->type->vector_elements;
2782       ir->coordinate->accept(this);
2783       coordinate = this->result;
2784    }
2785
2786    src_reg shadow_comparitor;
2787    if (ir->shadow_comparitor) {
2788       ir->shadow_comparitor->accept(this);
2789       shadow_comparitor = this->result;
2790    }
2791
2792    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2793    src_reg offset_value;
2794    if (has_nonconstant_offset) {
2795       ir->offset->accept(this);
2796       offset_value = src_reg(this->result);
2797    }
2798
2799    src_reg lod, lod2, sample_index, mcs;
2800    switch (ir->op) {
2801    case ir_tex:
2802       lod = src_reg(0.0f);
2803       break;
2804    case ir_txf:
2805    case ir_txl:
2806    case ir_txs:
2807       ir->lod_info.lod->accept(this);
2808       lod = this->result;
2809       break;
2810    case ir_query_levels:
2811       lod = src_reg(0);
2812       break;
2813    case ir_txf_ms:
2814       ir->lod_info.sample_index->accept(this);
2815       sample_index = this->result;
2816
2817       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2818          mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2819       else
2820          mcs = src_reg(0u);
2821       break;
2822    case ir_txd:
2823       ir->lod_info.grad.dPdx->accept(this);
2824       lod = this->result;
2825
2826       ir->lod_info.grad.dPdy->accept(this);
2827       lod2 = this->result;
2828       break;
2829    case ir_txb:
2830    case ir_lod:
2831    case ir_tg4:
2832       break;
2833    }
2834
2835    uint32_t constant_offset = 0;
2836    if (ir->offset != NULL && !has_nonconstant_offset) {
2837       constant_offset  =
2838          brw_texture_offset(ir->offset->as_constant()->value.i,
2839                             ir->offset->type->vector_elements);
2840    }
2841
2842    /* Stuff the channel select bits in the top of the texture offset */
2843    if (ir->op == ir_tg4)
2844       constant_offset |=
2845          gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2846                          sampler) << 16;
2847
2848    glsl_type const *type = ir->sampler->type;
2849    bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2850       type->sampler_array;
2851
2852    this->result = src_reg(this, ir->type);
2853    dst_reg dest = dst_reg(this->result);
2854
2855    emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2856                 shadow_comparitor,
2857                 lod, lod2, sample_index,
2858                 constant_offset, offset_value,
2859                 mcs, is_cube_array, sampler, sampler_reg);
2860 }
2861
2862 /**
2863  * Apply workarounds for Gen6 gather with UINT/SINT
2864  */
2865 void
2866 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2867 {
2868    if (!wa)
2869       return;
2870
2871    int width = (wa & WA_8BIT) ? 8 : 16;
2872    dst_reg dst_f = dst;
2873    dst_f.type = BRW_REGISTER_TYPE_F;
2874
2875    /* Convert from UNORM to UINT */
2876    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2877    emit(MOV(dst, src_reg(dst_f)));
2878
2879    if (wa & WA_SIGN) {
2880       /* Reinterpret the UINT value as a signed INT value by
2881        * shifting the sign bit into place, then shifting back
2882        * preserving sign.
2883        */
2884       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2885       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2886    }
2887 }
2888
2889 /**
2890  * Set up the gather channel based on the swizzle, for gather4.
2891  */
2892 uint32_t
2893 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2894 {
2895    int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2896    switch (swiz) {
2897       case SWIZZLE_X: return 0;
2898       case SWIZZLE_Y:
2899          /* gather4 sampler is broken for green channel on RG32F --
2900           * we must ask for blue instead.
2901           */
2902          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2903             return 2;
2904          return 1;
2905       case SWIZZLE_Z: return 2;
2906       case SWIZZLE_W: return 3;
2907       default:
2908          unreachable("Not reached"); /* zero, one swizzles handled already */
2909    }
2910 }
2911
2912 void
2913 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2914                              src_reg orig_val, uint32_t sampler,
2915                              const glsl_type *dest_type)
2916 {
2917    int s = key->tex.swizzles[sampler];
2918
2919    dst_reg swizzled_result = dest;
2920
2921    if (op == ir_query_levels) {
2922       /* # levels is in .w */
2923       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2924       emit(MOV(swizzled_result, orig_val));
2925       return;
2926    }
2927
2928    if (op == ir_txs || dest_type == glsl_type::float_type
2929                         || s == SWIZZLE_NOOP || op == ir_tg4) {
2930       emit(MOV(swizzled_result, orig_val));
2931       return;
2932    }
2933
2934
2935    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2936    int swizzle[4] = {0};
2937
2938    for (int i = 0; i < 4; i++) {
2939       switch (GET_SWZ(s, i)) {
2940       case SWIZZLE_ZERO:
2941          zero_mask |= (1 << i);
2942          break;
2943       case SWIZZLE_ONE:
2944          one_mask |= (1 << i);
2945          break;
2946       default:
2947          copy_mask |= (1 << i);
2948          swizzle[i] = GET_SWZ(s, i);
2949          break;
2950       }
2951    }
2952
2953    if (copy_mask) {
2954       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2955       swizzled_result.writemask = copy_mask;
2956       emit(MOV(swizzled_result, orig_val));
2957    }
2958
2959    if (zero_mask) {
2960       swizzled_result.writemask = zero_mask;
2961       emit(MOV(swizzled_result, src_reg(0.0f)));
2962    }
2963
2964    if (one_mask) {
2965       swizzled_result.writemask = one_mask;
2966       emit(MOV(swizzled_result, src_reg(1.0f)));
2967    }
2968 }
2969
2970 void
2971 vec4_visitor::visit(ir_return *)
2972 {
2973    unreachable("not reached");
2974 }
2975
2976 void
2977 vec4_visitor::visit(ir_discard *)
2978 {
2979    unreachable("not reached");
2980 }
2981
2982 void
2983 vec4_visitor::visit(ir_if *ir)
2984 {
2985    /* Don't point the annotation at the if statement, because then it plus
2986     * the then and else blocks get printed.
2987     */
2988    this->base_ir = ir->condition;
2989
2990    if (devinfo->gen == 6) {
2991       emit_if_gen6(ir);
2992    } else {
2993       enum brw_predicate predicate;
2994       emit_bool_to_cond_code(ir->condition, &predicate);
2995       emit(IF(predicate));
2996    }
2997
2998    visit_instructions(&ir->then_instructions);
2999
3000    if (!ir->else_instructions.is_empty()) {
3001       this->base_ir = ir->condition;
3002       emit(BRW_OPCODE_ELSE);
3003
3004       visit_instructions(&ir->else_instructions);
3005    }
3006
3007    this->base_ir = ir->condition;
3008    emit(BRW_OPCODE_ENDIF);
3009 }
3010
3011 void
3012 vec4_visitor::gs_emit_vertex(int stream_id)
3013 {
3014    unreachable("not reached");
3015 }
3016
3017 void
3018 vec4_visitor::visit(ir_emit_vertex *)
3019 {
3020    unreachable("not reached");
3021 }
3022
3023 void
3024 vec4_visitor::gs_end_primitive()
3025 {
3026    unreachable("not reached");
3027 }
3028
3029
3030 void
3031 vec4_visitor::visit(ir_end_primitive *)
3032 {
3033    unreachable("not reached");
3034 }
3035
3036 void
3037 vec4_visitor::visit(ir_barrier *)
3038 {
3039    unreachable("not reached");
3040 }
3041
3042 void
3043 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3044                                   dst_reg dst, src_reg offset,
3045                                   src_reg src0, src_reg src1)
3046 {
3047    unsigned mlen = 0;
3048
3049    /* Set the atomic operation offset. */
3050    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3051    mlen++;
3052
3053    /* Set the atomic operation arguments. */
3054    if (src0.file != BAD_FILE) {
3055       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3056       mlen++;
3057    }
3058
3059    if (src1.file != BAD_FILE) {
3060       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3061       mlen++;
3062    }
3063
3064    /* Emit the instruction.  Note that this maps to the normal SIMD8
3065     * untyped atomic message on Ivy Bridge, but that's OK because
3066     * unused channels will be masked out.
3067     */
3068    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3069                                  brw_message_reg(0),
3070                                  src_reg(surf_index), src_reg(atomic_op));
3071    inst->mlen = mlen;
3072 }
3073
3074 void
3075 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3076                                         src_reg offset)
3077 {
3078    /* Set the surface read offset. */
3079    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3080
3081    /* Emit the instruction.  Note that this maps to the normal SIMD8
3082     * untyped surface read message, but that's OK because unused
3083     * channels will be masked out.
3084     */
3085    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3086                                  brw_message_reg(0),
3087                                  src_reg(surf_index), src_reg(1));
3088    inst->mlen = 1;
3089 }
3090
3091 void
3092 vec4_visitor::emit_ndc_computation()
3093 {
3094    /* Get the position */
3095    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3096
3097    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3098    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3099    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3100
3101    current_annotation = "NDC";
3102    dst_reg ndc_w = ndc;
3103    ndc_w.writemask = WRITEMASK_W;
3104    src_reg pos_w = pos;
3105    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3106    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3107
3108    dst_reg ndc_xyz = ndc;
3109    ndc_xyz.writemask = WRITEMASK_XYZ;
3110
3111    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3112 }
3113
3114 void
3115 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3116 {
3117    if (devinfo->gen < 6 &&
3118        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3119         key->userclip_active || devinfo->has_negative_rhw_bug)) {
3120       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3121       dst_reg header1_w = header1;
3122       header1_w.writemask = WRITEMASK_W;
3123
3124       emit(MOV(header1, 0u));
3125
3126       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3127          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3128
3129          current_annotation = "Point size";
3130          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3131          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3132       }
3133
3134       if (key->userclip_active) {
3135          current_annotation = "Clipping flags";
3136          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3137          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3138
3139          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3140          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3141          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3142
3143          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3144          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3145          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3146          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3147       }
3148
3149       /* i965 clipping workaround:
3150        * 1) Test for -ve rhw
3151        * 2) If set,
3152        *      set ndc = (0,0,0,0)
3153        *      set ucp[6] = 1
3154        *
3155        * Later, clipping will detect ucp[6] and ensure the primitive is
3156        * clipped against all fixed planes.
3157        */
3158       if (devinfo->has_negative_rhw_bug) {
3159          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3160          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3161          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3162          vec4_instruction *inst;
3163          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3164          inst->predicate = BRW_PREDICATE_NORMAL;
3165          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3166          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3167          inst->predicate = BRW_PREDICATE_NORMAL;
3168       }
3169
3170       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3171    } else if (devinfo->gen < 6) {
3172       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3173    } else {
3174       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3175       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3176          dst_reg reg_w = reg;
3177          reg_w.writemask = WRITEMASK_W;
3178          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3179          reg_as_src.type = reg_w.type;
3180          reg_as_src.swizzle = brw_swizzle_for_size(1);
3181          emit(MOV(reg_w, reg_as_src));
3182       }
3183       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3184          dst_reg reg_y = reg;
3185          reg_y.writemask = WRITEMASK_Y;
3186          reg_y.type = BRW_REGISTER_TYPE_D;
3187          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3188          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3189       }
3190       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3191          dst_reg reg_z = reg;
3192          reg_z.writemask = WRITEMASK_Z;
3193          reg_z.type = BRW_REGISTER_TYPE_D;
3194          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3195          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3196       }
3197    }
3198 }
3199
3200 void
3201 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3202 {
3203    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3204     *
3205     *     "If a linked set of shaders forming the vertex stage contains no
3206     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3207     *     application has requested clipping against user clip planes through
3208     *     the API, then the coordinate written to gl_Position is used for
3209     *     comparison against the user clip planes."
3210     *
3211     * This function is only called if the shader didn't write to
3212     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3213     * if the user wrote to it; otherwise we use gl_Position.
3214     */
3215    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3216    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3217       clip_vertex = VARYING_SLOT_POS;
3218    }
3219
3220    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3221         ++i) {
3222       reg.writemask = 1 << i;
3223       emit(DP4(reg,
3224                src_reg(output_reg[clip_vertex]),
3225                src_reg(this->userplane[i + offset])));
3226    }
3227 }
3228
3229 vec4_instruction *
3230 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3231 {
3232    assert(varying < VARYING_SLOT_MAX);
3233    assert(output_reg[varying].type == reg.type);
3234    current_annotation = output_reg_annotation[varying];
3235    /* Copy the register, saturating if necessary */
3236    return emit(MOV(reg, src_reg(output_reg[varying])));
3237 }
3238
3239 void
3240 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3241 {
3242    reg.type = BRW_REGISTER_TYPE_F;
3243    output_reg[varying].type = reg.type;
3244
3245    switch (varying) {
3246    case VARYING_SLOT_PSIZ:
3247    {
3248       /* PSIZ is always in slot 0, and is coupled with other flags. */
3249       current_annotation = "indices, point width, clip flags";
3250       emit_psiz_and_flags(reg);
3251       break;
3252    }
3253    case BRW_VARYING_SLOT_NDC:
3254       current_annotation = "NDC";
3255       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3256       break;
3257    case VARYING_SLOT_POS:
3258       current_annotation = "gl_Position";
3259       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3260       break;
3261    case VARYING_SLOT_EDGE:
3262       /* This is present when doing unfilled polygons.  We're supposed to copy
3263        * the edge flag from the user-provided vertex array
3264        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3265        * of that attribute (starts as 1.0f).  This is then used in clipping to
3266        * determine which edges should be drawn as wireframe.
3267        */
3268       current_annotation = "edge flag";
3269       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3270                                     glsl_type::float_type, WRITEMASK_XYZW))));
3271       break;
3272    case BRW_VARYING_SLOT_PAD:
3273       /* No need to write to this slot */
3274       break;
3275    case VARYING_SLOT_COL0:
3276    case VARYING_SLOT_COL1:
3277    case VARYING_SLOT_BFC0:
3278    case VARYING_SLOT_BFC1: {
3279       /* These built-in varyings are only supported in compatibility mode,
3280        * and we only support GS in core profile.  So, this must be a vertex
3281        * shader.
3282        */
3283       assert(stage == MESA_SHADER_VERTEX);
3284       vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3285       if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3286          inst->saturate = true;
3287       break;
3288    }
3289
3290    default:
3291       emit_generic_urb_slot(reg, varying);
3292       break;
3293    }
3294 }
3295
3296 static int
3297 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3298 {
3299    if (devinfo->gen >= 6) {
3300       /* URB data written (does not include the message header reg) must
3301        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
3302        * section 5.4.3.2.2: URB_INTERLEAVED.
3303        *
3304        * URB entries are allocated on a multiple of 1024 bits, so an
3305        * extra 128 bits written here to make the end align to 256 is
3306        * no problem.
3307        */
3308       if ((mlen % 2) != 1)
3309          mlen++;
3310    }
3311
3312    return mlen;
3313 }
3314
3315
3316 /**
3317  * Generates the VUE payload plus the necessary URB write instructions to
3318  * output it.
3319  *
3320  * The VUE layout is documented in Volume 2a.
3321  */
3322 void
3323 vec4_visitor::emit_vertex()
3324 {
3325    /* MRF 0 is reserved for the debugger, so start with message header
3326     * in MRF 1.
3327     */
3328    int base_mrf = 1;
3329    int mrf = base_mrf;
3330    /* In the process of generating our URB write message contents, we
3331     * may need to unspill a register or load from an array.  Those
3332     * reads would use MRFs 14-15.
3333     */
3334    int max_usable_mrf = 13;
3335
3336    /* The following assertion verifies that max_usable_mrf causes an
3337     * even-numbered amount of URB write data, which will meet gen6's
3338     * requirements for length alignment.
3339     */
3340    assert ((max_usable_mrf - base_mrf) % 2 == 0);
3341
3342    /* First mrf is the g0-based message header containing URB handles and
3343     * such.
3344     */
3345    emit_urb_write_header(mrf++);
3346
3347    if (devinfo->gen < 6) {
3348       emit_ndc_computation();
3349    }
3350
3351    /* Lower legacy ff and ClipVertex clipping to clip distances */
3352    if (key->userclip_active && !prog->UsesClipDistanceOut) {
3353       current_annotation = "user clip distances";
3354
3355       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3356       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3357
3358       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3359       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3360    }
3361
3362    /* We may need to split this up into several URB writes, so do them in a
3363     * loop.
3364     */
3365    int slot = 0;
3366    bool complete = false;
3367    do {
3368       /* URB offset is in URB row increments, and each of our MRFs is half of
3369        * one of those, since we're doing interleaved writes.
3370        */
3371       int offset = slot / 2;
3372
3373       mrf = base_mrf + 1;
3374       for (; slot < prog_data->vue_map.num_slots; ++slot) {
3375          emit_urb_slot(dst_reg(MRF, mrf++),
3376                        prog_data->vue_map.slot_to_varying[slot]);
3377
3378          /* If this was max_usable_mrf, we can't fit anything more into this
3379           * URB WRITE.
3380           */
3381          if (mrf > max_usable_mrf) {
3382             slot++;
3383             break;
3384          }
3385       }
3386
3387       complete = slot >= prog_data->vue_map.num_slots;
3388       current_annotation = "URB write";
3389       vec4_instruction *inst = emit_urb_write_opcode(complete);
3390       inst->base_mrf = base_mrf;
3391       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3392       inst->offset += offset;
3393    } while(!complete);
3394 }
3395
3396
3397 src_reg
3398 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3399                                  src_reg *reladdr, int reg_offset)
3400 {
3401    /* Because we store the values to scratch interleaved like our
3402     * vertex data, we need to scale the vec4 index by 2.
3403     */
3404    int message_header_scale = 2;
3405
3406    /* Pre-gen6, the message header uses byte offsets instead of vec4
3407     * (16-byte) offset units.
3408     */
3409    if (devinfo->gen < 6)
3410       message_header_scale *= 16;
3411
3412    if (reladdr) {
3413       src_reg index = src_reg(this, glsl_type::int_type);
3414
3415       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3416                                    src_reg(reg_offset)));
3417       emit_before(block, inst, MUL(dst_reg(index), index,
3418                                    src_reg(message_header_scale)));
3419
3420       return index;
3421    } else {
3422       return src_reg(reg_offset * message_header_scale);
3423    }
3424 }
3425
3426 src_reg
3427 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3428                                        src_reg *reladdr, int reg_offset)
3429 {
3430    if (reladdr) {
3431       src_reg index = src_reg(this, glsl_type::int_type);
3432
3433       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3434                                    src_reg(reg_offset)));
3435
3436       /* Pre-gen6, the message header uses byte offsets instead of vec4
3437        * (16-byte) offset units.
3438        */
3439       if (devinfo->gen < 6) {
3440          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3441       }
3442
3443       return index;
3444    } else if (devinfo->gen >= 8) {
3445       /* Store the offset in a GRF so we can send-from-GRF. */
3446       src_reg offset = src_reg(this, glsl_type::int_type);
3447       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3448       return offset;
3449    } else {
3450       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3451       return src_reg(reg_offset * message_header_scale);
3452    }
3453 }
3454
3455 /**
3456  * Emits an instruction before @inst to load the value named by @orig_src
3457  * from scratch space at @base_offset to @temp.
3458  *
3459  * @base_offset is measured in 32-byte units (the size of a register).
3460  */
3461 void
3462 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3463                                 dst_reg temp, src_reg orig_src,
3464                                 int base_offset)
3465 {
3466    int reg_offset = base_offset + orig_src.reg_offset;
3467    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3468                                       reg_offset);
3469
3470    emit_before(block, inst, SCRATCH_READ(temp, index));
3471 }
3472
3473 /**
3474  * Emits an instruction after @inst to store the value to be written
3475  * to @orig_dst to scratch space at @base_offset, from @temp.
3476  *
3477  * @base_offset is measured in 32-byte units (the size of a register).
3478  */
3479 void
3480 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3481                                  int base_offset)
3482 {
3483    int reg_offset = base_offset + inst->dst.reg_offset;
3484    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3485                                       reg_offset);
3486
3487    /* Create a temporary register to store *inst's result in.
3488     *
3489     * We have to be careful in MOVing from our temporary result register in
3490     * the scratch write.  If we swizzle from channels of the temporary that
3491     * weren't initialized, it will confuse live interval analysis, which will
3492     * make spilling fail to make progress.
3493     */
3494    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3495                                        inst->dst.type),
3496                                 brw_swizzle_for_mask(inst->dst.writemask));
3497    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3498                                        inst->dst.writemask));
3499    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3500    if (inst->opcode != BRW_OPCODE_SEL)
3501       write->predicate = inst->predicate;
3502    write->ir = inst->ir;
3503    write->annotation = inst->annotation;
3504    inst->insert_after(block, write);
3505
3506    inst->dst.file = temp.file;
3507    inst->dst.reg = temp.reg;
3508    inst->dst.reg_offset = temp.reg_offset;
3509    inst->dst.reladdr = NULL;
3510 }
3511
3512 /**
3513  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3514  * adds the scratch read(s) before \p inst. The function also checks for
3515  * recursive reladdr scratch accesses, issuing the corresponding scratch
3516  * loads and rewriting reladdr references accordingly.
3517  *
3518  * \return \p src if it did not require a scratch load, otherwise, the
3519  * register holding the result of the scratch load that the caller should
3520  * use to rewrite src.
3521  */
3522 src_reg
3523 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3524                                    vec4_instruction *inst, src_reg src)
3525 {
3526    /* Resolve recursive reladdr scratch access by calling ourselves
3527     * with src.reladdr
3528     */
3529    if (src.reladdr)
3530       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3531                                           *src.reladdr);
3532
3533    /* Now handle scratch access on src */
3534    if (src.file == GRF && scratch_loc[src.reg] != -1) {
3535       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3536       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3537       src.reg = temp.reg;
3538       src.reg_offset = temp.reg_offset;
3539       src.reladdr = NULL;
3540    }
3541
3542    return src;
3543 }
3544
3545 /**
3546  * We can't generally support array access in GRF space, because a
3547  * single instruction's destination can only span 2 contiguous
3548  * registers.  So, we send all GRF arrays that get variable index
3549  * access to scratch space.
3550  */
3551 void
3552 vec4_visitor::move_grf_array_access_to_scratch()
3553 {
3554    int scratch_loc[this->alloc.count];
3555    memset(scratch_loc, -1, sizeof(scratch_loc));
3556
3557    /* First, calculate the set of virtual GRFs that need to be punted
3558     * to scratch due to having any array access on them, and where in
3559     * scratch.
3560     */
3561    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3562       if (inst->dst.file == GRF && inst->dst.reladdr) {
3563          if (scratch_loc[inst->dst.reg] == -1) {
3564             scratch_loc[inst->dst.reg] = last_scratch;
3565             last_scratch += this->alloc.sizes[inst->dst.reg];
3566          }
3567
3568          for (src_reg *iter = inst->dst.reladdr;
3569               iter->reladdr;
3570               iter = iter->reladdr) {
3571             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3572                scratch_loc[iter->reg] = last_scratch;
3573                last_scratch += this->alloc.sizes[iter->reg];
3574             }
3575          }
3576       }
3577
3578       for (int i = 0 ; i < 3; i++) {
3579          for (src_reg *iter = &inst->src[i];
3580               iter->reladdr;
3581               iter = iter->reladdr) {
3582             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3583                scratch_loc[iter->reg] = last_scratch;
3584                last_scratch += this->alloc.sizes[iter->reg];
3585             }
3586          }
3587       }
3588    }
3589
3590    /* Now, for anything that will be accessed through scratch, rewrite
3591     * it to load/store.  Note that this is a _safe list walk, because
3592     * we may generate a new scratch_write instruction after the one
3593     * we're processing.
3594     */
3595    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3596       /* Set up the annotation tracking for new generated instructions. */
3597       base_ir = inst->ir;
3598       current_annotation = inst->annotation;
3599
3600       /* First handle scratch access on the dst. Notice we have to handle
3601        * the case where the dst's reladdr also points to scratch space.
3602        */
3603       if (inst->dst.reladdr)
3604          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3605                                                    *inst->dst.reladdr);
3606
3607       /* Now that we have handled any (possibly recursive) reladdr scratch
3608        * accesses for dst we can safely do the scratch write for dst itself
3609        */
3610       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3611          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3612
3613       /* Now handle scratch access on any src. In this case, since inst->src[i]
3614        * already is a src_reg, we can just call emit_resolve_reladdr with
3615        * inst->src[i] and it will take care of handling scratch loads for
3616        * both src and src.reladdr (recursively).
3617        */
3618       for (int i = 0 ; i < 3; i++) {
3619          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3620                                              inst->src[i]);
3621       }
3622    }
3623 }
3624
3625 /**
3626  * Emits an instruction before @inst to load the value named by @orig_src
3627  * from the pull constant buffer (surface) at @base_offset to @temp.
3628  */
3629 void
3630 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3631                                       dst_reg temp, src_reg orig_src,
3632                                       int base_offset)
3633 {
3634    int reg_offset = base_offset + orig_src.reg_offset;
3635    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3636    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3637                                              reg_offset);
3638
3639    emit_pull_constant_load_reg(temp,
3640                                index,
3641                                offset,
3642                                block, inst);
3643 }
3644
3645 /**
3646  * Implements array access of uniforms by inserting a
3647  * PULL_CONSTANT_LOAD instruction.
3648  *
3649  * Unlike temporary GRF array access (where we don't support it due to
3650  * the difficulty of doing relative addressing on instruction
3651  * destinations), we could potentially do array access of uniforms
3652  * that were loaded in GRF space as push constants.  In real-world
3653  * usage we've seen, though, the arrays being used are always larger
3654  * than we could load as push constants, so just always move all
3655  * uniform array access out to a pull constant buffer.
3656  */
3657 void
3658 vec4_visitor::move_uniform_array_access_to_pull_constants()
3659 {
3660    int pull_constant_loc[this->uniforms];
3661    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3662    bool nested_reladdr;
3663
3664    /* Walk through and find array access of uniforms.  Put a copy of that
3665     * uniform in the pull constant buffer.
3666     *
3667     * Note that we don't move constant-indexed accesses to arrays.  No
3668     * testing has been done of the performance impact of this choice.
3669     */
3670    do {
3671       nested_reladdr = false;
3672
3673       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3674          for (int i = 0 ; i < 3; i++) {
3675             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3676                continue;
3677
3678             int uniform = inst->src[i].reg;
3679
3680             if (inst->src[i].reladdr->reladdr)
3681                nested_reladdr = true;  /* will need another pass */
3682
3683             /* If this array isn't already present in the pull constant buffer,
3684              * add it.
3685              */
3686             if (pull_constant_loc[uniform] == -1) {
3687                const gl_constant_value **values =
3688                   &stage_prog_data->param[uniform * 4];
3689
3690                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3691
3692                assert(uniform < uniform_array_size);
3693                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3694                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3695                      = values[j];
3696                }
3697             }
3698
3699             /* Set up the annotation tracking for new generated instructions. */
3700             base_ir = inst->ir;
3701             current_annotation = inst->annotation;
3702
3703             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3704
3705             emit_pull_constant_load(block, inst, temp, inst->src[i],
3706                                     pull_constant_loc[uniform]);
3707
3708             inst->src[i].file = temp.file;
3709             inst->src[i].reg = temp.reg;
3710             inst->src[i].reg_offset = temp.reg_offset;
3711             inst->src[i].reladdr = NULL;
3712          }
3713       }
3714    } while (nested_reladdr);
3715
3716    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3717     * no need to track them as larger-than-vec4 objects.  This will be
3718     * relied on in cutting out unused uniform vectors from push
3719     * constants.
3720     */
3721    split_uniform_registers();
3722 }
3723
3724 void
3725 vec4_visitor::resolve_ud_negate(src_reg *reg)
3726 {
3727    if (reg->type != BRW_REGISTER_TYPE_UD ||
3728        !reg->negate)
3729       return;
3730
3731    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3732    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3733    *reg = temp;
3734 }
3735
3736 /**
3737  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3738  *
3739  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3740  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3741  */
3742 void
3743 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3744 {
3745    assert(devinfo->gen <= 5);
3746
3747    if (!rvalue->type->is_boolean())
3748       return;
3749
3750    src_reg and_result = src_reg(this, rvalue->type);
3751    src_reg neg_result = src_reg(this, rvalue->type);
3752    emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3753    emit(MOV(dst_reg(neg_result), negate(and_result)));
3754    *reg = neg_result;
3755 }
3756
3757 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3758                            void *log_data,
3759                            struct gl_program *prog,
3760                            const struct brw_vue_prog_key *key,
3761                            struct brw_vue_prog_data *prog_data,
3762                            struct gl_shader_program *shader_prog,
3763                            gl_shader_stage stage,
3764                            void *mem_ctx,
3765                            bool no_spills,
3766                            int shader_time_index)
3767    : backend_shader(compiler, log_data, mem_ctx,
3768                     shader_prog, prog, &prog_data->base, stage),
3769      key(key),
3770      prog_data(prog_data),
3771      sanity_param_count(0),
3772      fail_msg(NULL),
3773      first_non_payload_grf(0),
3774      need_all_constants_in_pull_buffer(false),
3775      no_spills(no_spills),
3776      shader_time_index(shader_time_index),
3777      last_scratch(0)
3778 {
3779    this->failed = false;
3780
3781    this->base_ir = NULL;
3782    this->current_annotation = NULL;
3783    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3784
3785    this->variable_ht = hash_table_ctor(0,
3786                                        hash_table_pointer_hash,
3787                                        hash_table_pointer_compare);
3788
3789    this->virtual_grf_start = NULL;
3790    this->virtual_grf_end = NULL;
3791    this->live_intervals = NULL;
3792
3793    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3794
3795    this->uniforms = 0;
3796
3797    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3798     * at least one. See setup_uniforms() in brw_vec4.cpp.
3799     */
3800    this->uniform_array_size = 1;
3801    if (prog_data) {
3802       this->uniform_array_size =
3803          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3804    }
3805
3806    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3807    this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3808 }
3809
3810 vec4_visitor::~vec4_visitor()
3811 {
3812    hash_table_dtor(this->variable_ht);
3813 }
3814
3815
3816 void
3817 vec4_visitor::fail(const char *format, ...)
3818 {
3819    va_list va;
3820    char *msg;
3821
3822    if (failed)
3823       return;
3824
3825    failed = true;
3826
3827    va_start(va, format);
3828    msg = ralloc_vasprintf(mem_ctx, format, va);
3829    va_end(va);
3830    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3831
3832    this->fail_msg = msg;
3833
3834    if (debug_enabled) {
3835       fprintf(stderr, "%s",  msg);
3836    }
3837 }
3838
3839 } /* namespace brw */