src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = 14;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = 13;
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 void
 280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 281 {
 282    static enum opcode dot_opcodes[] = {
 283       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 284    };
 285
 286    emit(dot_opcodes[elements - 2], dst, src0, src1);
 287 }
 288
 289 src_reg
 290 vec4_visitor::fix_3src_operand(const src_reg &src)
 291 {
 292    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 293     * able to use vertical stride of zero to replicate the vec4 uniform, like
 294     *
 295     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 296     *
 297     * But you can't, since vertical stride is always four in three-source
 298     * instructions. Instead, insert a MOV instruction to do the replication so
 299     * that the three-source instruction can consume it.
 300     */
 301
 302    /* The MOV is only needed if the source is a uniform or immediate. */
 303    if (src.file != UNIFORM && src.file != IMM)
 304       return src;
 305
 306    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 307       return src;
 308
 309    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 310    expanded.type = src.type;
 311    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 312    return src_reg(expanded);
 313 }
 314
 315 src_reg
 316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 317 {
 318    if (!src.abs && !src.negate)
 319       return src;
 320
 321    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 322    resolved.type = src.type;
 323    emit(MOV(resolved, src));
 324
 325    return src_reg(resolved);
 326 }
 327
 328 src_reg
 329 vec4_visitor::fix_math_operand(const src_reg &src)
 330 {
 331    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 332       return src;
 333
 334    /* The gen6 math instruction ignores the source modifiers --
 335     * swizzle, abs, negate, and at least some parts of the register
 336     * region description.
 337     *
 338     * Rather than trying to enumerate all these cases, *always* expand the
 339     * operand to a temp GRF for gen6.
 340     *
 341     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 342     * can't use.
 343     */
 344
 345    if (devinfo->gen == 7 && src.file != IMM)
 346       return src;
 347
 348    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 349    expanded.type = src.type;
 350    emit(MOV(expanded, src));
 351    return src_reg(expanded);
 352 }
 353
 354 vec4_instruction *
 355 vec4_visitor::emit_math(enum opcode opcode,
 356                         const dst_reg &dst,
 357                         const src_reg &src0, const src_reg &src1)
 358 {
 359    vec4_instruction *math =
 360       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 361
 362    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 363       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 364       math->dst = dst_reg(this, glsl_type::vec4_type);
 365       math->dst.type = dst.type;
 366       math = emit(MOV(dst, src_reg(math->dst)));
 367    } else if (devinfo->gen < 6) {
 368       math->base_mrf = 1;
 369       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 370    }
 371
 372    return math;
 373 }
 374
 375 void
 376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 377 {
 378    if (devinfo->gen < 7) {
 379       unreachable("ir_unop_pack_half_2x16 should be lowered");
 380    }
 381
 382    assert(dst.type == BRW_REGISTER_TYPE_UD);
 383    assert(src0.type == BRW_REGISTER_TYPE_F);
 384
 385    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 386     *
 387     *   Because this instruction does not have a 16-bit floating-point type,
 388     *   the destination data type must be Word (W).
 389     *
 390     *   The destination must be DWord-aligned and specify a horizontal stride
 391     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 392     *   each destination channel and the upper word is not modified.
 393     *
 394     * The above restriction implies that the f32to16 instruction must use
 395     * align1 mode, because only in align1 mode is it possible to specify
 396     * horizontal stride.  We choose here to defy the hardware docs and emit
 397     * align16 instructions.
 398     *
 399     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 400     * instructions. I was partially successful in that the code passed all
 401     * tests.  However, the code was dubiously correct and fragile, and the
 402     * tests were not harsh enough to probe that frailty. Not trusting the
 403     * code, I chose instead to remain in align16 mode in defiance of the hw
 404     * docs).
 405     *
 406     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 407     * simulator, emitting a f32to16 in align16 mode with UD as destination
 408     * data type is safe. The behavior differs from that specified in the PRM
 409     * in that the upper word of each destination channel is cleared to 0.
 410     */
 411
 412    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 413    src_reg tmp_src(tmp_dst);
 414
 415 #if 0
 416    /* Verify the undocumented behavior on which the following instructions
 417     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 418     * then the result of the bit-or instruction below will be incorrect.
 419     *
 420     * You should inspect the disasm output in order to verify that the MOV is
 421     * not optimized away.
 422     */
 423    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 424 #endif
 425
 426    /* Give tmp the form below, where "." means untouched.
 427     *
 428     *     w z          y          x w z          y          x
 429     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 430     *
 431     * That the upper word of each write-channel be 0 is required for the
 432     * following bit-shift and bit-or instructions to work. Note that this
 433     * relies on the undocumented hardware behavior mentioned above.
 434     */
 435    tmp_dst.writemask = WRITEMASK_XY;
 436    emit(F32TO16(tmp_dst, src0));
 437
 438    /* Give the write-channels of dst the form:
 439     *   0xhhhh0000
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 442    emit(SHL(dst, tmp_src, src_reg(16u)));
 443
 444    /* Finally, give the write-channels of dst the form of packHalf2x16's
 445     * output:
 446     *   0xhhhhllll
 447     */
 448    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 449    emit(OR(dst, src_reg(dst), tmp_src));
 450 }
 451
 452 void
 453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 454 {
 455    if (devinfo->gen < 7) {
 456       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 457    }
 458
 459    assert(dst.type == BRW_REGISTER_TYPE_F);
 460    assert(src0.type == BRW_REGISTER_TYPE_UD);
 461
 462    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 463     *
 464     *   Because this instruction does not have a 16-bit floating-point type,
 465     *   the source data type must be Word (W). The destination type must be
 466     *   F (Float).
 467     *
 468     * To use W as the source data type, we must adjust horizontal strides,
 469     * which is only possible in align1 mode. All my [chadv] attempts at
 470     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 471     * Piglit tests, so I gave up.
 472     *
 473     * I've verified that, on gen7 hardware and the simulator, it is safe to
 474     * emit f16to32 in align16 mode with UD as source data type.
 475     */
 476
 477    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 478    src_reg tmp_src(tmp_dst);
 479
 480    tmp_dst.writemask = WRITEMASK_X;
 481    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 482
 483    tmp_dst.writemask = WRITEMASK_Y;
 484    emit(SHR(tmp_dst, src0, src_reg(16u)));
 485
 486    dst.writemask = WRITEMASK_XY;
 487    emit(F16TO32(dst, tmp_src));
 488 }
 489
 490 void
 491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 492 {
 493    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 494     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 495     * is not suitable to generate the shift values, but we can use the packed
 496     * vector float and a type-converting MOV.
 497     */
 498    dst_reg shift(this, glsl_type::uvec4_type);
 499    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 500
 501    dst_reg shifted(this, glsl_type::uvec4_type);
 502    src0.swizzle = BRW_SWIZZLE_XXXX;
 503    emit(SHR(shifted, src0, src_reg(shift)));
 504
 505    shifted.type = BRW_REGISTER_TYPE_UB;
 506    dst_reg f(this, glsl_type::vec4_type);
 507    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 508
 509    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 510 }
 511
 512 void
 513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 514 {
 515    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 516     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 517     * is not suitable to generate the shift values, but we can use the packed
 518     * vector float and a type-converting MOV.
 519     */
 520    dst_reg shift(this, glsl_type::uvec4_type);
 521    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 522
 523    dst_reg shifted(this, glsl_type::uvec4_type);
 524    src0.swizzle = BRW_SWIZZLE_XXXX;
 525    emit(SHR(shifted, src0, src_reg(shift)));
 526
 527    shifted.type = BRW_REGISTER_TYPE_B;
 528    dst_reg f(this, glsl_type::vec4_type);
 529    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 530
 531    dst_reg scaled(this, glsl_type::vec4_type);
 532    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 533
 534    dst_reg max(this, glsl_type::vec4_type);
 535    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 536    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 537 }
 538
 539 void
 540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 541 {
 542    dst_reg saturated(this, glsl_type::vec4_type);
 543    vec4_instruction *inst = emit(MOV(saturated, src0));
 544    inst->saturate = true;
 545
 546    dst_reg scaled(this, glsl_type::vec4_type);
 547    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 548
 549    dst_reg rounded(this, glsl_type::vec4_type);
 550    emit(RNDE(rounded, src_reg(scaled)));
 551
 552    dst_reg u(this, glsl_type::uvec4_type);
 553    emit(MOV(u, src_reg(rounded)));
 554
 555    src_reg bytes(u);
 556    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 557 }
 558
 559 void
 560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 561 {
 562    dst_reg max(this, glsl_type::vec4_type);
 563    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 564
 565    dst_reg min(this, glsl_type::vec4_type);
 566    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 567
 568    dst_reg scaled(this, glsl_type::vec4_type);
 569    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 570
 571    dst_reg rounded(this, glsl_type::vec4_type);
 572    emit(RNDE(rounded, src_reg(scaled)));
 573
 574    dst_reg i(this, glsl_type::ivec4_type);
 575    emit(MOV(i, src_reg(rounded)));
 576
 577    src_reg bytes(i);
 578    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 579 }
 580
 581 void
 582 vec4_visitor::visit_instructions(const exec_list *list)
 583 {
 584    foreach_in_list(ir_instruction, ir, list) {
 585       base_ir = ir;
 586       ir->accept(this);
 587    }
 588 }
 589
 590 /**
 591  * Returns the minimum number of vec4 elements needed to pack a type.
 592  *
 593  * For simple types, it will return 1 (a single vec4); for matrices, the
 594  * number of columns; for array and struct, the sum of the vec4_size of
 595  * each of its elements; and for sampler and atomic, zero.
 596  *
 597  * This method is useful to calculate how much register space is needed to
 598  * store a particular type.
 599  */
 600 extern "C" int
 601 type_size_vec4(const struct glsl_type *type)
 602 {
 603    unsigned int i;
 604    int size;
 605
 606    switch (type->base_type) {
 607    case GLSL_TYPE_UINT:
 608    case GLSL_TYPE_INT:
 609    case GLSL_TYPE_FLOAT:
 610    case GLSL_TYPE_BOOL:
 611       if (type->is_matrix()) {
 612          return type->matrix_columns;
 613       } else {
 614          /* Regardless of size of vector, it gets a vec4. This is bad
 615           * packing for things like floats, but otherwise arrays become a
 616           * mess.  Hopefully a later pass over the code can pack scalars
 617           * down if appropriate.
 618           */
 619          return 1;
 620       }
 621    case GLSL_TYPE_ARRAY:
 622       assert(type->length > 0);
 623       return type_size_vec4(type->fields.array) * type->length;
 624    case GLSL_TYPE_STRUCT:
 625       size = 0;
 626       for (i = 0; i < type->length; i++) {
 627          size += type_size_vec4(type->fields.structure[i].type);
 628       }
 629       return size;
 630    case GLSL_TYPE_SUBROUTINE:
 631       return 1;
 632
 633    case GLSL_TYPE_SAMPLER:
 634       /* Samplers take up no register space, since they're baked in at
 635        * link time.
 636        */
 637       return 0;
 638    case GLSL_TYPE_ATOMIC_UINT:
 639       return 0;
 640    case GLSL_TYPE_IMAGE:
 641       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 642    case GLSL_TYPE_VOID:
 643    case GLSL_TYPE_DOUBLE:
 644    case GLSL_TYPE_ERROR:
 645    case GLSL_TYPE_INTERFACE:
 646    case GLSL_TYPE_FUNCTION:
 647       unreachable("not reached");
 648    }
 649
 650    return 0;
 651 }
 652
 653 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 654 {
 655    init();
 656
 657    this->file = GRF;
 658    this->reg = v->alloc.allocate(type_size_vec4(type));
 659
 660    if (type->is_array() || type->is_record()) {
 661       this->swizzle = BRW_SWIZZLE_NOOP;
 662    } else {
 663       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 664    }
 665
 666    this->type = brw_type_for_base_type(type);
 667 }
 668
 669 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 670 {
 671    assert(size > 0);
 672
 673    init();
 674
 675    this->file = GRF;
 676    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 677
 678    this->swizzle = BRW_SWIZZLE_NOOP;
 679
 680    this->type = brw_type_for_base_type(type);
 681 }
 682
 683 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 684 {
 685    init();
 686
 687    this->file = GRF;
 688    this->reg = v->alloc.allocate(type_size_vec4(type));
 689
 690    if (type->is_array() || type->is_record()) {
 691       this->writemask = WRITEMASK_XYZW;
 692    } else {
 693       this->writemask = (1 << type->vector_elements) - 1;
 694    }
 695
 696    this->type = brw_type_for_base_type(type);
 697 }
 698
 699 void
 700 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
 701                                        const gl_constant_value *values,
 702                                        unsigned n)
 703 {
 704    static const gl_constant_value zero = { 0 };
 705
 706    assert(param_offset % 4 == 0);
 707
 708    for (unsigned i = 0; i < n; ++i)
 709       stage_prog_data->param[param_offset + i] = &values[i];
 710
 711    for (unsigned i = n; i < 4; ++i)
 712       stage_prog_data->param[param_offset + i] = &zero;
 713
 714    uniform_vector_size[param_offset / 4] = n;
 715 }
 716
 717 /* Our support for uniforms is piggy-backed on the struct
 718  * gl_fragment_program, because that's where the values actually
 719  * get stored, rather than in some global gl_shader_program uniform
 720  * store.
 721  */
 722 void
 723 vec4_visitor::setup_uniform_values(ir_variable *ir)
 724 {
 725    int namelen = strlen(ir->name);
 726
 727    /* The data for our (non-builtin) uniforms is stored in a series of
 728     * gl_uniform_driver_storage structs for each subcomponent that
 729     * glGetUniformLocation() could name.  We know it's been set up in the same
 730     * order we'd walk the type, so walk the list of storage and find anything
 731     * with our name, or the prefix of a component that starts with our name.
 732     */
 733    for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
 734       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 735
 736       if (storage->builtin)
 737          continue;
 738
 739       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 740           (storage->name[namelen] != 0 &&
 741            storage->name[namelen] != '.' &&
 742            storage->name[namelen] != '[')) {
 743          continue;
 744       }
 745
 746       const unsigned vector_count = (MAX2(storage->array_elements, 1) *
 747                                      storage->type->matrix_columns);
 748       const unsigned vector_size = storage->type->vector_elements;
 749
 750       for (unsigned s = 0; s < vector_count; s++) {
 751          setup_vec4_uniform_value(uniforms * 4,
 752                                   &storage->storage[s * vector_size],
 753                                   vector_size);
 754          uniforms++;
 755       }
 756    }
 757 }
 758
 759 void
 760 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 761 {
 762    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 763       assert(this->uniforms < uniform_array_size);
 764       this->uniform_vector_size[this->uniforms] = 4;
 765       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 766       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 767       for (int j = 0; j < 4; ++j) {
 768          stage_prog_data->param[this->uniforms * 4 + j] =
 769             (gl_constant_value *) &clip_planes[i][j];
 770       }
 771       ++this->uniforms;
 772    }
 773 }
 774
 775 /* Our support for builtin uniforms is even scarier than non-builtin.
 776  * It sits on top of the PROG_STATE_VAR parameters that are
 777  * automatically updated from GL context state.
 778  */
 779 void
 780 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 781 {
 782    const ir_state_slot *const slots = ir->get_state_slots();
 783    assert(slots != NULL);
 784
 785    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
 786       /* This state reference has already been setup by ir_to_mesa,
 787        * but we'll get the same index back here.  We can reference
 788        * ParameterValues directly, since unlike brw_fs.cpp, we never
 789        * add new state references during compile.
 790        */
 791       int index = _mesa_add_state_reference(this->prog->Parameters,
 792                                             (gl_state_index *)slots[i].tokens);
 793       gl_constant_value *values =
 794          &this->prog->Parameters->ParameterValues[index][0];
 795
 796       assert(this->uniforms < uniform_array_size);
 797
 798       for (unsigned j = 0; j < 4; j++)
 799          stage_prog_data->param[this->uniforms * 4 + j] =
 800             &values[GET_SWZ(slots[i].swizzle, j)];
 801
 802       this->uniform_vector_size[this->uniforms] =
 803          (ir->type->is_scalar() || ir->type->is_vector() ||
 804           ir->type->is_matrix() ? ir->type->vector_elements : 4);
 805
 806       this->uniforms++;
 807    }
 808 }
 809
 810 dst_reg *
 811 vec4_visitor::variable_storage(ir_variable *var)
 812 {
 813    return (dst_reg *)hash_table_find(this->variable_ht, var);
 814 }
 815
 816 void
 817 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
 818                                      enum brw_predicate *predicate)
 819 {
 820    ir_expression *expr = ir->as_expression();
 821
 822    *predicate = BRW_PREDICATE_NORMAL;
 823
 824    if (expr && expr->operation != ir_binop_ubo_load) {
 825       src_reg op[3];
 826       vec4_instruction *inst;
 827
 828       assert(expr->get_num_operands() <= 3);
 829       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 830          expr->operands[i]->accept(this);
 831          op[i] = this->result;
 832
 833          resolve_ud_negate(&op[i]);
 834       }
 835
 836       switch (expr->operation) {
 837       case ir_unop_logic_not:
 838          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 839          inst->conditional_mod = BRW_CONDITIONAL_Z;
 840          break;
 841
 842       case ir_binop_logic_xor:
 843          if (devinfo->gen <= 5) {
 844             src_reg temp = src_reg(this, ir->type);
 845             emit(XOR(dst_reg(temp), op[0], op[1]));
 846             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 847          } else {
 848             inst = emit(XOR(dst_null_d(), op[0], op[1]));
 849          }
 850          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 851          break;
 852
 853       case ir_binop_logic_or:
 854          if (devinfo->gen <= 5) {
 855             src_reg temp = src_reg(this, ir->type);
 856             emit(OR(dst_reg(temp), op[0], op[1]));
 857             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 858          } else {
 859             inst = emit(OR(dst_null_d(), op[0], op[1]));
 860          }
 861          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 862          break;
 863
 864       case ir_binop_logic_and:
 865          if (devinfo->gen <= 5) {
 866             src_reg temp = src_reg(this, ir->type);
 867             emit(AND(dst_reg(temp), op[0], op[1]));
 868             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 869          } else {
 870             inst = emit(AND(dst_null_d(), op[0], op[1]));
 871          }
 872          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 873          break;
 874
 875       case ir_unop_f2b:
 876          if (devinfo->gen >= 6) {
 877             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 878          } else {
 879             inst = emit(MOV(dst_null_f(), op[0]));
 880             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 881          }
 882          break;
 883
 884       case ir_unop_i2b:
 885          if (devinfo->gen >= 6) {
 886             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 887          } else {
 888             inst = emit(MOV(dst_null_d(), op[0]));
 889             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 890          }
 891          break;
 892
 893       case ir_binop_all_equal:
 894          if (devinfo->gen <= 5) {
 895             resolve_bool_comparison(expr->operands[0], &op[0]);
 896             resolve_bool_comparison(expr->operands[1], &op[1]);
 897          }
 898          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 899          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 900          break;
 901
 902       case ir_binop_any_nequal:
 903          if (devinfo->gen <= 5) {
 904             resolve_bool_comparison(expr->operands[0], &op[0]);
 905             resolve_bool_comparison(expr->operands[1], &op[1]);
 906          }
 907          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 908          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 909          break;
 910
 911       case ir_unop_any:
 912          if (devinfo->gen <= 5) {
 913             resolve_bool_comparison(expr->operands[0], &op[0]);
 914          }
 915          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 916          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 917          break;
 918
 919       case ir_binop_greater:
 920       case ir_binop_gequal:
 921       case ir_binop_less:
 922       case ir_binop_lequal:
 923       case ir_binop_equal:
 924       case ir_binop_nequal:
 925          if (devinfo->gen <= 5) {
 926             resolve_bool_comparison(expr->operands[0], &op[0]);
 927             resolve_bool_comparison(expr->operands[1], &op[1]);
 928          }
 929          emit(CMP(dst_null_d(), op[0], op[1],
 930                   brw_conditional_for_comparison(expr->operation)));
 931          break;
 932
 933       case ir_triop_csel: {
 934          /* Expand the boolean condition into the flag register. */
 935          inst = emit(MOV(dst_null_d(), op[0]));
 936          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 937
 938          /* Select which boolean to return. */
 939          dst_reg temp(this, expr->operands[1]->type);
 940          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
 941          inst->predicate = BRW_PREDICATE_NORMAL;
 942
 943          /* Expand the result to a condition code. */
 944          inst = emit(MOV(dst_null_d(), src_reg(temp)));
 945          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 946          break;
 947       }
 948
 949       default:
 950          unreachable("not reached");
 951       }
 952       return;
 953    }
 954
 955    ir->accept(this);
 956
 957    resolve_ud_negate(&this->result);
 958
 959    vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
 960    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 961 }
 962
 963 /**
 964  * Emit a gen6 IF statement with the comparison folded into the IF
 965  * instruction.
 966  */
 967 void
 968 vec4_visitor::emit_if_gen6(ir_if *ir)
 969 {
 970    ir_expression *expr = ir->condition->as_expression();
 971
 972    if (expr && expr->operation != ir_binop_ubo_load) {
 973       src_reg op[3];
 974       dst_reg temp;
 975
 976       assert(expr->get_num_operands() <= 3);
 977       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 978          expr->operands[i]->accept(this);
 979          op[i] = this->result;
 980       }
 981
 982       switch (expr->operation) {
 983       case ir_unop_logic_not:
 984          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 985          return;
 986
 987       case ir_binop_logic_xor:
 988          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 989          return;
 990
 991       case ir_binop_logic_or:
 992          temp = dst_reg(this, glsl_type::bool_type);
 993          emit(OR(temp, op[0], op[1]));
 994          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 995          return;
 996
 997       case ir_binop_logic_and:
 998          temp = dst_reg(this, glsl_type::bool_type);
 999          emit(AND(temp, op[0], op[1]));
1000          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1001          return;
1002
1003       case ir_unop_f2b:
1004          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1005          return;
1006
1007       case ir_unop_i2b:
1008          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1009          return;
1010
1011       case ir_binop_greater:
1012       case ir_binop_gequal:
1013       case ir_binop_less:
1014       case ir_binop_lequal:
1015       case ir_binop_equal:
1016       case ir_binop_nequal:
1017          emit(IF(op[0], op[1],
1018                  brw_conditional_for_comparison(expr->operation)));
1019          return;
1020
1021       case ir_binop_all_equal:
1022          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1023          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1024          return;
1025
1026       case ir_binop_any_nequal:
1027          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1028          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1029          return;
1030
1031       case ir_unop_any:
1032          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1033          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1034          return;
1035
1036       case ir_triop_csel: {
1037          /* Expand the boolean condition into the flag register. */
1038          vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1039          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1040
1041          /* Select which boolean to return. */
1042          dst_reg temp(this, expr->operands[1]->type);
1043          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1044          inst->predicate = BRW_PREDICATE_NORMAL;
1045
1046          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1047          return;
1048       }
1049
1050       default:
1051          unreachable("not reached");
1052       }
1053       return;
1054    }
1055
1056    ir->condition->accept(this);
1057
1058    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1059 }
1060
1061 void
1062 vec4_visitor::visit(ir_variable *ir)
1063 {
1064    dst_reg *reg = NULL;
1065
1066    if (variable_storage(ir))
1067       return;
1068
1069    switch (ir->data.mode) {
1070    case ir_var_shader_in:
1071       assert(ir->data.location != -1);
1072       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1073       break;
1074
1075    case ir_var_shader_out:
1076       assert(ir->data.location != -1);
1077       reg = new(mem_ctx) dst_reg(this, ir->type);
1078
1079       for (int i = 0; i < type_size_vec4(ir->type); i++) {
1080          output_reg[ir->data.location + i] = *reg;
1081          output_reg[ir->data.location + i].reg_offset = i;
1082          output_reg_annotation[ir->data.location + i] = ir->name;
1083       }
1084       break;
1085
1086    case ir_var_auto:
1087    case ir_var_temporary:
1088       reg = new(mem_ctx) dst_reg(this, ir->type);
1089       break;
1090
1091    case ir_var_uniform:
1092       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1093
1094       /* Thanks to the lower_ubo_reference pass, we will see only
1095        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1096        * variables, so no need for them to be in variable_ht.
1097        *
1098        * Some uniforms, such as samplers and atomic counters, have no actual
1099        * storage, so we should ignore them.
1100        */
1101       if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1102          return;
1103
1104       /* Track how big the whole uniform variable is, in case we need to put a
1105        * copy of its data into pull constants for array access.
1106        */
1107       assert(this->uniforms < uniform_array_size);
1108       this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1109
1110       if (!strncmp(ir->name, "gl_", 3)) {
1111          setup_builtin_uniform_values(ir);
1112       } else {
1113          setup_uniform_values(ir);
1114       }
1115       break;
1116
1117    case ir_var_system_value:
1118       reg = make_reg_for_system_value(ir->data.location, ir->type);
1119       break;
1120
1121    default:
1122       unreachable("not reached");
1123    }
1124
1125    reg->type = brw_type_for_base_type(ir->type);
1126    hash_table_insert(this->variable_ht, reg, ir);
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_loop *ir)
1131 {
1132    /* We don't want debugging output to print the whole body of the
1133     * loop as the annotation.
1134     */
1135    this->base_ir = NULL;
1136
1137    emit(BRW_OPCODE_DO);
1138
1139    visit_instructions(&ir->body_instructions);
1140
1141    emit(BRW_OPCODE_WHILE);
1142 }
1143
1144 void
1145 vec4_visitor::visit(ir_loop_jump *ir)
1146 {
1147    switch (ir->mode) {
1148    case ir_loop_jump::jump_break:
1149       emit(BRW_OPCODE_BREAK);
1150       break;
1151    case ir_loop_jump::jump_continue:
1152       emit(BRW_OPCODE_CONTINUE);
1153       break;
1154    }
1155 }
1156
1157
1158 void
1159 vec4_visitor::visit(ir_function_signature *)
1160 {
1161    unreachable("not reached");
1162 }
1163
1164 void
1165 vec4_visitor::visit(ir_function *ir)
1166 {
1167    /* Ignore function bodies other than main() -- we shouldn't see calls to
1168     * them since they should all be inlined.
1169     */
1170    if (strcmp(ir->name, "main") == 0) {
1171       const ir_function_signature *sig;
1172       exec_list empty;
1173
1174       sig = ir->matching_signature(NULL, &empty, false);
1175
1176       assert(sig);
1177
1178       visit_instructions(&sig->body);
1179    }
1180 }
1181
1182 bool
1183 vec4_visitor::try_emit_mad(ir_expression *ir)
1184 {
1185    /* 3-src instructions were introduced in gen6. */
1186    if (devinfo->gen < 6)
1187       return false;
1188
1189    /* MAD can only handle floating-point data. */
1190    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1191       return false;
1192
1193    ir_rvalue *nonmul;
1194    ir_expression *mul;
1195    bool mul_negate, mul_abs;
1196
1197    for (int i = 0; i < 2; i++) {
1198       mul_negate = false;
1199       mul_abs = false;
1200
1201       mul = ir->operands[i]->as_expression();
1202       nonmul = ir->operands[1 - i];
1203
1204       if (mul && mul->operation == ir_unop_abs) {
1205          mul = mul->operands[0]->as_expression();
1206          mul_abs = true;
1207       } else if (mul && mul->operation == ir_unop_neg) {
1208          mul = mul->operands[0]->as_expression();
1209          mul_negate = true;
1210       }
1211
1212       if (mul && mul->operation == ir_binop_mul)
1213          break;
1214    }
1215
1216    if (!mul || mul->operation != ir_binop_mul)
1217       return false;
1218
1219    nonmul->accept(this);
1220    src_reg src0 = fix_3src_operand(this->result);
1221
1222    mul->operands[0]->accept(this);
1223    src_reg src1 = fix_3src_operand(this->result);
1224    src1.negate ^= mul_negate;
1225    src1.abs = mul_abs;
1226    if (mul_abs)
1227       src1.negate = false;
1228
1229    mul->operands[1]->accept(this);
1230    src_reg src2 = fix_3src_operand(this->result);
1231    src2.abs = mul_abs;
1232    if (mul_abs)
1233       src2.negate = false;
1234
1235    this->result = src_reg(this, ir->type);
1236    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1237
1238    return true;
1239 }
1240
1241 bool
1242 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1243 {
1244    /* This optimization relies on CMP setting the destination to 0 when
1245     * false.  Early hardware only sets the least significant bit, and
1246     * leaves the other bits undefined.  So we can't use it.
1247     */
1248    if (devinfo->gen < 6)
1249       return false;
1250
1251    ir_expression *const cmp = ir->operands[0]->as_expression();
1252
1253    if (cmp == NULL)
1254       return false;
1255
1256    switch (cmp->operation) {
1257    case ir_binop_less:
1258    case ir_binop_greater:
1259    case ir_binop_lequal:
1260    case ir_binop_gequal:
1261    case ir_binop_equal:
1262    case ir_binop_nequal:
1263       break;
1264
1265    default:
1266       return false;
1267    }
1268
1269    cmp->operands[0]->accept(this);
1270    const src_reg cmp_src0 = this->result;
1271
1272    cmp->operands[1]->accept(this);
1273    const src_reg cmp_src1 = this->result;
1274
1275    this->result = src_reg(this, ir->type);
1276
1277    emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1278             brw_conditional_for_comparison(cmp->operation)));
1279
1280    /* If the comparison is false, this->result will just happen to be zero.
1281     */
1282    vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1283                                        this->result, src_reg(1.0f));
1284    inst->predicate = BRW_PREDICATE_NORMAL;
1285    inst->predicate_inverse = true;
1286
1287    return true;
1288 }
1289
1290 vec4_instruction *
1291 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1292                           src_reg src0, src_reg src1)
1293 {
1294    vec4_instruction *inst;
1295
1296    if (devinfo->gen >= 6) {
1297       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1298       inst->conditional_mod = conditionalmod;
1299    } else {
1300       emit(CMP(dst, src0, src1, conditionalmod));
1301
1302       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1303       inst->predicate = BRW_PREDICATE_NORMAL;
1304    }
1305
1306    return inst;
1307 }
1308
1309 vec4_instruction *
1310 vec4_visitor::emit_lrp(const dst_reg &dst,
1311                        const src_reg &x, const src_reg &y, const src_reg &a)
1312 {
1313    if (devinfo->gen >= 6) {
1314       /* Note that the instruction's argument order is reversed from GLSL
1315        * and the IR.
1316        */
1317      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1318                      fix_3src_operand(x)));
1319    } else {
1320       /* Earlier generations don't support three source operations, so we
1321        * need to emit x*(1-a) + y*a.
1322        */
1323       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
1324       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
1325       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1326       y_times_a.writemask           = dst.writemask;
1327       one_minus_a.writemask         = dst.writemask;
1328       x_times_one_minus_a.writemask = dst.writemask;
1329
1330       emit(MUL(y_times_a, y, a));
1331       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1332       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1333       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1334    }
1335 }
1336
1337 /**
1338  * Emits the instructions needed to perform a pull constant load. before_block
1339  * and before_inst can be NULL in which case the instruction will be appended
1340  * to the end of the instruction list.
1341  */
1342 void
1343 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1344                                           src_reg surf_index,
1345                                           src_reg offset_reg,
1346                                           bblock_t *before_block,
1347                                           vec4_instruction *before_inst)
1348 {
1349    assert((before_inst == NULL && before_block == NULL) ||
1350           (before_inst && before_block));
1351
1352    vec4_instruction *pull;
1353
1354    if (devinfo->gen >= 9) {
1355       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1356       src_reg header(this, glsl_type::uvec4_type, 2);
1357
1358       pull = new(mem_ctx)
1359          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1360                           dst_reg(header));
1361
1362       if (before_inst)
1363          emit_before(before_block, before_inst, pull);
1364       else
1365          emit(pull);
1366
1367       dst_reg index_reg = retype(offset(dst_reg(header), 1),
1368                                  offset_reg.type);
1369       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1370
1371       if (before_inst)
1372          emit_before(before_block, before_inst, pull);
1373       else
1374          emit(pull);
1375
1376       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1377                                            dst,
1378                                            surf_index,
1379                                            header);
1380       pull->mlen = 2;
1381       pull->header_size = 1;
1382    } else if (devinfo->gen >= 7) {
1383       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1384
1385       grf_offset.type = offset_reg.type;
1386
1387       pull = MOV(grf_offset, offset_reg);
1388
1389       if (before_inst)
1390          emit_before(before_block, before_inst, pull);
1391       else
1392          emit(pull);
1393
1394       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1395                                            dst,
1396                                            surf_index,
1397                                            src_reg(grf_offset));
1398       pull->mlen = 1;
1399    } else {
1400       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1401                                            dst,
1402                                            surf_index,
1403                                            offset_reg);
1404       pull->base_mrf = 14;
1405       pull->mlen = 1;
1406    }
1407
1408    if (before_inst)
1409       emit_before(before_block, before_inst, pull);
1410    else
1411       emit(pull);
1412 }
1413
1414 src_reg
1415 vec4_visitor::emit_uniformize(const src_reg &src)
1416 {
1417    const src_reg chan_index(this, glsl_type::uint_type);
1418    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1419                               src.type);
1420
1421    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1422       ->force_writemask_all = true;
1423    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1424       ->force_writemask_all = true;
1425
1426    return src_reg(dst);
1427 }
1428
1429 void
1430 vec4_visitor::visit(ir_expression *ir)
1431 {
1432    unsigned int operand;
1433    src_reg op[ARRAY_SIZE(ir->operands)];
1434    vec4_instruction *inst;
1435
1436    if (ir->operation == ir_binop_add) {
1437       if (try_emit_mad(ir))
1438          return;
1439    }
1440
1441    if (ir->operation == ir_unop_b2f) {
1442       if (try_emit_b2f_of_compare(ir))
1443          return;
1444    }
1445
1446    /* Storage for our result.  Ideally for an assignment we'd be using
1447     * the actual storage for the result here, instead.
1448     */
1449    dst_reg result_dst(this, ir->type);
1450    src_reg result_src(result_dst);
1451
1452    if (ir->operation == ir_triop_csel) {
1453       ir->operands[1]->accept(this);
1454       op[1] = this->result;
1455       ir->operands[2]->accept(this);
1456       op[2] = this->result;
1457
1458       enum brw_predicate predicate;
1459       emit_bool_to_cond_code(ir->operands[0], &predicate);
1460       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1461       inst->predicate = predicate;
1462       this->result = result_src;
1463       return;
1464    }
1465
1466    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1467       this->result.file = BAD_FILE;
1468       ir->operands[operand]->accept(this);
1469       if (this->result.file == BAD_FILE) {
1470          fprintf(stderr, "Failed to get tree for expression operand:\n");
1471          ir->operands[operand]->fprint(stderr);
1472          exit(1);
1473       }
1474       op[operand] = this->result;
1475
1476       /* Matrix expression operands should have been broken down to vector
1477        * operations already.
1478        */
1479       assert(!ir->operands[operand]->type->is_matrix());
1480    }
1481
1482    /* If nothing special happens, this is the result. */
1483    this->result = result_src;
1484
1485    switch (ir->operation) {
1486    case ir_unop_logic_not:
1487       emit(NOT(result_dst, op[0]));
1488       break;
1489    case ir_unop_neg:
1490       op[0].negate = !op[0].negate;
1491       emit(MOV(result_dst, op[0]));
1492       break;
1493    case ir_unop_abs:
1494       op[0].abs = true;
1495       op[0].negate = false;
1496       emit(MOV(result_dst, op[0]));
1497       break;
1498
1499    case ir_unop_sign:
1500       if (ir->type->is_float()) {
1501          /* AND(val, 0x80000000) gives the sign bit.
1502           *
1503           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1504           * zero.
1505           */
1506          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1507
1508          op[0].type = BRW_REGISTER_TYPE_UD;
1509          result_dst.type = BRW_REGISTER_TYPE_UD;
1510          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1511
1512          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1513          inst->predicate = BRW_PREDICATE_NORMAL;
1514
1515          this->result.type = BRW_REGISTER_TYPE_F;
1516       } else {
1517          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1518           *               -> non-negative val generates 0x00000000.
1519           *  Predicated OR sets 1 if val is positive.
1520           */
1521          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1522
1523          emit(ASR(result_dst, op[0], src_reg(31)));
1524
1525          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1526          inst->predicate = BRW_PREDICATE_NORMAL;
1527       }
1528       break;
1529
1530    case ir_unop_rcp:
1531       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1532       break;
1533
1534    case ir_unop_exp2:
1535       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1536       break;
1537    case ir_unop_log2:
1538       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1539       break;
1540    case ir_unop_exp:
1541    case ir_unop_log:
1542       unreachable("not reached: should be handled by ir_explog_to_explog2");
1543    case ir_unop_sin:
1544       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1545       break;
1546    case ir_unop_cos:
1547       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1548       break;
1549
1550    case ir_unop_dFdx:
1551    case ir_unop_dFdx_coarse:
1552    case ir_unop_dFdx_fine:
1553    case ir_unop_dFdy:
1554    case ir_unop_dFdy_coarse:
1555    case ir_unop_dFdy_fine:
1556       unreachable("derivatives not valid in vertex shader");
1557
1558    case ir_unop_bitfield_reverse:
1559       emit(BFREV(result_dst, op[0]));
1560       break;
1561    case ir_unop_bit_count:
1562       emit(CBIT(result_dst, op[0]));
1563       break;
1564    case ir_unop_find_msb: {
1565       src_reg temp = src_reg(this, glsl_type::uint_type);
1566
1567       inst = emit(FBH(dst_reg(temp), op[0]));
1568       inst->dst.writemask = WRITEMASK_XYZW;
1569
1570       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1571        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1572        * subtract the result from 31 to convert the MSB count into an LSB count.
1573        */
1574
1575       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1576       temp.swizzle = BRW_SWIZZLE_NOOP;
1577       emit(MOV(result_dst, temp));
1578
1579       src_reg src_tmp = src_reg(result_dst);
1580       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1581
1582       src_tmp.negate = true;
1583       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1584       inst->predicate = BRW_PREDICATE_NORMAL;
1585       break;
1586    }
1587    case ir_unop_find_lsb:
1588       emit(FBL(result_dst, op[0]));
1589       break;
1590    case ir_unop_saturate:
1591       inst = emit(MOV(result_dst, op[0]));
1592       inst->saturate = true;
1593       break;
1594
1595    case ir_unop_noise:
1596       unreachable("not reached: should be handled by lower_noise");
1597
1598    case ir_unop_subroutine_to_int:
1599       emit(MOV(result_dst, op[0]));
1600       break;
1601
1602    case ir_binop_add:
1603       emit(ADD(result_dst, op[0], op[1]));
1604       break;
1605    case ir_binop_sub:
1606       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1607
1608    case ir_binop_mul:
1609       if (devinfo->gen < 8 && ir->type->is_integer()) {
1610          /* For integer multiplication, the MUL uses the low 16 bits of one of
1611           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1612           * accumulates in the contribution of the upper 16 bits of that
1613           * operand.  If we can determine that one of the args is in the low
1614           * 16 bits, though, we can just emit a single MUL.
1615           */
1616          if (ir->operands[0]->is_uint16_constant()) {
1617             if (devinfo->gen < 7)
1618                emit(MUL(result_dst, op[0], op[1]));
1619             else
1620                emit(MUL(result_dst, op[1], op[0]));
1621          } else if (ir->operands[1]->is_uint16_constant()) {
1622             if (devinfo->gen < 7)
1623                emit(MUL(result_dst, op[1], op[0]));
1624             else
1625                emit(MUL(result_dst, op[0], op[1]));
1626          } else {
1627             struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1628
1629             emit(MUL(acc, op[0], op[1]));
1630             emit(MACH(dst_null_d(), op[0], op[1]));
1631             emit(MOV(result_dst, src_reg(acc)));
1632          }
1633       } else {
1634          emit(MUL(result_dst, op[0], op[1]));
1635       }
1636       break;
1637    case ir_binop_imul_high: {
1638       struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1639
1640       emit(MUL(acc, op[0], op[1]));
1641       emit(MACH(result_dst, op[0], op[1]));
1642       break;
1643    }
1644    case ir_binop_div:
1645       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1646       assert(ir->type->is_integer());
1647       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1648       break;
1649
1650    case ir_binop_carry:
1651       unreachable("Should have been lowered by carry_to_arith().");
1652
1653    case ir_binop_borrow:
1654       unreachable("Should have been lowered by borrow_to_arith().");
1655
1656    case ir_binop_mod:
1657       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1658       assert(ir->type->is_integer());
1659       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1660       break;
1661
1662    case ir_binop_less:
1663    case ir_binop_greater:
1664    case ir_binop_lequal:
1665    case ir_binop_gequal:
1666    case ir_binop_equal:
1667    case ir_binop_nequal: {
1668       if (devinfo->gen <= 5) {
1669          resolve_bool_comparison(ir->operands[0], &op[0]);
1670          resolve_bool_comparison(ir->operands[1], &op[1]);
1671       }
1672       emit(CMP(result_dst, op[0], op[1],
1673                brw_conditional_for_comparison(ir->operation)));
1674       break;
1675    }
1676
1677    case ir_binop_all_equal:
1678       if (devinfo->gen <= 5) {
1679          resolve_bool_comparison(ir->operands[0], &op[0]);
1680          resolve_bool_comparison(ir->operands[1], &op[1]);
1681       }
1682
1683       /* "==" operator producing a scalar boolean. */
1684       if (ir->operands[0]->type->is_vector() ||
1685           ir->operands[1]->type->is_vector()) {
1686          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1687          emit(MOV(result_dst, src_reg(0)));
1688          inst = emit(MOV(result_dst, src_reg(~0)));
1689          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1690       } else {
1691          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1692       }
1693       break;
1694    case ir_binop_any_nequal:
1695       if (devinfo->gen <= 5) {
1696          resolve_bool_comparison(ir->operands[0], &op[0]);
1697          resolve_bool_comparison(ir->operands[1], &op[1]);
1698       }
1699
1700       /* "!=" operator producing a scalar boolean. */
1701       if (ir->operands[0]->type->is_vector() ||
1702           ir->operands[1]->type->is_vector()) {
1703          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1704
1705          emit(MOV(result_dst, src_reg(0)));
1706          inst = emit(MOV(result_dst, src_reg(~0)));
1707          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1708       } else {
1709          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1710       }
1711       break;
1712
1713    case ir_unop_any:
1714       if (devinfo->gen <= 5) {
1715          resolve_bool_comparison(ir->operands[0], &op[0]);
1716       }
1717       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1718       emit(MOV(result_dst, src_reg(0)));
1719
1720       inst = emit(MOV(result_dst, src_reg(~0)));
1721       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1722       break;
1723
1724    case ir_binop_logic_xor:
1725       emit(XOR(result_dst, op[0], op[1]));
1726       break;
1727
1728    case ir_binop_logic_or:
1729       emit(OR(result_dst, op[0], op[1]));
1730       break;
1731
1732    case ir_binop_logic_and:
1733       emit(AND(result_dst, op[0], op[1]));
1734       break;
1735
1736    case ir_binop_dot:
1737       assert(ir->operands[0]->type->is_vector());
1738       assert(ir->operands[0]->type == ir->operands[1]->type);
1739       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1740       break;
1741
1742    case ir_unop_sqrt:
1743       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1744       break;
1745    case ir_unop_rsq:
1746       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1747       break;
1748
1749    case ir_unop_bitcast_i2f:
1750    case ir_unop_bitcast_u2f:
1751       this->result = op[0];
1752       this->result.type = BRW_REGISTER_TYPE_F;
1753       break;
1754
1755    case ir_unop_bitcast_f2i:
1756       this->result = op[0];
1757       this->result.type = BRW_REGISTER_TYPE_D;
1758       break;
1759
1760    case ir_unop_bitcast_f2u:
1761       this->result = op[0];
1762       this->result.type = BRW_REGISTER_TYPE_UD;
1763       break;
1764
1765    case ir_unop_i2f:
1766    case ir_unop_i2u:
1767    case ir_unop_u2i:
1768    case ir_unop_u2f:
1769    case ir_unop_f2i:
1770    case ir_unop_f2u:
1771       emit(MOV(result_dst, op[0]));
1772       break;
1773    case ir_unop_b2i:
1774    case ir_unop_b2f:
1775       if (devinfo->gen <= 5) {
1776          resolve_bool_comparison(ir->operands[0], &op[0]);
1777       }
1778       emit(MOV(result_dst, negate(op[0])));
1779       break;
1780    case ir_unop_f2b:
1781       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1782       break;
1783    case ir_unop_i2b:
1784       emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1785       break;
1786
1787    case ir_unop_trunc:
1788       emit(RNDZ(result_dst, op[0]));
1789       break;
1790    case ir_unop_ceil: {
1791          src_reg tmp = src_reg(this, ir->type);
1792          op[0].negate = !op[0].negate;
1793          emit(RNDD(dst_reg(tmp), op[0]));
1794          tmp.negate = true;
1795          emit(MOV(result_dst, tmp));
1796       }
1797       break;
1798    case ir_unop_floor:
1799       inst = emit(RNDD(result_dst, op[0]));
1800       break;
1801    case ir_unop_fract:
1802       inst = emit(FRC(result_dst, op[0]));
1803       break;
1804    case ir_unop_round_even:
1805       emit(RNDE(result_dst, op[0]));
1806       break;
1807
1808    case ir_binop_min:
1809       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1810       break;
1811    case ir_binop_max:
1812       emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1813       break;
1814
1815    case ir_binop_pow:
1816       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1817       break;
1818
1819    case ir_unop_bit_not:
1820       inst = emit(NOT(result_dst, op[0]));
1821       break;
1822    case ir_binop_bit_and:
1823       inst = emit(AND(result_dst, op[0], op[1]));
1824       break;
1825    case ir_binop_bit_xor:
1826       inst = emit(XOR(result_dst, op[0], op[1]));
1827       break;
1828    case ir_binop_bit_or:
1829       inst = emit(OR(result_dst, op[0], op[1]));
1830       break;
1831
1832    case ir_binop_lshift:
1833       inst = emit(SHL(result_dst, op[0], op[1]));
1834       break;
1835
1836    case ir_binop_rshift:
1837       if (ir->type->base_type == GLSL_TYPE_INT)
1838          inst = emit(ASR(result_dst, op[0], op[1]));
1839       else
1840          inst = emit(SHR(result_dst, op[0], op[1]));
1841       break;
1842
1843    case ir_binop_bfm:
1844       emit(BFI1(result_dst, op[0], op[1]));
1845       break;
1846
1847    case ir_binop_ubo_load: {
1848       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1849       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1850       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1851       src_reg offset;
1852
1853       /* Now, load the vector from that offset. */
1854       assert(ir->type->is_vector() || ir->type->is_scalar());
1855
1856       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1857       packed_consts.type = result.type;
1858       src_reg surf_index;
1859
1860       if (const_uniform_block) {
1861          /* The block index is a constant, so just emit the binding table entry
1862           * as an immediate.
1863           */
1864          surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1865                               const_uniform_block->value.u[0]);
1866       } else {
1867          /* The block index is not a constant. Evaluate the index expression
1868           * per-channel and add the base UBO index; we have to select a value
1869           * from any live channel.
1870           */
1871          surf_index = src_reg(this, glsl_type::uint_type);
1872          emit(ADD(dst_reg(surf_index), op[0],
1873                   src_reg(prog_data->base.binding_table.ubo_start)));
1874          surf_index = emit_uniformize(surf_index);
1875
1876          /* Assume this may touch any UBO. It would be nice to provide
1877           * a tighter bound, but the array information is already lowered away.
1878           */
1879          brw_mark_surface_used(&prog_data->base,
1880                                prog_data->base.binding_table.ubo_start +
1881                                shader_prog->NumUniformBlocks - 1);
1882       }
1883
1884       if (const_offset_ir) {
1885          if (devinfo->gen >= 8) {
1886             /* Store the offset in a GRF so we can send-from-GRF. */
1887             offset = src_reg(this, glsl_type::int_type);
1888             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1889          } else {
1890             /* Immediates are fine on older generations since they'll be moved
1891              * to a (potentially fake) MRF at the generator level.
1892              */
1893             offset = src_reg(const_offset / 16);
1894          }
1895       } else {
1896          offset = src_reg(this, glsl_type::uint_type);
1897          emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1898       }
1899
1900       emit_pull_constant_load_reg(dst_reg(packed_consts),
1901                                   surf_index,
1902                                   offset,
1903                                   NULL, NULL /* before_block/inst */);
1904
1905       packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1906       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1907                                             const_offset % 16 / 4,
1908                                             const_offset % 16 / 4,
1909                                             const_offset % 16 / 4);
1910
1911       /* UBO bools are any nonzero int.  We need to convert them to 0/~0. */
1912       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1913          emit(CMP(result_dst, packed_consts, src_reg(0u),
1914                   BRW_CONDITIONAL_NZ));
1915       } else {
1916          emit(MOV(result_dst, packed_consts));
1917       }
1918       break;
1919    }
1920
1921    case ir_binop_vector_extract:
1922       unreachable("should have been lowered by vec_index_to_cond_assign");
1923
1924    case ir_triop_fma:
1925       op[0] = fix_3src_operand(op[0]);
1926       op[1] = fix_3src_operand(op[1]);
1927       op[2] = fix_3src_operand(op[2]);
1928       /* Note that the instruction's argument order is reversed from GLSL
1929        * and the IR.
1930        */
1931       emit(MAD(result_dst, op[2], op[1], op[0]));
1932       break;
1933
1934    case ir_triop_lrp:
1935       emit_lrp(result_dst, op[0], op[1], op[2]);
1936       break;
1937
1938    case ir_triop_csel:
1939       unreachable("already handled above");
1940       break;
1941
1942    case ir_triop_bfi:
1943       op[0] = fix_3src_operand(op[0]);
1944       op[1] = fix_3src_operand(op[1]);
1945       op[2] = fix_3src_operand(op[2]);
1946       emit(BFI2(result_dst, op[0], op[1], op[2]));
1947       break;
1948
1949    case ir_triop_bitfield_extract:
1950       op[0] = fix_3src_operand(op[0]);
1951       op[1] = fix_3src_operand(op[1]);
1952       op[2] = fix_3src_operand(op[2]);
1953       /* Note that the instruction's argument order is reversed from GLSL
1954        * and the IR.
1955        */
1956       emit(BFE(result_dst, op[2], op[1], op[0]));
1957       break;
1958
1959    case ir_triop_vector_insert:
1960       unreachable("should have been lowered by lower_vector_insert");
1961
1962    case ir_quadop_bitfield_insert:
1963       unreachable("not reached: should be handled by "
1964               "bitfield_insert_to_bfm_bfi\n");
1965
1966    case ir_quadop_vector:
1967       unreachable("not reached: should be handled by lower_quadop_vector");
1968
1969    case ir_unop_pack_half_2x16:
1970       emit_pack_half_2x16(result_dst, op[0]);
1971       break;
1972    case ir_unop_unpack_half_2x16:
1973       emit_unpack_half_2x16(result_dst, op[0]);
1974       break;
1975    case ir_unop_unpack_unorm_4x8:
1976       emit_unpack_unorm_4x8(result_dst, op[0]);
1977       break;
1978    case ir_unop_unpack_snorm_4x8:
1979       emit_unpack_snorm_4x8(result_dst, op[0]);
1980       break;
1981    case ir_unop_pack_unorm_4x8:
1982       emit_pack_unorm_4x8(result_dst, op[0]);
1983       break;
1984    case ir_unop_pack_snorm_4x8:
1985       emit_pack_snorm_4x8(result_dst, op[0]);
1986       break;
1987    case ir_unop_pack_snorm_2x16:
1988    case ir_unop_pack_unorm_2x16:
1989    case ir_unop_unpack_snorm_2x16:
1990    case ir_unop_unpack_unorm_2x16:
1991       unreachable("not reached: should be handled by lower_packing_builtins");
1992    case ir_unop_unpack_half_2x16_split_x:
1993    case ir_unop_unpack_half_2x16_split_y:
1994    case ir_binop_pack_half_2x16_split:
1995    case ir_unop_interpolate_at_centroid:
1996    case ir_binop_interpolate_at_sample:
1997    case ir_binop_interpolate_at_offset:
1998       unreachable("not reached: should not occur in vertex shader");
1999    case ir_binop_ldexp:
2000       unreachable("not reached: should be handled by ldexp_to_arith()");
2001    case ir_unop_d2f:
2002    case ir_unop_f2d:
2003    case ir_unop_d2i:
2004    case ir_unop_i2d:
2005    case ir_unop_d2u:
2006    case ir_unop_u2d:
2007    case ir_unop_d2b:
2008    case ir_unop_pack_double_2x32:
2009    case ir_unop_unpack_double_2x32:
2010    case ir_unop_frexp_sig:
2011    case ir_unop_frexp_exp:
2012       unreachable("fp64 todo");
2013    }
2014 }
2015
2016
2017 void
2018 vec4_visitor::visit(ir_swizzle *ir)
2019 {
2020    /* Note that this is only swizzles in expressions, not those on the left
2021     * hand side of an assignment, which do write masking.  See ir_assignment
2022     * for that.
2023     */
2024    const unsigned swz = brw_compose_swizzle(
2025       brw_swizzle_for_size(ir->type->vector_elements),
2026       BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2027
2028    ir->val->accept(this);
2029    this->result = swizzle(this->result, swz);
2030 }
2031
2032 void
2033 vec4_visitor::visit(ir_dereference_variable *ir)
2034 {
2035    const struct glsl_type *type = ir->type;
2036    dst_reg *reg = variable_storage(ir->var);
2037
2038    if (!reg) {
2039       fail("Failed to find variable storage for %s\n", ir->var->name);
2040       this->result = src_reg(brw_null_reg());
2041       return;
2042    }
2043
2044    this->result = src_reg(*reg);
2045
2046    /* System values get their swizzle from the dst_reg writemask */
2047    if (ir->var->data.mode == ir_var_system_value)
2048       return;
2049
2050    if (type->is_scalar() || type->is_vector() || type->is_matrix())
2051       this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2052 }
2053
2054
2055 int
2056 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2057 {
2058    /* Under normal circumstances array elements are stored consecutively, so
2059     * the stride is equal to the size of the array element.
2060     */
2061    return type_size_vec4(ir->type);
2062 }
2063
2064
2065 void
2066 vec4_visitor::visit(ir_dereference_array *ir)
2067 {
2068    ir_constant *constant_index;
2069    src_reg src;
2070    int array_stride = compute_array_stride(ir);
2071
2072    constant_index = ir->array_index->constant_expression_value();
2073
2074    ir->array->accept(this);
2075    src = this->result;
2076
2077    if (constant_index) {
2078       src.reg_offset += constant_index->value.i[0] * array_stride;
2079    } else {
2080       /* Variable index array dereference.  It eats the "vec4" of the
2081        * base of the array and an index that offsets the Mesa register
2082        * index.
2083        */
2084       ir->array_index->accept(this);
2085
2086       src_reg index_reg;
2087
2088       if (array_stride == 1) {
2089          index_reg = this->result;
2090       } else {
2091          index_reg = src_reg(this, glsl_type::int_type);
2092
2093          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2094       }
2095
2096       if (src.reladdr) {
2097          src_reg temp = src_reg(this, glsl_type::int_type);
2098
2099          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2100
2101          index_reg = temp;
2102       }
2103
2104       src.reladdr = ralloc(mem_ctx, src_reg);
2105       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2106    }
2107
2108    /* If the type is smaller than a vec4, replicate the last channel out. */
2109    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2110       src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2111    else
2112       src.swizzle = BRW_SWIZZLE_NOOP;
2113    src.type = brw_type_for_base_type(ir->type);
2114
2115    this->result = src;
2116 }
2117
2118 void
2119 vec4_visitor::visit(ir_dereference_record *ir)
2120 {
2121    unsigned int i;
2122    const glsl_type *struct_type = ir->record->type;
2123    int offset = 0;
2124
2125    ir->record->accept(this);
2126
2127    for (i = 0; i < struct_type->length; i++) {
2128       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2129          break;
2130       offset += type_size_vec4(struct_type->fields.structure[i].type);
2131    }
2132
2133    /* If the type is smaller than a vec4, replicate the last channel out. */
2134    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2135       this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2136    else
2137       this->result.swizzle = BRW_SWIZZLE_NOOP;
2138    this->result.type = brw_type_for_base_type(ir->type);
2139
2140    this->result.reg_offset += offset;
2141 }
2142
2143 /**
2144  * We want to be careful in assignment setup to hit the actual storage
2145  * instead of potentially using a temporary like we might with the
2146  * ir_dereference handler.
2147  */
2148 static dst_reg
2149 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2150 {
2151    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2152     * access of a vector, it must be separated into a series conditional moves
2153     * before reaching this point (see ir_vec_index_to_cond_assign).
2154     */
2155    assert(ir->as_dereference());
2156    ir_dereference_array *deref_array = ir->as_dereference_array();
2157    if (deref_array) {
2158       assert(!deref_array->array->type->is_vector());
2159    }
2160
2161    /* Use the rvalue deref handler for the most part.  We'll ignore
2162     * swizzles in it and write swizzles using writemask, though.
2163     */
2164    ir->accept(v);
2165    return dst_reg(v->result);
2166 }
2167
2168 void
2169 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2170                               const struct glsl_type *type,
2171                               enum brw_predicate predicate)
2172 {
2173    if (type->base_type == GLSL_TYPE_STRUCT) {
2174       for (unsigned int i = 0; i < type->length; i++) {
2175          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2176       }
2177       return;
2178    }
2179
2180    if (type->is_array()) {
2181       for (unsigned int i = 0; i < type->length; i++) {
2182          emit_block_move(dst, src, type->fields.array, predicate);
2183       }
2184       return;
2185    }
2186
2187    if (type->is_matrix()) {
2188       const struct glsl_type *vec_type;
2189
2190       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2191                                          type->vector_elements, 1);
2192
2193       for (int i = 0; i < type->matrix_columns; i++) {
2194          emit_block_move(dst, src, vec_type, predicate);
2195       }
2196       return;
2197    }
2198
2199    assert(type->is_scalar() || type->is_vector());
2200
2201    dst->type = brw_type_for_base_type(type);
2202    src->type = dst->type;
2203
2204    dst->writemask = (1 << type->vector_elements) - 1;
2205
2206    src->swizzle = brw_swizzle_for_size(type->vector_elements);
2207
2208    vec4_instruction *inst = emit(MOV(*dst, *src));
2209    inst->predicate = predicate;
2210
2211    dst->reg_offset++;
2212    src->reg_offset++;
2213 }
2214
2215
2216 /* If the RHS processing resulted in an instruction generating a
2217  * temporary value, and it would be easy to rewrite the instruction to
2218  * generate its result right into the LHS instead, do so.  This ends
2219  * up reliably removing instructions where it can be tricky to do so
2220  * later without real UD chain information.
2221  */
2222 bool
2223 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2224                                      dst_reg dst,
2225                                      src_reg src,
2226                                      vec4_instruction *pre_rhs_inst,
2227                                      vec4_instruction *last_rhs_inst)
2228 {
2229    /* This could be supported, but it would take more smarts. */
2230    if (ir->condition)
2231       return false;
2232
2233    if (pre_rhs_inst == last_rhs_inst)
2234       return false; /* No instructions generated to work with. */
2235
2236    /* Make sure the last instruction generated our source reg. */
2237    if (src.file != GRF ||
2238        src.file != last_rhs_inst->dst.file ||
2239        src.reg != last_rhs_inst->dst.reg ||
2240        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2241        src.reladdr ||
2242        src.abs ||
2243        src.negate ||
2244        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2245       return false;
2246
2247    /* Check that that last instruction fully initialized the channels
2248     * we want to use, in the order we want to use them.  We could
2249     * potentially reswizzle the operands of many instructions so that
2250     * we could handle out of order channels, but don't yet.
2251     */
2252
2253    for (unsigned i = 0; i < 4; i++) {
2254       if (dst.writemask & (1 << i)) {
2255          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2256             return false;
2257
2258          if (BRW_GET_SWZ(src.swizzle, i) != i)
2259             return false;
2260       }
2261    }
2262
2263    /* Success!  Rewrite the instruction. */
2264    last_rhs_inst->dst.file = dst.file;
2265    last_rhs_inst->dst.reg = dst.reg;
2266    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2267    last_rhs_inst->dst.reladdr = dst.reladdr;
2268    last_rhs_inst->dst.writemask &= dst.writemask;
2269
2270    return true;
2271 }
2272
2273 void
2274 vec4_visitor::visit(ir_assignment *ir)
2275 {
2276    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2277    enum brw_predicate predicate = BRW_PREDICATE_NONE;
2278
2279    if (!ir->lhs->type->is_scalar() &&
2280        !ir->lhs->type->is_vector()) {
2281       ir->rhs->accept(this);
2282       src_reg src = this->result;
2283
2284       if (ir->condition) {
2285          emit_bool_to_cond_code(ir->condition, &predicate);
2286       }
2287
2288       /* emit_block_move doesn't account for swizzles in the source register.
2289        * This should be ok, since the source register is a structure or an
2290        * array, and those can't be swizzled.  But double-check to be sure.
2291        */
2292       assert(src.swizzle ==
2293              (ir->rhs->type->is_matrix()
2294               ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2295               : BRW_SWIZZLE_NOOP));
2296
2297       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2298       return;
2299    }
2300
2301    /* Now we're down to just a scalar/vector with writemasks. */
2302    int i;
2303
2304    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2305    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2306
2307    ir->rhs->accept(this);
2308
2309    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2310
2311    int swizzles[4];
2312    int src_chan = 0;
2313
2314    assert(ir->lhs->type->is_vector() ||
2315           ir->lhs->type->is_scalar());
2316    dst.writemask = ir->write_mask;
2317
2318    /* Swizzle a small RHS vector into the channels being written.
2319     *
2320     * glsl ir treats write_mask as dictating how many channels are
2321     * present on the RHS while in our instructions we need to make
2322     * those channels appear in the slots of the vec4 they're written to.
2323     */
2324    for (int i = 0; i < 4; i++)
2325       swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2326
2327    src_reg src = swizzle(this->result,
2328                          BRW_SWIZZLE4(swizzles[0], swizzles[1],
2329                                       swizzles[2], swizzles[3]));
2330
2331    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2332       return;
2333    }
2334
2335    if (ir->condition) {
2336       emit_bool_to_cond_code(ir->condition, &predicate);
2337    }
2338
2339    for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2340       vec4_instruction *inst = emit(MOV(dst, src));
2341       inst->predicate = predicate;
2342
2343       dst.reg_offset++;
2344       src.reg_offset++;
2345    }
2346 }
2347
2348 void
2349 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2350 {
2351    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2352       foreach_in_list(ir_constant, field_value, &ir->components) {
2353          emit_constant_values(dst, field_value);
2354       }
2355       return;
2356    }
2357
2358    if (ir->type->is_array()) {
2359       for (unsigned int i = 0; i < ir->type->length; i++) {
2360          emit_constant_values(dst, ir->array_elements[i]);
2361       }
2362       return;
2363    }
2364
2365    if (ir->type->is_matrix()) {
2366       for (int i = 0; i < ir->type->matrix_columns; i++) {
2367          float *vec = &ir->value.f[i * ir->type->vector_elements];
2368
2369          for (int j = 0; j < ir->type->vector_elements; j++) {
2370             dst->writemask = 1 << j;
2371             dst->type = BRW_REGISTER_TYPE_F;
2372
2373             emit(MOV(*dst, src_reg(vec[j])));
2374          }
2375          dst->reg_offset++;
2376       }
2377       return;
2378    }
2379
2380    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2381
2382    for (int i = 0; i < ir->type->vector_elements; i++) {
2383       if (!(remaining_writemask & (1 << i)))
2384          continue;
2385
2386       dst->writemask = 1 << i;
2387       dst->type = brw_type_for_base_type(ir->type);
2388
2389       /* Find other components that match the one we're about to
2390        * write.  Emits fewer instructions for things like vec4(0.5,
2391        * 1.5, 1.5, 1.5).
2392        */
2393       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2394          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2395             if (ir->value.b[i] == ir->value.b[j])
2396                dst->writemask |= (1 << j);
2397          } else {
2398             /* u, i, and f storage all line up, so no need for a
2399              * switch case for comparing each type.
2400              */
2401             if (ir->value.u[i] == ir->value.u[j])
2402                dst->writemask |= (1 << j);
2403          }
2404       }
2405
2406       switch (ir->type->base_type) {
2407       case GLSL_TYPE_FLOAT:
2408          emit(MOV(*dst, src_reg(ir->value.f[i])));
2409          break;
2410       case GLSL_TYPE_INT:
2411          emit(MOV(*dst, src_reg(ir->value.i[i])));
2412          break;
2413       case GLSL_TYPE_UINT:
2414          emit(MOV(*dst, src_reg(ir->value.u[i])));
2415          break;
2416       case GLSL_TYPE_BOOL:
2417          emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2418          break;
2419       default:
2420          unreachable("Non-float/uint/int/bool constant");
2421       }
2422
2423       remaining_writemask &= ~dst->writemask;
2424    }
2425    dst->reg_offset++;
2426 }
2427
2428 void
2429 vec4_visitor::visit(ir_constant *ir)
2430 {
2431    dst_reg dst = dst_reg(this, ir->type);
2432    this->result = src_reg(dst);
2433
2434    emit_constant_values(&dst, ir);
2435 }
2436
2437 void
2438 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2439 {
2440    ir_dereference *deref = static_cast<ir_dereference *>(
2441       ir->actual_parameters.get_head());
2442    ir_variable *location = deref->variable_referenced();
2443    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2444                           location->data.binding);
2445
2446    /* Calculate the surface offset */
2447    src_reg offset(this, glsl_type::uint_type);
2448    ir_dereference_array *deref_array = deref->as_dereference_array();
2449    if (deref_array) {
2450       deref_array->array_index->accept(this);
2451
2452       src_reg tmp(this, glsl_type::uint_type);
2453       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2454       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2455    } else {
2456       offset = location->data.atomic.offset;
2457    }
2458
2459    /* Emit the appropriate machine instruction */
2460    const char *callee = ir->callee->function_name();
2461    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2462
2463    if (!strcmp("__intrinsic_atomic_read", callee)) {
2464       emit_untyped_surface_read(surf_index, dst, offset);
2465
2466    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2467       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2468                           src_reg(), src_reg());
2469
2470    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2471       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2472                           src_reg(), src_reg());
2473    }
2474
2475    brw_mark_surface_used(stage_prog_data, surf_index);
2476 }
2477
2478 void
2479 vec4_visitor::visit(ir_call *ir)
2480 {
2481    const char *callee = ir->callee->function_name();
2482
2483    if (!strcmp("__intrinsic_atomic_read", callee) ||
2484        !strcmp("__intrinsic_atomic_increment", callee) ||
2485        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2486       visit_atomic_counter_intrinsic(ir);
2487    } else {
2488       unreachable("Unsupported intrinsic.");
2489    }
2490 }
2491
2492 src_reg
2493 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2494                              src_reg coordinate, src_reg sampler)
2495 {
2496    vec4_instruction *inst =
2497       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2498                                     dst_reg(this, glsl_type::uvec4_type));
2499    inst->base_mrf = 2;
2500    inst->src[1] = sampler;
2501
2502    int param_base;
2503
2504    if (devinfo->gen >= 9) {
2505       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2506       vec4_instruction *header_inst = new(mem_ctx)
2507          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2508                           dst_reg(MRF, inst->base_mrf));
2509
2510       emit(header_inst);
2511
2512       inst->mlen = 2;
2513       inst->header_size = 1;
2514       param_base = inst->base_mrf + 1;
2515    } else {
2516       inst->mlen = 1;
2517       param_base = inst->base_mrf;
2518    }
2519
2520    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2521    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2522    int zero_mask = 0xf & ~coord_mask;
2523
2524    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2525             coordinate));
2526
2527    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2528             src_reg(0)));
2529
2530    emit(inst);
2531    return src_reg(inst->dst);
2532 }
2533
2534 bool
2535 vec4_visitor::is_high_sampler(src_reg sampler)
2536 {
2537    if (devinfo->gen < 8 && !devinfo->is_haswell)
2538       return false;
2539
2540    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2541 }
2542
2543 void
2544 vec4_visitor::emit_texture(ir_texture_opcode op,
2545                            dst_reg dest,
2546                            const glsl_type *dest_type,
2547                            src_reg coordinate,
2548                            int coord_components,
2549                            src_reg shadow_comparitor,
2550                            src_reg lod, src_reg lod2,
2551                            src_reg sample_index,
2552                            uint32_t constant_offset,
2553                            src_reg offset_value,
2554                            src_reg mcs,
2555                            bool is_cube_array,
2556                            uint32_t sampler,
2557                            src_reg sampler_reg)
2558 {
2559    enum opcode opcode;
2560    switch (op) {
2561    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2562    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2563    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2564    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2565    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2566    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2567    case ir_tg4: opcode = offset_value.file != BAD_FILE
2568                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2569    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2570    case ir_txb:
2571       unreachable("TXB is not valid for vertex shaders.");
2572    case ir_lod:
2573       unreachable("LOD is not valid for vertex shaders.");
2574    default:
2575       unreachable("Unrecognized tex op");
2576    }
2577
2578    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2579       opcode, dst_reg(this, dest_type));
2580
2581    inst->offset = constant_offset;
2582
2583    /* The message header is necessary for:
2584     * - Gen4 (always)
2585     * - Gen9+ for selecting SIMD4x2
2586     * - Texel offsets
2587     * - Gather channel selection
2588     * - Sampler indices too large to fit in a 4-bit value.
2589     */
2590    inst->header_size =
2591       (devinfo->gen < 5 || devinfo->gen >= 9 ||
2592        inst->offset != 0 || op == ir_tg4 ||
2593        is_high_sampler(sampler_reg)) ? 1 : 0;
2594    inst->base_mrf = 2;
2595    inst->mlen = inst->header_size + 1; /* always at least one */
2596    inst->dst.writemask = WRITEMASK_XYZW;
2597    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2598
2599    inst->src[1] = sampler_reg;
2600
2601    /* MRF for the first parameter */
2602    int param_base = inst->base_mrf + inst->header_size;
2603
2604    if (op == ir_txs || op == ir_query_levels) {
2605       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2606       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2607    } else {
2608       /* Load the coordinate */
2609       /* FINISHME: gl_clamp_mask and saturate */
2610       int coord_mask = (1 << coord_components) - 1;
2611       int zero_mask = 0xf & ~coord_mask;
2612
2613       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2614                coordinate));
2615
2616       if (zero_mask != 0) {
2617          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2618                   src_reg(0)));
2619       }
2620       /* Load the shadow comparitor */
2621       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2622          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2623                           WRITEMASK_X),
2624                   shadow_comparitor));
2625          inst->mlen++;
2626       }
2627
2628       /* Load the LOD info */
2629       if (op == ir_tex || op == ir_txl) {
2630          int mrf, writemask;
2631          if (devinfo->gen >= 5) {
2632             mrf = param_base + 1;
2633             if (shadow_comparitor.file != BAD_FILE) {
2634                writemask = WRITEMASK_Y;
2635                /* mlen already incremented */
2636             } else {
2637                writemask = WRITEMASK_X;
2638                inst->mlen++;
2639             }
2640          } else /* devinfo->gen == 4 */ {
2641             mrf = param_base;
2642             writemask = WRITEMASK_W;
2643          }
2644          lod.swizzle = BRW_SWIZZLE_XXXX;
2645          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2646       } else if (op == ir_txf) {
2647          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2648       } else if (op == ir_txf_ms) {
2649          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2650                   sample_index));
2651          if (devinfo->gen >= 7) {
2652             /* MCS data is in the first channel of `mcs`, but we need to get it into
2653              * the .y channel of the second vec4 of params, so replicate .x across
2654              * the whole vec4 and then mask off everything except .y
2655              */
2656             mcs.swizzle = BRW_SWIZZLE_XXXX;
2657             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2658                      mcs));
2659          }
2660          inst->mlen++;
2661       } else if (op == ir_txd) {
2662          const brw_reg_type type = lod.type;
2663
2664          if (devinfo->gen >= 5) {
2665             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2666             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2667             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2668             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2669             inst->mlen++;
2670
2671             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2672                lod.swizzle = BRW_SWIZZLE_ZZZZ;
2673                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2674                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2675                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2676                inst->mlen++;
2677
2678                if (shadow_comparitor.file != BAD_FILE) {
2679                   emit(MOV(dst_reg(MRF, param_base + 2,
2680                                    shadow_comparitor.type, WRITEMASK_Z),
2681                            shadow_comparitor));
2682                }
2683             }
2684          } else /* devinfo->gen == 4 */ {
2685             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2686             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2687             inst->mlen += 2;
2688          }
2689       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2690          if (shadow_comparitor.file != BAD_FILE) {
2691             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2692                      shadow_comparitor));
2693          }
2694
2695          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2696                   offset_value));
2697          inst->mlen++;
2698       }
2699    }
2700
2701    emit(inst);
2702
2703    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2704     * spec requires layers.
2705     */
2706    if (op == ir_txs && is_cube_array) {
2707       emit_math(SHADER_OPCODE_INT_QUOTIENT,
2708                 writemask(inst->dst, WRITEMASK_Z),
2709                 src_reg(inst->dst), src_reg(6));
2710    }
2711
2712    if (devinfo->gen == 6 && op == ir_tg4) {
2713       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2714    }
2715
2716    swizzle_result(op, dest,
2717                   src_reg(inst->dst), sampler, dest_type);
2718 }
2719
2720 void
2721 vec4_visitor::visit(ir_texture *ir)
2722 {
2723    uint32_t sampler =
2724       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2725
2726    ir_rvalue *nonconst_sampler_index =
2727       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2728
2729    /* Handle non-constant sampler array indexing */
2730    src_reg sampler_reg;
2731    if (nonconst_sampler_index) {
2732       /* The highest sampler which may be used by this operation is
2733        * the last element of the array. Mark it here, because the generator
2734        * doesn't have enough information to determine the bound.
2735        */
2736       uint32_t array_size = ir->sampler->as_dereference_array()
2737          ->array->type->array_size();
2738
2739       uint32_t max_used = sampler + array_size - 1;
2740       if (ir->op == ir_tg4 && devinfo->gen < 8) {
2741          max_used += prog_data->base.binding_table.gather_texture_start;
2742       } else {
2743          max_used += prog_data->base.binding_table.texture_start;
2744       }
2745
2746       brw_mark_surface_used(&prog_data->base, max_used);
2747
2748       /* Emit code to evaluate the actual indexing expression */
2749       nonconst_sampler_index->accept(this);
2750       src_reg temp(this, glsl_type::uint_type);
2751       emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2752       sampler_reg = emit_uniformize(temp);
2753    } else {
2754       /* Single sampler, or constant array index; the indexing expression
2755        * is just an immediate.
2756        */
2757       sampler_reg = src_reg(sampler);
2758    }
2759
2760    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2761     * emitting anything other than setting up the constant result.
2762     */
2763    if (ir->op == ir_tg4) {
2764       ir_constant *chan = ir->lod_info.component->as_constant();
2765       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2766       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2767          dst_reg result(this, ir->type);
2768          this->result = src_reg(result);
2769          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2770          return;
2771       }
2772    }
2773
2774    /* Should be lowered by do_lower_texture_projection */
2775    assert(!ir->projector);
2776
2777    /* Should be lowered */
2778    assert(!ir->offset || !ir->offset->type->is_array());
2779
2780    /* Generate code to compute all the subexpression trees.  This has to be
2781     * done before loading any values into MRFs for the sampler message since
2782     * generating these values may involve SEND messages that need the MRFs.
2783     */
2784    src_reg coordinate;
2785    int coord_components = 0;
2786    if (ir->coordinate) {
2787       coord_components = ir->coordinate->type->vector_elements;
2788       ir->coordinate->accept(this);
2789       coordinate = this->result;
2790    }
2791
2792    src_reg shadow_comparitor;
2793    if (ir->shadow_comparitor) {
2794       ir->shadow_comparitor->accept(this);
2795       shadow_comparitor = this->result;
2796    }
2797
2798    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2799    src_reg offset_value;
2800    if (has_nonconstant_offset) {
2801       ir->offset->accept(this);
2802       offset_value = src_reg(this->result);
2803    }
2804
2805    src_reg lod, lod2, sample_index, mcs;
2806    switch (ir->op) {
2807    case ir_tex:
2808       lod = src_reg(0.0f);
2809       break;
2810    case ir_txf:
2811    case ir_txl:
2812    case ir_txs:
2813       ir->lod_info.lod->accept(this);
2814       lod = this->result;
2815       break;
2816    case ir_query_levels:
2817       lod = src_reg(0);
2818       break;
2819    case ir_txf_ms:
2820       ir->lod_info.sample_index->accept(this);
2821       sample_index = this->result;
2822
2823       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2824          mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2825       else
2826          mcs = src_reg(0u);
2827       break;
2828    case ir_txd:
2829       ir->lod_info.grad.dPdx->accept(this);
2830       lod = this->result;
2831
2832       ir->lod_info.grad.dPdy->accept(this);
2833       lod2 = this->result;
2834       break;
2835    case ir_txb:
2836    case ir_lod:
2837    case ir_tg4:
2838       break;
2839    }
2840
2841    uint32_t constant_offset = 0;
2842    if (ir->offset != NULL && !has_nonconstant_offset) {
2843       constant_offset  =
2844          brw_texture_offset(ir->offset->as_constant()->value.i,
2845                             ir->offset->type->vector_elements);
2846    }
2847
2848    /* Stuff the channel select bits in the top of the texture offset */
2849    if (ir->op == ir_tg4)
2850       constant_offset |=
2851          gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2852                          sampler) << 16;
2853
2854    glsl_type const *type = ir->sampler->type;
2855    bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2856       type->sampler_array;
2857
2858    this->result = src_reg(this, ir->type);
2859    dst_reg dest = dst_reg(this->result);
2860
2861    emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2862                 shadow_comparitor,
2863                 lod, lod2, sample_index,
2864                 constant_offset, offset_value,
2865                 mcs, is_cube_array, sampler, sampler_reg);
2866 }
2867
2868 /**
2869  * Apply workarounds for Gen6 gather with UINT/SINT
2870  */
2871 void
2872 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2873 {
2874    if (!wa)
2875       return;
2876
2877    int width = (wa & WA_8BIT) ? 8 : 16;
2878    dst_reg dst_f = dst;
2879    dst_f.type = BRW_REGISTER_TYPE_F;
2880
2881    /* Convert from UNORM to UINT */
2882    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2883    emit(MOV(dst, src_reg(dst_f)));
2884
2885    if (wa & WA_SIGN) {
2886       /* Reinterpret the UINT value as a signed INT value by
2887        * shifting the sign bit into place, then shifting back
2888        * preserving sign.
2889        */
2890       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2891       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2892    }
2893 }
2894
2895 /**
2896  * Set up the gather channel based on the swizzle, for gather4.
2897  */
2898 uint32_t
2899 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2900 {
2901    int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2902    switch (swiz) {
2903       case SWIZZLE_X: return 0;
2904       case SWIZZLE_Y:
2905          /* gather4 sampler is broken for green channel on RG32F --
2906           * we must ask for blue instead.
2907           */
2908          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2909             return 2;
2910          return 1;
2911       case SWIZZLE_Z: return 2;
2912       case SWIZZLE_W: return 3;
2913       default:
2914          unreachable("Not reached"); /* zero, one swizzles handled already */
2915    }
2916 }
2917
2918 void
2919 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2920                              src_reg orig_val, uint32_t sampler,
2921                              const glsl_type *dest_type)
2922 {
2923    int s = key->tex.swizzles[sampler];
2924
2925    dst_reg swizzled_result = dest;
2926
2927    if (op == ir_query_levels) {
2928       /* # levels is in .w */
2929       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2930       emit(MOV(swizzled_result, orig_val));
2931       return;
2932    }
2933
2934    if (op == ir_txs || dest_type == glsl_type::float_type
2935                         || s == SWIZZLE_NOOP || op == ir_tg4) {
2936       emit(MOV(swizzled_result, orig_val));
2937       return;
2938    }
2939
2940
2941    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2942    int swizzle[4] = {0};
2943
2944    for (int i = 0; i < 4; i++) {
2945       switch (GET_SWZ(s, i)) {
2946       case SWIZZLE_ZERO:
2947          zero_mask |= (1 << i);
2948          break;
2949       case SWIZZLE_ONE:
2950          one_mask |= (1 << i);
2951          break;
2952       default:
2953          copy_mask |= (1 << i);
2954          swizzle[i] = GET_SWZ(s, i);
2955          break;
2956       }
2957    }
2958
2959    if (copy_mask) {
2960       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2961       swizzled_result.writemask = copy_mask;
2962       emit(MOV(swizzled_result, orig_val));
2963    }
2964
2965    if (zero_mask) {
2966       swizzled_result.writemask = zero_mask;
2967       emit(MOV(swizzled_result, src_reg(0.0f)));
2968    }
2969
2970    if (one_mask) {
2971       swizzled_result.writemask = one_mask;
2972       emit(MOV(swizzled_result, src_reg(1.0f)));
2973    }
2974 }
2975
2976 void
2977 vec4_visitor::visit(ir_return *)
2978 {
2979    unreachable("not reached");
2980 }
2981
2982 void
2983 vec4_visitor::visit(ir_discard *)
2984 {
2985    unreachable("not reached");
2986 }
2987
2988 void
2989 vec4_visitor::visit(ir_if *ir)
2990 {
2991    /* Don't point the annotation at the if statement, because then it plus
2992     * the then and else blocks get printed.
2993     */
2994    this->base_ir = ir->condition;
2995
2996    if (devinfo->gen == 6) {
2997       emit_if_gen6(ir);
2998    } else {
2999       enum brw_predicate predicate;
3000       emit_bool_to_cond_code(ir->condition, &predicate);
3001       emit(IF(predicate));
3002    }
3003
3004    visit_instructions(&ir->then_instructions);
3005
3006    if (!ir->else_instructions.is_empty()) {
3007       this->base_ir = ir->condition;
3008       emit(BRW_OPCODE_ELSE);
3009
3010       visit_instructions(&ir->else_instructions);
3011    }
3012
3013    this->base_ir = ir->condition;
3014    emit(BRW_OPCODE_ENDIF);
3015 }
3016
3017 void
3018 vec4_visitor::gs_emit_vertex(int stream_id)
3019 {
3020    unreachable("not reached");
3021 }
3022
3023 void
3024 vec4_visitor::visit(ir_emit_vertex *)
3025 {
3026    unreachable("not reached");
3027 }
3028
3029 void
3030 vec4_visitor::gs_end_primitive()
3031 {
3032    unreachable("not reached");
3033 }
3034
3035
3036 void
3037 vec4_visitor::visit(ir_end_primitive *)
3038 {
3039    unreachable("not reached");
3040 }
3041
3042 void
3043 vec4_visitor::visit(ir_barrier *)
3044 {
3045    unreachable("not reached");
3046 }
3047
3048 void
3049 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3050                                   dst_reg dst, src_reg offset,
3051                                   src_reg src0, src_reg src1)
3052 {
3053    unsigned mlen = 0;
3054
3055    /* Set the atomic operation offset. */
3056    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3057    mlen++;
3058
3059    /* Set the atomic operation arguments. */
3060    if (src0.file != BAD_FILE) {
3061       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3062       mlen++;
3063    }
3064
3065    if (src1.file != BAD_FILE) {
3066       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3067       mlen++;
3068    }
3069
3070    /* Emit the instruction.  Note that this maps to the normal SIMD8
3071     * untyped atomic message on Ivy Bridge, but that's OK because
3072     * unused channels will be masked out.
3073     */
3074    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3075                                  brw_message_reg(0),
3076                                  src_reg(surf_index), src_reg(atomic_op));
3077    inst->mlen = mlen;
3078 }
3079
3080 void
3081 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3082                                         src_reg offset)
3083 {
3084    /* Set the surface read offset. */
3085    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3086
3087    /* Emit the instruction.  Note that this maps to the normal SIMD8
3088     * untyped surface read message, but that's OK because unused
3089     * channels will be masked out.
3090     */
3091    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3092                                  brw_message_reg(0),
3093                                  src_reg(surf_index), src_reg(1));
3094    inst->mlen = 1;
3095 }
3096
3097 void
3098 vec4_visitor::emit_ndc_computation()
3099 {
3100    /* Get the position */
3101    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3102
3103    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3104    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3105    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3106
3107    current_annotation = "NDC";
3108    dst_reg ndc_w = ndc;
3109    ndc_w.writemask = WRITEMASK_W;
3110    src_reg pos_w = pos;
3111    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3112    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3113
3114    dst_reg ndc_xyz = ndc;
3115    ndc_xyz.writemask = WRITEMASK_XYZ;
3116
3117    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3118 }
3119
3120 void
3121 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3122 {
3123    if (devinfo->gen < 6 &&
3124        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3125         key->userclip_active || devinfo->has_negative_rhw_bug)) {
3126       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3127       dst_reg header1_w = header1;
3128       header1_w.writemask = WRITEMASK_W;
3129
3130       emit(MOV(header1, 0u));
3131
3132       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3133          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3134
3135          current_annotation = "Point size";
3136          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3137          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3138       }
3139
3140       if (key->userclip_active) {
3141          current_annotation = "Clipping flags";
3142          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3143          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3144
3145          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3146          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3147          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3148
3149          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3150          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3151          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3152          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3153       }
3154
3155       /* i965 clipping workaround:
3156        * 1) Test for -ve rhw
3157        * 2) If set,
3158        *      set ndc = (0,0,0,0)
3159        *      set ucp[6] = 1
3160        *
3161        * Later, clipping will detect ucp[6] and ensure the primitive is
3162        * clipped against all fixed planes.
3163        */
3164       if (devinfo->has_negative_rhw_bug) {
3165          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3166          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3167          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3168          vec4_instruction *inst;
3169          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3170          inst->predicate = BRW_PREDICATE_NORMAL;
3171          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3172          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3173          inst->predicate = BRW_PREDICATE_NORMAL;
3174       }
3175
3176       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3177    } else if (devinfo->gen < 6) {
3178       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3179    } else {
3180       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3181       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3182          dst_reg reg_w = reg;
3183          reg_w.writemask = WRITEMASK_W;
3184          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3185          reg_as_src.type = reg_w.type;
3186          reg_as_src.swizzle = brw_swizzle_for_size(1);
3187          emit(MOV(reg_w, reg_as_src));
3188       }
3189       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3190          dst_reg reg_y = reg;
3191          reg_y.writemask = WRITEMASK_Y;
3192          reg_y.type = BRW_REGISTER_TYPE_D;
3193          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3194          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3195       }
3196       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3197          dst_reg reg_z = reg;
3198          reg_z.writemask = WRITEMASK_Z;
3199          reg_z.type = BRW_REGISTER_TYPE_D;
3200          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3201          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3202       }
3203    }
3204 }
3205
3206 void
3207 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3208 {
3209    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3210     *
3211     *     "If a linked set of shaders forming the vertex stage contains no
3212     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3213     *     application has requested clipping against user clip planes through
3214     *     the API, then the coordinate written to gl_Position is used for
3215     *     comparison against the user clip planes."
3216     *
3217     * This function is only called if the shader didn't write to
3218     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3219     * if the user wrote to it; otherwise we use gl_Position.
3220     */
3221    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3222    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3223       clip_vertex = VARYING_SLOT_POS;
3224    }
3225
3226    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3227         ++i) {
3228       reg.writemask = 1 << i;
3229       emit(DP4(reg,
3230                src_reg(output_reg[clip_vertex]),
3231                src_reg(this->userplane[i + offset])));
3232    }
3233 }
3234
3235 vec4_instruction *
3236 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3237 {
3238    assert(varying < VARYING_SLOT_MAX);
3239    assert(output_reg[varying].type == reg.type);
3240    current_annotation = output_reg_annotation[varying];
3241    /* Copy the register, saturating if necessary */
3242    return emit(MOV(reg, src_reg(output_reg[varying])));
3243 }
3244
3245 void
3246 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3247 {
3248    reg.type = BRW_REGISTER_TYPE_F;
3249    output_reg[varying].type = reg.type;
3250
3251    switch (varying) {
3252    case VARYING_SLOT_PSIZ:
3253    {
3254       /* PSIZ is always in slot 0, and is coupled with other flags. */
3255       current_annotation = "indices, point width, clip flags";
3256       emit_psiz_and_flags(reg);
3257       break;
3258    }
3259    case BRW_VARYING_SLOT_NDC:
3260       current_annotation = "NDC";
3261       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3262       break;
3263    case VARYING_SLOT_POS:
3264       current_annotation = "gl_Position";
3265       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3266       break;
3267    case VARYING_SLOT_EDGE:
3268       /* This is present when doing unfilled polygons.  We're supposed to copy
3269        * the edge flag from the user-provided vertex array
3270        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3271        * of that attribute (starts as 1.0f).  This is then used in clipping to
3272        * determine which edges should be drawn as wireframe.
3273        */
3274       current_annotation = "edge flag";
3275       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3276                                     glsl_type::float_type, WRITEMASK_XYZW))));
3277       break;
3278    case BRW_VARYING_SLOT_PAD:
3279       /* No need to write to this slot */
3280       break;
3281    case VARYING_SLOT_COL0:
3282    case VARYING_SLOT_COL1:
3283    case VARYING_SLOT_BFC0:
3284    case VARYING_SLOT_BFC1: {
3285       /* These built-in varyings are only supported in compatibility mode,
3286        * and we only support GS in core profile.  So, this must be a vertex
3287        * shader.
3288        */
3289       assert(stage == MESA_SHADER_VERTEX);
3290       vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3291       if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3292          inst->saturate = true;
3293       break;
3294    }
3295
3296    default:
3297       emit_generic_urb_slot(reg, varying);
3298       break;
3299    }
3300 }
3301
3302 static int
3303 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3304 {
3305    if (devinfo->gen >= 6) {
3306       /* URB data written (does not include the message header reg) must
3307        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
3308        * section 5.4.3.2.2: URB_INTERLEAVED.
3309        *
3310        * URB entries are allocated on a multiple of 1024 bits, so an
3311        * extra 128 bits written here to make the end align to 256 is
3312        * no problem.
3313        */
3314       if ((mlen % 2) != 1)
3315          mlen++;
3316    }
3317
3318    return mlen;
3319 }
3320
3321
3322 /**
3323  * Generates the VUE payload plus the necessary URB write instructions to
3324  * output it.
3325  *
3326  * The VUE layout is documented in Volume 2a.
3327  */
3328 void
3329 vec4_visitor::emit_vertex()
3330 {
3331    /* MRF 0 is reserved for the debugger, so start with message header
3332     * in MRF 1.
3333     */
3334    int base_mrf = 1;
3335    int mrf = base_mrf;
3336    /* In the process of generating our URB write message contents, we
3337     * may need to unspill a register or load from an array.  Those
3338     * reads would use MRFs 14-15.
3339     */
3340    int max_usable_mrf = 13;
3341
3342    /* The following assertion verifies that max_usable_mrf causes an
3343     * even-numbered amount of URB write data, which will meet gen6's
3344     * requirements for length alignment.
3345     */
3346    assert ((max_usable_mrf - base_mrf) % 2 == 0);
3347
3348    /* First mrf is the g0-based message header containing URB handles and
3349     * such.
3350     */
3351    emit_urb_write_header(mrf++);
3352
3353    if (devinfo->gen < 6) {
3354       emit_ndc_computation();
3355    }
3356
3357    /* Lower legacy ff and ClipVertex clipping to clip distances */
3358    if (key->userclip_active && !prog->UsesClipDistanceOut) {
3359       current_annotation = "user clip distances";
3360
3361       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3362       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3363
3364       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3365       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3366    }
3367
3368    /* We may need to split this up into several URB writes, so do them in a
3369     * loop.
3370     */
3371    int slot = 0;
3372    bool complete = false;
3373    do {
3374       /* URB offset is in URB row increments, and each of our MRFs is half of
3375        * one of those, since we're doing interleaved writes.
3376        */
3377       int offset = slot / 2;
3378
3379       mrf = base_mrf + 1;
3380       for (; slot < prog_data->vue_map.num_slots; ++slot) {
3381          emit_urb_slot(dst_reg(MRF, mrf++),
3382                        prog_data->vue_map.slot_to_varying[slot]);
3383
3384          /* If this was max_usable_mrf, we can't fit anything more into this
3385           * URB WRITE.
3386           */
3387          if (mrf > max_usable_mrf) {
3388             slot++;
3389             break;
3390          }
3391       }
3392
3393       complete = slot >= prog_data->vue_map.num_slots;
3394       current_annotation = "URB write";
3395       vec4_instruction *inst = emit_urb_write_opcode(complete);
3396       inst->base_mrf = base_mrf;
3397       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3398       inst->offset += offset;
3399    } while(!complete);
3400 }
3401
3402
3403 src_reg
3404 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3405                                  src_reg *reladdr, int reg_offset)
3406 {
3407    /* Because we store the values to scratch interleaved like our
3408     * vertex data, we need to scale the vec4 index by 2.
3409     */
3410    int message_header_scale = 2;
3411
3412    /* Pre-gen6, the message header uses byte offsets instead of vec4
3413     * (16-byte) offset units.
3414     */
3415    if (devinfo->gen < 6)
3416       message_header_scale *= 16;
3417
3418    if (reladdr) {
3419       src_reg index = src_reg(this, glsl_type::int_type);
3420
3421       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3422                                    src_reg(reg_offset)));
3423       emit_before(block, inst, MUL(dst_reg(index), index,
3424                                    src_reg(message_header_scale)));
3425
3426       return index;
3427    } else {
3428       return src_reg(reg_offset * message_header_scale);
3429    }
3430 }
3431
3432 src_reg
3433 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3434                                        src_reg *reladdr, int reg_offset)
3435 {
3436    if (reladdr) {
3437       src_reg index = src_reg(this, glsl_type::int_type);
3438
3439       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3440                                    src_reg(reg_offset)));
3441
3442       /* Pre-gen6, the message header uses byte offsets instead of vec4
3443        * (16-byte) offset units.
3444        */
3445       if (devinfo->gen < 6) {
3446          emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3447       }
3448
3449       return index;
3450    } else if (devinfo->gen >= 8) {
3451       /* Store the offset in a GRF so we can send-from-GRF. */
3452       src_reg offset = src_reg(this, glsl_type::int_type);
3453       emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3454       return offset;
3455    } else {
3456       int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3457       return src_reg(reg_offset * message_header_scale);
3458    }
3459 }
3460
3461 /**
3462  * Emits an instruction before @inst to load the value named by @orig_src
3463  * from scratch space at @base_offset to @temp.
3464  *
3465  * @base_offset is measured in 32-byte units (the size of a register).
3466  */
3467 void
3468 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3469                                 dst_reg temp, src_reg orig_src,
3470                                 int base_offset)
3471 {
3472    int reg_offset = base_offset + orig_src.reg_offset;
3473    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3474                                       reg_offset);
3475
3476    emit_before(block, inst, SCRATCH_READ(temp, index));
3477 }
3478
3479 /**
3480  * Emits an instruction after @inst to store the value to be written
3481  * to @orig_dst to scratch space at @base_offset, from @temp.
3482  *
3483  * @base_offset is measured in 32-byte units (the size of a register).
3484  */
3485 void
3486 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3487                                  int base_offset)
3488 {
3489    int reg_offset = base_offset + inst->dst.reg_offset;
3490    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3491                                       reg_offset);
3492
3493    /* Create a temporary register to store *inst's result in.
3494     *
3495     * We have to be careful in MOVing from our temporary result register in
3496     * the scratch write.  If we swizzle from channels of the temporary that
3497     * weren't initialized, it will confuse live interval analysis, which will
3498     * make spilling fail to make progress.
3499     */
3500    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3501                                        inst->dst.type),
3502                                 brw_swizzle_for_mask(inst->dst.writemask));
3503    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3504                                        inst->dst.writemask));
3505    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3506    if (inst->opcode != BRW_OPCODE_SEL)
3507       write->predicate = inst->predicate;
3508    write->ir = inst->ir;
3509    write->annotation = inst->annotation;
3510    inst->insert_after(block, write);
3511
3512    inst->dst.file = temp.file;
3513    inst->dst.reg = temp.reg;
3514    inst->dst.reg_offset = temp.reg_offset;
3515    inst->dst.reladdr = NULL;
3516 }
3517
3518 /**
3519  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3520  * adds the scratch read(s) before \p inst. The function also checks for
3521  * recursive reladdr scratch accesses, issuing the corresponding scratch
3522  * loads and rewriting reladdr references accordingly.
3523  *
3524  * \return \p src if it did not require a scratch load, otherwise, the
3525  * register holding the result of the scratch load that the caller should
3526  * use to rewrite src.
3527  */
3528 src_reg
3529 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3530                                    vec4_instruction *inst, src_reg src)
3531 {
3532    /* Resolve recursive reladdr scratch access by calling ourselves
3533     * with src.reladdr
3534     */
3535    if (src.reladdr)
3536       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3537                                           *src.reladdr);
3538
3539    /* Now handle scratch access on src */
3540    if (src.file == GRF && scratch_loc[src.reg] != -1) {
3541       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3542       emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3543       src.reg = temp.reg;
3544       src.reg_offset = temp.reg_offset;
3545       src.reladdr = NULL;
3546    }
3547
3548    return src;
3549 }
3550
3551 /**
3552  * We can't generally support array access in GRF space, because a
3553  * single instruction's destination can only span 2 contiguous
3554  * registers.  So, we send all GRF arrays that get variable index
3555  * access to scratch space.
3556  */
3557 void
3558 vec4_visitor::move_grf_array_access_to_scratch()
3559 {
3560    int scratch_loc[this->alloc.count];
3561    memset(scratch_loc, -1, sizeof(scratch_loc));
3562
3563    /* First, calculate the set of virtual GRFs that need to be punted
3564     * to scratch due to having any array access on them, and where in
3565     * scratch.
3566     */
3567    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3568       if (inst->dst.file == GRF && inst->dst.reladdr) {
3569          if (scratch_loc[inst->dst.reg] == -1) {
3570             scratch_loc[inst->dst.reg] = last_scratch;
3571             last_scratch += this->alloc.sizes[inst->dst.reg];
3572          }
3573
3574          for (src_reg *iter = inst->dst.reladdr;
3575               iter->reladdr;
3576               iter = iter->reladdr) {
3577             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3578                scratch_loc[iter->reg] = last_scratch;
3579                last_scratch += this->alloc.sizes[iter->reg];
3580             }
3581          }
3582       }
3583
3584       for (int i = 0 ; i < 3; i++) {
3585          for (src_reg *iter = &inst->src[i];
3586               iter->reladdr;
3587               iter = iter->reladdr) {
3588             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3589                scratch_loc[iter->reg] = last_scratch;
3590                last_scratch += this->alloc.sizes[iter->reg];
3591             }
3592          }
3593       }
3594    }
3595
3596    /* Now, for anything that will be accessed through scratch, rewrite
3597     * it to load/store.  Note that this is a _safe list walk, because
3598     * we may generate a new scratch_write instruction after the one
3599     * we're processing.
3600     */
3601    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3602       /* Set up the annotation tracking for new generated instructions. */
3603       base_ir = inst->ir;
3604       current_annotation = inst->annotation;
3605
3606       /* First handle scratch access on the dst. Notice we have to handle
3607        * the case where the dst's reladdr also points to scratch space.
3608        */
3609       if (inst->dst.reladdr)
3610          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3611                                                    *inst->dst.reladdr);
3612
3613       /* Now that we have handled any (possibly recursive) reladdr scratch
3614        * accesses for dst we can safely do the scratch write for dst itself
3615        */
3616       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3617          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3618
3619       /* Now handle scratch access on any src. In this case, since inst->src[i]
3620        * already is a src_reg, we can just call emit_resolve_reladdr with
3621        * inst->src[i] and it will take care of handling scratch loads for
3622        * both src and src.reladdr (recursively).
3623        */
3624       for (int i = 0 ; i < 3; i++) {
3625          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3626                                              inst->src[i]);
3627       }
3628    }
3629 }
3630
3631 /**
3632  * Emits an instruction before @inst to load the value named by @orig_src
3633  * from the pull constant buffer (surface) at @base_offset to @temp.
3634  */
3635 void
3636 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3637                                       dst_reg temp, src_reg orig_src,
3638                                       int base_offset)
3639 {
3640    int reg_offset = base_offset + orig_src.reg_offset;
3641    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3642    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3643                                              reg_offset);
3644
3645    emit_pull_constant_load_reg(temp,
3646                                index,
3647                                offset,
3648                                block, inst);
3649 }
3650
3651 /**
3652  * Implements array access of uniforms by inserting a
3653  * PULL_CONSTANT_LOAD instruction.
3654  *
3655  * Unlike temporary GRF array access (where we don't support it due to
3656  * the difficulty of doing relative addressing on instruction
3657  * destinations), we could potentially do array access of uniforms
3658  * that were loaded in GRF space as push constants.  In real-world
3659  * usage we've seen, though, the arrays being used are always larger
3660  * than we could load as push constants, so just always move all
3661  * uniform array access out to a pull constant buffer.
3662  */
3663 void
3664 vec4_visitor::move_uniform_array_access_to_pull_constants()
3665 {
3666    int pull_constant_loc[this->uniforms];
3667    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3668    bool nested_reladdr;
3669
3670    /* Walk through and find array access of uniforms.  Put a copy of that
3671     * uniform in the pull constant buffer.
3672     *
3673     * Note that we don't move constant-indexed accesses to arrays.  No
3674     * testing has been done of the performance impact of this choice.
3675     */
3676    do {
3677       nested_reladdr = false;
3678
3679       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3680          for (int i = 0 ; i < 3; i++) {
3681             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3682                continue;
3683
3684             int uniform = inst->src[i].reg;
3685
3686             if (inst->src[i].reladdr->reladdr)
3687                nested_reladdr = true;  /* will need another pass */
3688
3689             /* If this array isn't already present in the pull constant buffer,
3690              * add it.
3691              */
3692             if (pull_constant_loc[uniform] == -1) {
3693                const gl_constant_value **values =
3694                   &stage_prog_data->param[uniform * 4];
3695
3696                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3697
3698                assert(uniform < uniform_array_size);
3699                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3700                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3701                      = values[j];
3702                }
3703             }
3704
3705             /* Set up the annotation tracking for new generated instructions. */
3706             base_ir = inst->ir;
3707             current_annotation = inst->annotation;
3708
3709             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3710
3711             emit_pull_constant_load(block, inst, temp, inst->src[i],
3712                                     pull_constant_loc[uniform]);
3713
3714             inst->src[i].file = temp.file;
3715             inst->src[i].reg = temp.reg;
3716             inst->src[i].reg_offset = temp.reg_offset;
3717             inst->src[i].reladdr = NULL;
3718          }
3719       }
3720    } while (nested_reladdr);
3721
3722    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3723     * no need to track them as larger-than-vec4 objects.  This will be
3724     * relied on in cutting out unused uniform vectors from push
3725     * constants.
3726     */
3727    split_uniform_registers();
3728 }
3729
3730 void
3731 vec4_visitor::resolve_ud_negate(src_reg *reg)
3732 {
3733    if (reg->type != BRW_REGISTER_TYPE_UD ||
3734        !reg->negate)
3735       return;
3736
3737    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3738    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3739    *reg = temp;
3740 }
3741
3742 /**
3743  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3744  *
3745  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3746  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3747  */
3748 void
3749 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3750 {
3751    assert(devinfo->gen <= 5);
3752
3753    if (!rvalue->type->is_boolean())
3754       return;
3755
3756    src_reg and_result = src_reg(this, rvalue->type);
3757    src_reg neg_result = src_reg(this, rvalue->type);
3758    emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3759    emit(MOV(dst_reg(neg_result), negate(and_result)));
3760    *reg = neg_result;
3761 }
3762
3763 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3764                            void *log_data,
3765                            struct gl_program *prog,
3766                            const struct brw_vue_prog_key *key,
3767                            struct brw_vue_prog_data *prog_data,
3768                            struct gl_shader_program *shader_prog,
3769                            gl_shader_stage stage,
3770                            void *mem_ctx,
3771                            bool no_spills,
3772                            int shader_time_index)
3773    : backend_shader(compiler, log_data, mem_ctx,
3774                     shader_prog, prog, &prog_data->base, stage),
3775      key(key),
3776      prog_data(prog_data),
3777      sanity_param_count(0),
3778      fail_msg(NULL),
3779      first_non_payload_grf(0),
3780      need_all_constants_in_pull_buffer(false),
3781      no_spills(no_spills),
3782      shader_time_index(shader_time_index),
3783      last_scratch(0)
3784 {
3785    this->failed = false;
3786
3787    this->base_ir = NULL;
3788    this->current_annotation = NULL;
3789    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3790
3791    this->variable_ht = hash_table_ctor(0,
3792                                        hash_table_pointer_hash,
3793                                        hash_table_pointer_compare);
3794
3795    this->virtual_grf_start = NULL;
3796    this->virtual_grf_end = NULL;
3797    this->live_intervals = NULL;
3798
3799    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3800
3801    this->uniforms = 0;
3802
3803    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3804     * at least one. See setup_uniforms() in brw_vec4.cpp.
3805     */
3806    this->uniform_array_size = 1;
3807    if (prog_data) {
3808       this->uniform_array_size =
3809          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3810    }
3811
3812    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3813    this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3814 }
3815
3816 vec4_visitor::~vec4_visitor()
3817 {
3818    hash_table_dtor(this->variable_ht);
3819 }
3820
3821
3822 void
3823 vec4_visitor::fail(const char *format, ...)
3824 {
3825    va_list va;
3826    char *msg;
3827
3828    if (failed)
3829       return;
3830
3831    failed = true;
3832
3833    va_start(va, format);
3834    msg = ralloc_vasprintf(mem_ctx, format, va);
3835    va_end(va);
3836    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3837
3838    this->fail_msg = msg;
3839
3840    if (debug_enabled) {
3841       fprintf(stderr, "%s",  msg);
3842    }
3843 }
3844
3845 } /* namespace brw */