src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "glsl/ir_uniform.h"
  27 #include "program/sampler.h"
  28
  29 namespace brw {
  30
  31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
  32                                    const src_reg &src0, const src_reg &src1,
  33                                    const src_reg &src2)
  34 {
  35    this->opcode = opcode;
  36    this->dst = dst;
  37    this->src[0] = src0;
  38    this->src[1] = src1;
  39    this->src[2] = src2;
  40    this->saturate = false;
  41    this->force_writemask_all = false;
  42    this->no_dd_clear = false;
  43    this->no_dd_check = false;
  44    this->writes_accumulator = false;
  45    this->conditional_mod = BRW_CONDITIONAL_NONE;
  46    this->predicate = BRW_PREDICATE_NONE;
  47    this->predicate_inverse = false;
  48    this->target = 0;
  49    this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
  50    this->shadow_compare = false;
  51    this->ir = NULL;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_size = 0;
  54    this->flag_subreg = 0;
  55    this->mlen = 0;
  56    this->base_mrf = 0;
  57    this->offset = 0;
  58    this->annotation = NULL;
  59 }
  60
  61 vec4_instruction *
  62 vec4_visitor::emit(vec4_instruction *inst)
  63 {
  64    inst->ir = this->base_ir;
  65    inst->annotation = this->current_annotation;
  66
  67    this->instructions.push_tail(inst);
  68
  69    return inst;
  70 }
  71
  72 vec4_instruction *
  73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
  74                           vec4_instruction *new_inst)
  75 {
  76    new_inst->ir = inst->ir;
  77    new_inst->annotation = inst->annotation;
  78
  79    inst->insert_before(block, new_inst);
  80
  81    return inst;
  82 }
  83
  84 vec4_instruction *
  85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  86                    const src_reg &src1, const src_reg &src2)
  87 {
  88    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
  89 }
  90
  91
  92 vec4_instruction *
  93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
  94                    const src_reg &src1)
  95 {
  96    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
  97 }
  98
  99 vec4_instruction *
 100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 101 {
 102    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
 103 }
 104
 105 vec4_instruction *
 106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 107 {
 108    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
 109 }
 110
 111 vec4_instruction *
 112 vec4_visitor::emit(enum opcode opcode)
 113 {
 114    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
 115 }
 116
 117 #define ALU1(op)                                                        \
 118    vec4_instruction *                                                   \
 119    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
 120    {                                                                    \
 121       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
 122    }
 123
 124 #define ALU2(op)                                                        \
 125    vec4_instruction *                                                   \
 126    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 127                     const src_reg &src1)                                \
 128    {                                                                    \
 129       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 130                                            src0, src1);                 \
 131    }
 132
 133 #define ALU2_ACC(op)                                                    \
 134    vec4_instruction *                                                   \
 135    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 136                     const src_reg &src1)                                \
 137    {                                                                    \
 138       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
 139                        BRW_OPCODE_##op, dst, src0, src1);               \
 140       inst->writes_accumulator = true;                                  \
 141       return inst;                                                      \
 142    }
 143
 144 #define ALU3(op)                                                        \
 145    vec4_instruction *                                                   \
 146    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
 147                     const src_reg &src1, const src_reg &src2)           \
 148    {                                                                    \
 149       assert(devinfo->gen >= 6);                                                \
 150       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
 151                                            src0, src1, src2);           \
 152    }
 153
 154 ALU1(NOT)
 155 ALU1(MOV)
 156 ALU1(FRC)
 157 ALU1(RNDD)
 158 ALU1(RNDE)
 159 ALU1(RNDZ)
 160 ALU1(F32TO16)
 161 ALU1(F16TO32)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2_ACC(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(DP3)
 169 ALU2(DP4)
 170 ALU2(DPH)
 171 ALU2(SHL)
 172 ALU2(SHR)
 173 ALU2(ASR)
 174 ALU3(LRP)
 175 ALU1(BFREV)
 176 ALU3(BFE)
 177 ALU2(BFI1)
 178 ALU3(BFI2)
 179 ALU1(FBH)
 180 ALU1(FBL)
 181 ALU1(CBIT)
 182 ALU3(MAD)
 183 ALU2_ACC(ADDC)
 184 ALU2_ACC(SUBB)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 vec4_instruction *
 189 vec4_visitor::IF(enum brw_predicate predicate)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 vec4_instruction *
 201 vec4_visitor::IF(src_reg src0, src_reg src1,
 202                  enum brw_conditional_mod condition)
 203 {
 204    assert(devinfo->gen == 6);
 205
 206    vec4_instruction *inst;
 207
 208    resolve_ud_negate(&src0);
 209    resolve_ud_negate(&src1);
 210
 211    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
 212                                         src0, src1);
 213    inst->conditional_mod = condition;
 214
 215    return inst;
 216 }
 217
 218 /**
 219  * CMP: Sets the low bit of the destination channels with the result
 220  * of the comparison, while the upper bits are undefined, and updates
 221  * the flag register with the packed 16 bits of the result.
 222  */
 223 vec4_instruction *
 224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
 225                   enum brw_conditional_mod condition)
 226 {
 227    vec4_instruction *inst;
 228
 229    /* Take the instruction:
 230     *
 231     * CMP null<d> src0<f> src1<f>
 232     *
 233     * Original gen4 does type conversion to the destination type before
 234     * comparison, producing garbage results for floating point comparisons.
 235     *
 236     * The destination type doesn't matter on newer generations, so we set the
 237     * type to match src0 so we can compact the instruction.
 238     */
 239    dst.type = src0.type;
 240    if (dst.file == HW_REG)
 241       dst.fixed_hw_reg.type = dst.type;
 242
 243    resolve_ud_negate(&src0);
 244    resolve_ud_negate(&src1);
 245
 246    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
 247    inst->conditional_mod = condition;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
 258                                         dst, index);
 259    inst->base_mrf = 14;
 260    inst->mlen = 2;
 261
 262    return inst;
 263 }
 264
 265 vec4_instruction *
 266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
 267                             const src_reg &index)
 268 {
 269    vec4_instruction *inst;
 270
 271    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 272                                         dst, src, index);
 273    inst->base_mrf = 13;
 274    inst->mlen = 3;
 275
 276    return inst;
 277 }
 278
 279 void
 280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 281 {
 282    static enum opcode dot_opcodes[] = {
 283       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 284    };
 285
 286    emit(dot_opcodes[elements - 2], dst, src0, src1);
 287 }
 288
 289 src_reg
 290 vec4_visitor::fix_3src_operand(const src_reg &src)
 291 {
 292    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 293     * able to use vertical stride of zero to replicate the vec4 uniform, like
 294     *
 295     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 296     *
 297     * But you can't, since vertical stride is always four in three-source
 298     * instructions. Instead, insert a MOV instruction to do the replication so
 299     * that the three-source instruction can consume it.
 300     */
 301
 302    /* The MOV is only needed if the source is a uniform or immediate. */
 303    if (src.file != UNIFORM && src.file != IMM)
 304       return src;
 305
 306    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 307       return src;
 308
 309    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 310    expanded.type = src.type;
 311    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
 312    return src_reg(expanded);
 313 }
 314
 315 src_reg
 316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 317 {
 318    if (!src.abs && !src.negate)
 319       return src;
 320
 321    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
 322    resolved.type = src.type;
 323    emit(MOV(resolved, src));
 324
 325    return src_reg(resolved);
 326 }
 327
 328 src_reg
 329 vec4_visitor::fix_math_operand(const src_reg &src)
 330 {
 331    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
 332       return src;
 333
 334    /* The gen6 math instruction ignores the source modifiers --
 335     * swizzle, abs, negate, and at least some parts of the register
 336     * region description.
 337     *
 338     * Rather than trying to enumerate all these cases, *always* expand the
 339     * operand to a temp GRF for gen6.
 340     *
 341     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 342     * can't use.
 343     */
 344
 345    if (devinfo->gen == 7 && src.file != IMM)
 346       return src;
 347
 348    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 349    expanded.type = src.type;
 350    emit(MOV(expanded, src));
 351    return src_reg(expanded);
 352 }
 353
 354 vec4_instruction *
 355 vec4_visitor::emit_math(enum opcode opcode,
 356                         const dst_reg &dst,
 357                         const src_reg &src0, const src_reg &src1)
 358 {
 359    vec4_instruction *math =
 360       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
 361
 362    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
 363       /* MATH on Gen6 must be align1, so we can't do writemasks. */
 364       math->dst = dst_reg(this, glsl_type::vec4_type);
 365       math->dst.type = dst.type;
 366       math = emit(MOV(dst, src_reg(math->dst)));
 367    } else if (devinfo->gen < 6) {
 368       math->base_mrf = 1;
 369       math->mlen = src1.file == BAD_FILE ? 1 : 2;
 370    }
 371
 372    return math;
 373 }
 374
 375 void
 376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 377 {
 378    if (devinfo->gen < 7) {
 379       unreachable("ir_unop_pack_half_2x16 should be lowered");
 380    }
 381
 382    assert(dst.type == BRW_REGISTER_TYPE_UD);
 383    assert(src0.type == BRW_REGISTER_TYPE_F);
 384
 385    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 386     *
 387     *   Because this instruction does not have a 16-bit floating-point type,
 388     *   the destination data type must be Word (W).
 389     *
 390     *   The destination must be DWord-aligned and specify a horizontal stride
 391     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 392     *   each destination channel and the upper word is not modified.
 393     *
 394     * The above restriction implies that the f32to16 instruction must use
 395     * align1 mode, because only in align1 mode is it possible to specify
 396     * horizontal stride.  We choose here to defy the hardware docs and emit
 397     * align16 instructions.
 398     *
 399     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 400     * instructions. I was partially successful in that the code passed all
 401     * tests.  However, the code was dubiously correct and fragile, and the
 402     * tests were not harsh enough to probe that frailty. Not trusting the
 403     * code, I chose instead to remain in align16 mode in defiance of the hw
 404     * docs).
 405     *
 406     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 407     * simulator, emitting a f32to16 in align16 mode with UD as destination
 408     * data type is safe. The behavior differs from that specified in the PRM
 409     * in that the upper word of each destination channel is cleared to 0.
 410     */
 411
 412    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 413    src_reg tmp_src(tmp_dst);
 414
 415 #if 0
 416    /* Verify the undocumented behavior on which the following instructions
 417     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 418     * then the result of the bit-or instruction below will be incorrect.
 419     *
 420     * You should inspect the disasm output in order to verify that the MOV is
 421     * not optimized away.
 422     */
 423    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 424 #endif
 425
 426    /* Give tmp the form below, where "." means untouched.
 427     *
 428     *     w z          y          x w z          y          x
 429     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 430     *
 431     * That the upper word of each write-channel be 0 is required for the
 432     * following bit-shift and bit-or instructions to work. Note that this
 433     * relies on the undocumented hardware behavior mentioned above.
 434     */
 435    tmp_dst.writemask = WRITEMASK_XY;
 436    emit(F32TO16(tmp_dst, src0));
 437
 438    /* Give the write-channels of dst the form:
 439     *   0xhhhh0000
 440     */
 441    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 442    emit(SHL(dst, tmp_src, src_reg(16u)));
 443
 444    /* Finally, give the write-channels of dst the form of packHalf2x16's
 445     * output:
 446     *   0xhhhhllll
 447     */
 448    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 449    emit(OR(dst, src_reg(dst), tmp_src));
 450 }
 451
 452 void
 453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 454 {
 455    if (devinfo->gen < 7) {
 456       unreachable("ir_unop_unpack_half_2x16 should be lowered");
 457    }
 458
 459    assert(dst.type == BRW_REGISTER_TYPE_F);
 460    assert(src0.type == BRW_REGISTER_TYPE_UD);
 461
 462    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 463     *
 464     *   Because this instruction does not have a 16-bit floating-point type,
 465     *   the source data type must be Word (W). The destination type must be
 466     *   F (Float).
 467     *
 468     * To use W as the source data type, we must adjust horizontal strides,
 469     * which is only possible in align1 mode. All my [chadv] attempts at
 470     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 471     * Piglit tests, so I gave up.
 472     *
 473     * I've verified that, on gen7 hardware and the simulator, it is safe to
 474     * emit f16to32 in align16 mode with UD as source data type.
 475     */
 476
 477    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 478    src_reg tmp_src(tmp_dst);
 479
 480    tmp_dst.writemask = WRITEMASK_X;
 481    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 482
 483    tmp_dst.writemask = WRITEMASK_Y;
 484    emit(SHR(tmp_dst, src0, src_reg(16u)));
 485
 486    dst.writemask = WRITEMASK_XY;
 487    emit(F16TO32(dst, tmp_src));
 488 }
 489
 490 void
 491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 492 {
 493    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 494     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 495     * is not suitable to generate the shift values, but we can use the packed
 496     * vector float and a type-converting MOV.
 497     */
 498    dst_reg shift(this, glsl_type::uvec4_type);
 499    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 500
 501    dst_reg shifted(this, glsl_type::uvec4_type);
 502    src0.swizzle = BRW_SWIZZLE_XXXX;
 503    emit(SHR(shifted, src0, src_reg(shift)));
 504
 505    shifted.type = BRW_REGISTER_TYPE_UB;
 506    dst_reg f(this, glsl_type::vec4_type);
 507    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 508
 509    emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
 510 }
 511
 512 void
 513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 514 {
 515    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
 516     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
 517     * is not suitable to generate the shift values, but we can use the packed
 518     * vector float and a type-converting MOV.
 519     */
 520    dst_reg shift(this, glsl_type::uvec4_type);
 521    emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
 522
 523    dst_reg shifted(this, glsl_type::uvec4_type);
 524    src0.swizzle = BRW_SWIZZLE_XXXX;
 525    emit(SHR(shifted, src0, src_reg(shift)));
 526
 527    shifted.type = BRW_REGISTER_TYPE_B;
 528    dst_reg f(this, glsl_type::vec4_type);
 529    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
 530
 531    dst_reg scaled(this, glsl_type::vec4_type);
 532    emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
 533
 534    dst_reg max(this, glsl_type::vec4_type);
 535    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
 536    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
 537 }
 538
 539 void
 540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 541 {
 542    dst_reg saturated(this, glsl_type::vec4_type);
 543    vec4_instruction *inst = emit(MOV(saturated, src0));
 544    inst->saturate = true;
 545
 546    dst_reg scaled(this, glsl_type::vec4_type);
 547    emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
 548
 549    dst_reg rounded(this, glsl_type::vec4_type);
 550    emit(RNDE(rounded, src_reg(scaled)));
 551
 552    dst_reg u(this, glsl_type::uvec4_type);
 553    emit(MOV(u, src_reg(rounded)));
 554
 555    src_reg bytes(u);
 556    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 557 }
 558
 559 void
 560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 561 {
 562    dst_reg max(this, glsl_type::vec4_type);
 563    emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
 564
 565    dst_reg min(this, glsl_type::vec4_type);
 566    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
 567
 568    dst_reg scaled(this, glsl_type::vec4_type);
 569    emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
 570
 571    dst_reg rounded(this, glsl_type::vec4_type);
 572    emit(RNDE(rounded, src_reg(scaled)));
 573
 574    dst_reg i(this, glsl_type::ivec4_type);
 575    emit(MOV(i, src_reg(rounded)));
 576
 577    src_reg bytes(i);
 578    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 579 }
 580
 581 void
 582 vec4_visitor::visit_instructions(const exec_list *list)
 583 {
 584    foreach_in_list(ir_instruction, ir, list) {
 585       base_ir = ir;
 586       ir->accept(this);
 587    }
 588 }
 589
 590 /**
 591  * Returns the minimum number of vec4 elements needed to pack a type.
 592  *
 593  * For simple types, it will return 1 (a single vec4); for matrices, the
 594  * number of columns; for array and struct, the sum of the vec4_size of
 595  * each of its elements; and for sampler and atomic, zero.
 596  *
 597  * This method is useful to calculate how much register space is needed to
 598  * store a particular type.
 599  */
 600 extern "C" int
 601 type_size_vec4(const struct glsl_type *type)
 602 {
 603    unsigned int i;
 604    int size;
 605
 606    switch (type->base_type) {
 607    case GLSL_TYPE_UINT:
 608    case GLSL_TYPE_INT:
 609    case GLSL_TYPE_FLOAT:
 610    case GLSL_TYPE_BOOL:
 611       if (type->is_matrix()) {
 612          return type->matrix_columns;
 613       } else {
 614          /* Regardless of size of vector, it gets a vec4. This is bad
 615           * packing for things like floats, but otherwise arrays become a
 616           * mess.  Hopefully a later pass over the code can pack scalars
 617           * down if appropriate.
 618           */
 619          return 1;
 620       }
 621    case GLSL_TYPE_ARRAY:
 622       assert(type->length > 0);
 623       return type_size_vec4(type->fields.array) * type->length;
 624    case GLSL_TYPE_STRUCT:
 625       size = 0;
 626       for (i = 0; i < type->length; i++) {
 627          size += type_size_vec4(type->fields.structure[i].type);
 628       }
 629       return size;
 630    case GLSL_TYPE_SUBROUTINE:
 631       return 1;
 632
 633    case GLSL_TYPE_SAMPLER:
 634       /* Samplers take up no register space, since they're baked in at
 635        * link time.
 636        */
 637       return 0;
 638    case GLSL_TYPE_ATOMIC_UINT:
 639       return 0;
 640    case GLSL_TYPE_IMAGE:
 641       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
 642    case GLSL_TYPE_VOID:
 643    case GLSL_TYPE_DOUBLE:
 644    case GLSL_TYPE_ERROR:
 645    case GLSL_TYPE_INTERFACE:
 646       unreachable("not reached");
 647    }
 648
 649    return 0;
 650 }
 651
 652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 653 {
 654    init();
 655
 656    this->file = GRF;
 657    this->reg = v->alloc.allocate(type_size_vec4(type));
 658
 659    if (type->is_array() || type->is_record()) {
 660       this->swizzle = BRW_SWIZZLE_NOOP;
 661    } else {
 662       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 663    }
 664
 665    this->type = brw_type_for_base_type(type);
 666 }
 667
 668 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 669 {
 670    assert(size > 0);
 671
 672    init();
 673
 674    this->file = GRF;
 675    this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 676
 677    this->swizzle = BRW_SWIZZLE_NOOP;
 678
 679    this->type = brw_type_for_base_type(type);
 680 }
 681
 682 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 683 {
 684    init();
 685
 686    this->file = GRF;
 687    this->reg = v->alloc.allocate(type_size_vec4(type));
 688
 689    if (type->is_array() || type->is_record()) {
 690       this->writemask = WRITEMASK_XYZW;
 691    } else {
 692       this->writemask = (1 << type->vector_elements) - 1;
 693    }
 694
 695    this->type = brw_type_for_base_type(type);
 696 }
 697
 698 void
 699 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
 700                                        const gl_constant_value *values,
 701                                        unsigned n)
 702 {
 703    static const gl_constant_value zero = { 0 };
 704
 705    assert(param_offset % 4 == 0);
 706
 707    for (unsigned i = 0; i < n; ++i)
 708       stage_prog_data->param[param_offset + i] = &values[i];
 709
 710    for (unsigned i = n; i < 4; ++i)
 711       stage_prog_data->param[param_offset + i] = &zero;
 712
 713    uniform_vector_size[param_offset / 4] = n;
 714 }
 715
 716 /* Our support for uniforms is piggy-backed on the struct
 717  * gl_fragment_program, because that's where the values actually
 718  * get stored, rather than in some global gl_shader_program uniform
 719  * store.
 720  */
 721 void
 722 vec4_visitor::setup_uniform_values(ir_variable *ir)
 723 {
 724    int namelen = strlen(ir->name);
 725
 726    /* The data for our (non-builtin) uniforms is stored in a series of
 727     * gl_uniform_driver_storage structs for each subcomponent that
 728     * glGetUniformLocation() could name.  We know it's been set up in the same
 729     * order we'd walk the type, so walk the list of storage and find anything
 730     * with our name, or the prefix of a component that starts with our name.
 731     */
 732    for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
 733       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 734
 735       if (storage->builtin)
 736          continue;
 737
 738       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 739           (storage->name[namelen] != 0 &&
 740            storage->name[namelen] != '.' &&
 741            storage->name[namelen] != '[')) {
 742          continue;
 743       }
 744
 745       const unsigned vector_count = (MAX2(storage->array_elements, 1) *
 746                                      storage->type->matrix_columns);
 747       const unsigned vector_size = storage->type->vector_elements;
 748
 749       for (unsigned s = 0; s < vector_count; s++) {
 750          setup_vec4_uniform_value(uniforms * 4,
 751                                   &storage->storage[s * vector_size],
 752                                   vector_size);
 753          uniforms++;
 754       }
 755    }
 756 }
 757
 758 void
 759 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 760 {
 761    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 762       assert(this->uniforms < uniform_array_size);
 763       this->uniform_vector_size[this->uniforms] = 4;
 764       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 765       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 766       for (int j = 0; j < 4; ++j) {
 767          stage_prog_data->param[this->uniforms * 4 + j] =
 768             (gl_constant_value *) &clip_planes[i][j];
 769       }
 770       ++this->uniforms;
 771    }
 772 }
 773
 774 /* Our support for builtin uniforms is even scarier than non-builtin.
 775  * It sits on top of the PROG_STATE_VAR parameters that are
 776  * automatically updated from GL context state.
 777  */
 778 void
 779 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 780 {
 781    const ir_state_slot *const slots = ir->get_state_slots();
 782    assert(slots != NULL);
 783
 784    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
 785       /* This state reference has already been setup by ir_to_mesa,
 786        * but we'll get the same index back here.  We can reference
 787        * ParameterValues directly, since unlike brw_fs.cpp, we never
 788        * add new state references during compile.
 789        */
 790       int index = _mesa_add_state_reference(this->prog->Parameters,
 791                                             (gl_state_index *)slots[i].tokens);
 792       gl_constant_value *values =
 793          &this->prog->Parameters->ParameterValues[index][0];
 794
 795       assert(this->uniforms < uniform_array_size);
 796
 797       for (unsigned j = 0; j < 4; j++)
 798          stage_prog_data->param[this->uniforms * 4 + j] =
 799             &values[GET_SWZ(slots[i].swizzle, j)];
 800
 801       this->uniform_vector_size[this->uniforms] =
 802          (ir->type->is_scalar() || ir->type->is_vector() ||
 803           ir->type->is_matrix() ? ir->type->vector_elements : 4);
 804
 805       this->uniforms++;
 806    }
 807 }
 808
 809 dst_reg *
 810 vec4_visitor::variable_storage(ir_variable *var)
 811 {
 812    return (dst_reg *)hash_table_find(this->variable_ht, var);
 813 }
 814
 815 void
 816 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
 817                                      enum brw_predicate *predicate)
 818 {
 819    ir_expression *expr = ir->as_expression();
 820
 821    *predicate = BRW_PREDICATE_NORMAL;
 822
 823    if (expr && expr->operation != ir_binop_ubo_load) {
 824       src_reg op[3];
 825       vec4_instruction *inst;
 826
 827       assert(expr->get_num_operands() <= 3);
 828       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 829          expr->operands[i]->accept(this);
 830          op[i] = this->result;
 831
 832          resolve_ud_negate(&op[i]);
 833       }
 834
 835       switch (expr->operation) {
 836       case ir_unop_logic_not:
 837          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 838          inst->conditional_mod = BRW_CONDITIONAL_Z;
 839          break;
 840
 841       case ir_binop_logic_xor:
 842          if (devinfo->gen <= 5) {
 843             src_reg temp = src_reg(this, ir->type);
 844             emit(XOR(dst_reg(temp), op[0], op[1]));
 845             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 846          } else {
 847             inst = emit(XOR(dst_null_d(), op[0], op[1]));
 848          }
 849          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 850          break;
 851
 852       case ir_binop_logic_or:
 853          if (devinfo->gen <= 5) {
 854             src_reg temp = src_reg(this, ir->type);
 855             emit(OR(dst_reg(temp), op[0], op[1]));
 856             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 857          } else {
 858             inst = emit(OR(dst_null_d(), op[0], op[1]));
 859          }
 860          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 861          break;
 862
 863       case ir_binop_logic_and:
 864          if (devinfo->gen <= 5) {
 865             src_reg temp = src_reg(this, ir->type);
 866             emit(AND(dst_reg(temp), op[0], op[1]));
 867             inst = emit(AND(dst_null_d(), temp, src_reg(1)));
 868          } else {
 869             inst = emit(AND(dst_null_d(), op[0], op[1]));
 870          }
 871          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 872          break;
 873
 874       case ir_unop_f2b:
 875          if (devinfo->gen >= 6) {
 876             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 877          } else {
 878             inst = emit(MOV(dst_null_f(), op[0]));
 879             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 880          }
 881          break;
 882
 883       case ir_unop_i2b:
 884          if (devinfo->gen >= 6) {
 885             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 886          } else {
 887             inst = emit(MOV(dst_null_d(), op[0]));
 888             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 889          }
 890          break;
 891
 892       case ir_binop_all_equal:
 893          if (devinfo->gen <= 5) {
 894             resolve_bool_comparison(expr->operands[0], &op[0]);
 895             resolve_bool_comparison(expr->operands[1], &op[1]);
 896          }
 897          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 898          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 899          break;
 900
 901       case ir_binop_any_nequal:
 902          if (devinfo->gen <= 5) {
 903             resolve_bool_comparison(expr->operands[0], &op[0]);
 904             resolve_bool_comparison(expr->operands[1], &op[1]);
 905          }
 906          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 907          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 908          break;
 909
 910       case ir_unop_any:
 911          if (devinfo->gen <= 5) {
 912             resolve_bool_comparison(expr->operands[0], &op[0]);
 913          }
 914          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 915          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 916          break;
 917
 918       case ir_binop_greater:
 919       case ir_binop_gequal:
 920       case ir_binop_less:
 921       case ir_binop_lequal:
 922       case ir_binop_equal:
 923       case ir_binop_nequal:
 924          if (devinfo->gen <= 5) {
 925             resolve_bool_comparison(expr->operands[0], &op[0]);
 926             resolve_bool_comparison(expr->operands[1], &op[1]);
 927          }
 928          emit(CMP(dst_null_d(), op[0], op[1],
 929                   brw_conditional_for_comparison(expr->operation)));
 930          break;
 931
 932       case ir_triop_csel: {
 933          /* Expand the boolean condition into the flag register. */
 934          inst = emit(MOV(dst_null_d(), op[0]));
 935          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 936
 937          /* Select which boolean to return. */
 938          dst_reg temp(this, expr->operands[1]->type);
 939          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
 940          inst->predicate = BRW_PREDICATE_NORMAL;
 941
 942          /* Expand the result to a condition code. */
 943          inst = emit(MOV(dst_null_d(), src_reg(temp)));
 944          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 945          break;
 946       }
 947
 948       default:
 949          unreachable("not reached");
 950       }
 951       return;
 952    }
 953
 954    ir->accept(this);
 955
 956    resolve_ud_negate(&this->result);
 957
 958    vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
 959    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 960 }
 961
 962 /**
 963  * Emit a gen6 IF statement with the comparison folded into the IF
 964  * instruction.
 965  */
 966 void
 967 vec4_visitor::emit_if_gen6(ir_if *ir)
 968 {
 969    ir_expression *expr = ir->condition->as_expression();
 970
 971    if (expr && expr->operation != ir_binop_ubo_load) {
 972       src_reg op[3];
 973       dst_reg temp;
 974
 975       assert(expr->get_num_operands() <= 3);
 976       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 977          expr->operands[i]->accept(this);
 978          op[i] = this->result;
 979       }
 980
 981       switch (expr->operation) {
 982       case ir_unop_logic_not:
 983          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 984          return;
 985
 986       case ir_binop_logic_xor:
 987          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 988          return;
 989
 990       case ir_binop_logic_or:
 991          temp = dst_reg(this, glsl_type::bool_type);
 992          emit(OR(temp, op[0], op[1]));
 993          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 994          return;
 995
 996       case ir_binop_logic_and:
 997          temp = dst_reg(this, glsl_type::bool_type);
 998          emit(AND(temp, op[0], op[1]));
 999          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1000          return;
1001
1002       case ir_unop_f2b:
1003          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1004          return;
1005
1006       case ir_unop_i2b:
1007          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1008          return;
1009
1010       case ir_binop_greater:
1011       case ir_binop_gequal:
1012       case ir_binop_less:
1013       case ir_binop_lequal:
1014       case ir_binop_equal:
1015       case ir_binop_nequal:
1016          emit(IF(op[0], op[1],
1017                  brw_conditional_for_comparison(expr->operation)));
1018          return;
1019
1020       case ir_binop_all_equal:
1021          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1022          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1023          return;
1024
1025       case ir_binop_any_nequal:
1026          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1027          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1028          return;
1029
1030       case ir_unop_any:
1031          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1032          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1033          return;
1034
1035       case ir_triop_csel: {
1036          /* Expand the boolean condition into the flag register. */
1037          vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1038          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1039
1040          /* Select which boolean to return. */
1041          dst_reg temp(this, expr->operands[1]->type);
1042          inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1043          inst->predicate = BRW_PREDICATE_NORMAL;
1044
1045          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1046          return;
1047       }
1048
1049       default:
1050          unreachable("not reached");
1051       }
1052       return;
1053    }
1054
1055    ir->condition->accept(this);
1056
1057    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1058 }
1059
1060 void
1061 vec4_visitor::visit(ir_variable *ir)
1062 {
1063    dst_reg *reg = NULL;
1064
1065    if (variable_storage(ir))
1066       return;
1067
1068    switch (ir->data.mode) {
1069    case ir_var_shader_in:
1070       assert(ir->data.location != -1);
1071       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1072       break;
1073
1074    case ir_var_shader_out:
1075       assert(ir->data.location != -1);
1076       reg = new(mem_ctx) dst_reg(this, ir->type);
1077
1078       for (int i = 0; i < type_size_vec4(ir->type); i++) {
1079          output_reg[ir->data.location + i] = *reg;
1080          output_reg[ir->data.location + i].reg_offset = i;
1081          output_reg_annotation[ir->data.location + i] = ir->name;
1082       }
1083       break;
1084
1085    case ir_var_auto:
1086    case ir_var_temporary:
1087       reg = new(mem_ctx) dst_reg(this, ir->type);
1088       break;
1089
1090    case ir_var_uniform:
1091       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1092
1093       /* Thanks to the lower_ubo_reference pass, we will see only
1094        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1095        * variables, so no need for them to be in variable_ht.
1096        *
1097        * Some uniforms, such as samplers and atomic counters, have no actual
1098        * storage, so we should ignore them.
1099        */
1100       if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1101          return;
1102
1103       /* Track how big the whole uniform variable is, in case we need to put a
1104        * copy of its data into pull constants for array access.
1105        */
1106       assert(this->uniforms < uniform_array_size);
1107       this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1108
1109       if (!strncmp(ir->name, "gl_", 3)) {
1110          setup_builtin_uniform_values(ir);
1111       } else {
1112          setup_uniform_values(ir);
1113       }
1114       break;
1115
1116    case ir_var_system_value:
1117       reg = make_reg_for_system_value(ir->data.location, ir->type);
1118       break;
1119
1120    default:
1121       unreachable("not reached");
1122    }
1123
1124    reg->type = brw_type_for_base_type(ir->type);
1125    hash_table_insert(this->variable_ht, reg, ir);
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_loop *ir)
1130 {
1131    /* We don't want debugging output to print the whole body of the
1132     * loop as the annotation.
1133     */
1134    this->base_ir = NULL;
1135
1136    emit(BRW_OPCODE_DO);
1137
1138    visit_instructions(&ir->body_instructions);
1139
1140    emit(BRW_OPCODE_WHILE);
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_loop_jump *ir)
1145 {
1146    switch (ir->mode) {
1147    case ir_loop_jump::jump_break:
1148       emit(BRW_OPCODE_BREAK);
1149       break;
1150    case ir_loop_jump::jump_continue:
1151       emit(BRW_OPCODE_CONTINUE);
1152       break;
1153    }
1154 }
1155
1156
1157 void
1158 vec4_visitor::visit(ir_function_signature *)
1159 {
1160    unreachable("not reached");
1161 }
1162
1163 void
1164 vec4_visitor::visit(ir_function *ir)
1165 {
1166    /* Ignore function bodies other than main() -- we shouldn't see calls to
1167     * them since they should all be inlined.
1168     */
1169    if (strcmp(ir->name, "main") == 0) {
1170       const ir_function_signature *sig;
1171       exec_list empty;
1172
1173       sig = ir->matching_signature(NULL, &empty, false);
1174
1175       assert(sig);
1176
1177       visit_instructions(&sig->body);
1178    }
1179 }
1180
1181 bool
1182 vec4_visitor::try_emit_mad(ir_expression *ir)
1183 {
1184    /* 3-src instructions were introduced in gen6. */
1185    if (devinfo->gen < 6)
1186       return false;
1187
1188    /* MAD can only handle floating-point data. */
1189    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1190       return false;
1191
1192    ir_rvalue *nonmul;
1193    ir_expression *mul;
1194    bool mul_negate, mul_abs;
1195
1196    for (int i = 0; i < 2; i++) {
1197       mul_negate = false;
1198       mul_abs = false;
1199
1200       mul = ir->operands[i]->as_expression();
1201       nonmul = ir->operands[1 - i];
1202
1203       if (mul && mul->operation == ir_unop_abs) {
1204          mul = mul->operands[0]->as_expression();
1205          mul_abs = true;
1206       } else if (mul && mul->operation == ir_unop_neg) {
1207          mul = mul->operands[0]->as_expression();
1208          mul_negate = true;
1209       }
1210
1211       if (mul && mul->operation == ir_binop_mul)
1212          break;
1213    }
1214
1215    if (!mul || mul->operation != ir_binop_mul)
1216       return false;
1217
1218    nonmul->accept(this);
1219    src_reg src0 = fix_3src_operand(this->result);
1220
1221    mul->operands[0]->accept(this);
1222    src_reg src1 = fix_3src_operand(this->result);
1223    src1.negate ^= mul_negate;
1224    src1.abs = mul_abs;
1225    if (mul_abs)
1226       src1.negate = false;
1227
1228    mul->operands[1]->accept(this);
1229    src_reg src2 = fix_3src_operand(this->result);
1230    src2.abs = mul_abs;
1231    if (mul_abs)
1232       src2.negate = false;
1233
1234    this->result = src_reg(this, ir->type);
1235    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1236
1237    return true;
1238 }
1239
1240 bool
1241 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1242 {
1243    /* This optimization relies on CMP setting the destination to 0 when
1244     * false.  Early hardware only sets the least significant bit, and
1245     * leaves the other bits undefined.  So we can't use it.
1246     */
1247    if (devinfo->gen < 6)
1248       return false;
1249
1250    ir_expression *const cmp = ir->operands[0]->as_expression();
1251
1252    if (cmp == NULL)
1253       return false;
1254
1255    switch (cmp->operation) {
1256    case ir_binop_less:
1257    case ir_binop_greater:
1258    case ir_binop_lequal:
1259    case ir_binop_gequal:
1260    case ir_binop_equal:
1261    case ir_binop_nequal:
1262       break;
1263
1264    default:
1265       return false;
1266    }
1267
1268    cmp->operands[0]->accept(this);
1269    const src_reg cmp_src0 = this->result;
1270
1271    cmp->operands[1]->accept(this);
1272    const src_reg cmp_src1 = this->result;
1273
1274    this->result = src_reg(this, ir->type);
1275
1276    emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1277             brw_conditional_for_comparison(cmp->operation)));
1278
1279    /* If the comparison is false, this->result will just happen to be zero.
1280     */
1281    vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1282                                        this->result, src_reg(1.0f));
1283    inst->predicate = BRW_PREDICATE_NORMAL;
1284    inst->predicate_inverse = true;
1285
1286    return true;
1287 }
1288
1289 vec4_instruction *
1290 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1291                           src_reg src0, src_reg src1)
1292 {
1293    vec4_instruction *inst;
1294
1295    if (devinfo->gen >= 6) {
1296       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1297       inst->conditional_mod = conditionalmod;
1298    } else {
1299       emit(CMP(dst, src0, src1, conditionalmod));
1300
1301       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1302       inst->predicate = BRW_PREDICATE_NORMAL;
1303    }
1304
1305    return inst;
1306 }
1307
1308 vec4_instruction *
1309 vec4_visitor::emit_lrp(const dst_reg &dst,
1310                        const src_reg &x, const src_reg &y, const src_reg &a)
1311 {
1312    if (devinfo->gen >= 6) {
1313       /* Note that the instruction's argument order is reversed from GLSL
1314        * and the IR.
1315        */
1316      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1317                      fix_3src_operand(x)));
1318    } else {
1319       /* Earlier generations don't support three source operations, so we
1320        * need to emit x*(1-a) + y*a.
1321        */
1322       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
1323       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
1324       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1325       y_times_a.writemask           = dst.writemask;
1326       one_minus_a.writemask         = dst.writemask;
1327       x_times_one_minus_a.writemask = dst.writemask;
1328
1329       emit(MUL(y_times_a, y, a));
1330       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1331       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1332       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1333    }
1334 }
1335
1336 /**
1337  * Emits the instructions needed to perform a pull constant load. before_block
1338  * and before_inst can be NULL in which case the instruction will be appended
1339  * to the end of the instruction list.
1340  */
1341 void
1342 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1343                                           src_reg surf_index,
1344                                           src_reg offset_reg,
1345                                           bblock_t *before_block,
1346                                           vec4_instruction *before_inst)
1347 {
1348    assert((before_inst == NULL && before_block == NULL) ||
1349           (before_inst && before_block));
1350
1351    vec4_instruction *pull;
1352
1353    if (devinfo->gen >= 9) {
1354       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1355       src_reg header(this, glsl_type::uvec4_type, 2);
1356
1357       pull = new(mem_ctx)
1358          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1359                           dst_reg(header));
1360
1361       if (before_inst)
1362          emit_before(before_block, before_inst, pull);
1363       else
1364          emit(pull);
1365
1366       dst_reg index_reg = retype(offset(dst_reg(header), 1),
1367                                  offset_reg.type);
1368       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1369
1370       if (before_inst)
1371          emit_before(before_block, before_inst, pull);
1372       else
1373          emit(pull);
1374
1375       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1376                                            dst,
1377                                            surf_index,
1378                                            header);
1379       pull->mlen = 2;
1380       pull->header_size = 1;
1381    } else if (devinfo->gen >= 7) {
1382       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1383
1384       grf_offset.type = offset_reg.type;
1385
1386       pull = MOV(grf_offset, offset_reg);
1387
1388       if (before_inst)
1389          emit_before(before_block, before_inst, pull);
1390       else
1391          emit(pull);
1392
1393       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1394                                            dst,
1395                                            surf_index,
1396                                            src_reg(grf_offset));
1397       pull->mlen = 1;
1398    } else {
1399       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1400                                            dst,
1401                                            surf_index,
1402                                            offset_reg);
1403       pull->base_mrf = 14;
1404       pull->mlen = 1;
1405    }
1406
1407    if (before_inst)
1408       emit_before(before_block, before_inst, pull);
1409    else
1410       emit(pull);
1411 }
1412
1413 src_reg
1414 vec4_visitor::emit_uniformize(const src_reg &src)
1415 {
1416    const src_reg chan_index(this, glsl_type::uint_type);
1417    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1418                               src.type);
1419
1420    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1421       ->force_writemask_all = true;
1422    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1423       ->force_writemask_all = true;
1424
1425    return src_reg(dst);
1426 }
1427
1428 void
1429 vec4_visitor::visit(ir_expression *ir)
1430 {
1431    unsigned int operand;
1432    src_reg op[ARRAY_SIZE(ir->operands)];
1433    vec4_instruction *inst;
1434
1435    if (ir->operation == ir_binop_add) {
1436       if (try_emit_mad(ir))
1437          return;
1438    }
1439
1440    if (ir->operation == ir_unop_b2f) {
1441       if (try_emit_b2f_of_compare(ir))
1442          return;
1443    }
1444
1445    /* Storage for our result.  Ideally for an assignment we'd be using
1446     * the actual storage for the result here, instead.
1447     */
1448    dst_reg result_dst(this, ir->type);
1449    src_reg result_src(result_dst);
1450
1451    if (ir->operation == ir_triop_csel) {
1452       ir->operands[1]->accept(this);
1453       op[1] = this->result;
1454       ir->operands[2]->accept(this);
1455       op[2] = this->result;
1456
1457       enum brw_predicate predicate;
1458       emit_bool_to_cond_code(ir->operands[0], &predicate);
1459       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1460       inst->predicate = predicate;
1461       this->result = result_src;
1462       return;
1463    }
1464
1465    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1466       this->result.file = BAD_FILE;
1467       ir->operands[operand]->accept(this);
1468       if (this->result.file == BAD_FILE) {
1469          fprintf(stderr, "Failed to get tree for expression operand:\n");
1470          ir->operands[operand]->fprint(stderr);
1471          exit(1);
1472       }
1473       op[operand] = this->result;
1474
1475       /* Matrix expression operands should have been broken down to vector
1476        * operations already.
1477        */
1478       assert(!ir->operands[operand]->type->is_matrix());
1479    }
1480
1481    /* If nothing special happens, this is the result. */
1482    this->result = result_src;
1483
1484    switch (ir->operation) {
1485    case ir_unop_logic_not:
1486       emit(NOT(result_dst, op[0]));
1487       break;
1488    case ir_unop_neg:
1489       op[0].negate = !op[0].negate;
1490       emit(MOV(result_dst, op[0]));
1491       break;
1492    case ir_unop_abs:
1493       op[0].abs = true;
1494       op[0].negate = false;
1495       emit(MOV(result_dst, op[0]));
1496       break;
1497
1498    case ir_unop_sign:
1499       if (ir->type->is_float()) {
1500          /* AND(val, 0x80000000) gives the sign bit.
1501           *
1502           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1503           * zero.
1504           */
1505          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1506
1507          op[0].type = BRW_REGISTER_TYPE_UD;
1508          result_dst.type = BRW_REGISTER_TYPE_UD;
1509          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1510
1511          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1512          inst->predicate = BRW_PREDICATE_NORMAL;
1513
1514          this->result.type = BRW_REGISTER_TYPE_F;
1515       } else {
1516          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1517           *               -> non-negative val generates 0x00000000.
1518           *  Predicated OR sets 1 if val is positive.
1519           */
1520          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1521
1522          emit(ASR(result_dst, op[0], src_reg(31)));
1523
1524          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1525          inst->predicate = BRW_PREDICATE_NORMAL;
1526       }
1527       break;
1528
1529    case ir_unop_rcp:
1530       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1531       break;
1532
1533    case ir_unop_exp2:
1534       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1535       break;
1536    case ir_unop_log2:
1537       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1538       break;
1539    case ir_unop_exp:
1540    case ir_unop_log:
1541       unreachable("not reached: should be handled by ir_explog_to_explog2");
1542    case ir_unop_sin:
1543       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1544       break;
1545    case ir_unop_cos:
1546       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1547       break;
1548
1549    case ir_unop_dFdx:
1550    case ir_unop_dFdx_coarse:
1551    case ir_unop_dFdx_fine:
1552    case ir_unop_dFdy:
1553    case ir_unop_dFdy_coarse:
1554    case ir_unop_dFdy_fine:
1555       unreachable("derivatives not valid in vertex shader");
1556
1557    case ir_unop_bitfield_reverse:
1558       emit(BFREV(result_dst, op[0]));
1559       break;
1560    case ir_unop_bit_count:
1561       emit(CBIT(result_dst, op[0]));
1562       break;
1563    case ir_unop_find_msb: {
1564       src_reg temp = src_reg(this, glsl_type::uint_type);
1565
1566       inst = emit(FBH(dst_reg(temp), op[0]));
1567       inst->dst.writemask = WRITEMASK_XYZW;
1568
1569       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1570        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1571        * subtract the result from 31 to convert the MSB count into an LSB count.
1572        */
1573
1574       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1575       temp.swizzle = BRW_SWIZZLE_NOOP;
1576       emit(MOV(result_dst, temp));
1577
1578       src_reg src_tmp = src_reg(result_dst);
1579       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1580
1581       src_tmp.negate = true;
1582       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1583       inst->predicate = BRW_PREDICATE_NORMAL;
1584       break;
1585    }
1586    case ir_unop_find_lsb:
1587       emit(FBL(result_dst, op[0]));
1588       break;
1589    case ir_unop_saturate:
1590       inst = emit(MOV(result_dst, op[0]));
1591       inst->saturate = true;
1592       break;
1593
1594    case ir_unop_noise:
1595       unreachable("not reached: should be handled by lower_noise");
1596
1597    case ir_unop_subroutine_to_int:
1598       emit(MOV(result_dst, op[0]));
1599       break;
1600
1601    case ir_binop_add:
1602       emit(ADD(result_dst, op[0], op[1]));
1603       break;
1604    case ir_binop_sub:
1605       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1606
1607    case ir_binop_mul:
1608       if (devinfo->gen < 8 && ir->type->is_integer()) {
1609          /* For integer multiplication, the MUL uses the low 16 bits of one of
1610           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1611           * accumulates in the contribution of the upper 16 bits of that
1612           * operand.  If we can determine that one of the args is in the low
1613           * 16 bits, though, we can just emit a single MUL.
1614           */
1615          if (ir->operands[0]->is_uint16_constant()) {
1616             if (devinfo->gen < 7)
1617                emit(MUL(result_dst, op[0], op[1]));
1618             else
1619                emit(MUL(result_dst, op[1], op[0]));
1620          } else if (ir->operands[1]->is_uint16_constant()) {
1621             if (devinfo->gen < 7)
1622                emit(MUL(result_dst, op[1], op[0]));
1623             else
1624                emit(MUL(result_dst, op[0], op[1]));
1625          } else {
1626             struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1627
1628             emit(MUL(acc, op[0], op[1]));
1629             emit(MACH(dst_null_d(), op[0], op[1]));
1630             emit(MOV(result_dst, src_reg(acc)));
1631          }
1632       } else {
1633          emit(MUL(result_dst, op[0], op[1]));
1634       }
1635       break;
1636    case ir_binop_imul_high: {
1637       struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1638
1639       emit(MUL(acc, op[0], op[1]));
1640       emit(MACH(result_dst, op[0], op[1]));
1641       break;
1642    }
1643    case ir_binop_div:
1644       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1645       assert(ir->type->is_integer());
1646       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1647       break;
1648
1649    case ir_binop_carry:
1650       unreachable("Should have been lowered by carry_to_arith().");
1651
1652    case ir_binop_borrow:
1653       unreachable("Should have been lowered by borrow_to_arith().");
1654
1655    case ir_binop_mod:
1656       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1657       assert(ir->type->is_integer());
1658       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1659       break;
1660
1661    case ir_binop_less:
1662    case ir_binop_greater:
1663    case ir_binop_lequal:
1664    case ir_binop_gequal:
1665    case ir_binop_equal:
1666    case ir_binop_nequal: {
1667       if (devinfo->gen <= 5) {
1668          resolve_bool_comparison(ir->operands[0], &op[0]);
1669          resolve_bool_comparison(ir->operands[1], &op[1]);
1670       }
1671       emit(CMP(result_dst, op[0], op[1],
1672                brw_conditional_for_comparison(ir->operation)));
1673       break;
1674    }
1675
1676    case ir_binop_all_equal:
1677       if (devinfo->gen <= 5) {
1678          resolve_bool_comparison(ir->operands[0], &op[0]);
1679          resolve_bool_comparison(ir->operands[1], &op[1]);
1680       }
1681
1682       /* "==" operator producing a scalar boolean. */
1683       if (ir->operands[0]->type->is_vector() ||
1684           ir->operands[1]->type->is_vector()) {
1685          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1686          emit(MOV(result_dst, src_reg(0)));
1687          inst = emit(MOV(result_dst, src_reg(~0)));
1688          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1689       } else {
1690          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1691       }
1692       break;
1693    case ir_binop_any_nequal:
1694       if (devinfo->gen <= 5) {
1695          resolve_bool_comparison(ir->operands[0], &op[0]);
1696          resolve_bool_comparison(ir->operands[1], &op[1]);
1697       }
1698
1699       /* "!=" operator producing a scalar boolean. */
1700       if (ir->operands[0]->type->is_vector() ||
1701           ir->operands[1]->type->is_vector()) {
1702          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1703
1704          emit(MOV(result_dst, src_reg(0)));
1705          inst = emit(MOV(result_dst, src_reg(~0)));
1706          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1707       } else {
1708          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1709       }
1710       break;
1711
1712    case ir_unop_any:
1713       if (devinfo->gen <= 5) {
1714          resolve_bool_comparison(ir->operands[0], &op[0]);
1715       }
1716       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1717       emit(MOV(result_dst, src_reg(0)));
1718
1719       inst = emit(MOV(result_dst, src_reg(~0)));
1720       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1721       break;
1722
1723    case ir_binop_logic_xor:
1724       emit(XOR(result_dst, op[0], op[1]));
1725       break;
1726
1727    case ir_binop_logic_or:
1728       emit(OR(result_dst, op[0], op[1]));
1729       break;
1730
1731    case ir_binop_logic_and:
1732       emit(AND(result_dst, op[0], op[1]));
1733       break;
1734
1735    case ir_binop_dot:
1736       assert(ir->operands[0]->type->is_vector());
1737       assert(ir->operands[0]->type == ir->operands[1]->type);
1738       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1739       break;
1740
1741    case ir_unop_sqrt:
1742       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1743       break;
1744    case ir_unop_rsq:
1745       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1746       break;
1747
1748    case ir_unop_bitcast_i2f:
1749    case ir_unop_bitcast_u2f:
1750       this->result = op[0];
1751       this->result.type = BRW_REGISTER_TYPE_F;
1752       break;
1753
1754    case ir_unop_bitcast_f2i:
1755       this->result = op[0];
1756       this->result.type = BRW_REGISTER_TYPE_D;
1757       break;
1758
1759    case ir_unop_bitcast_f2u:
1760       this->result = op[0];
1761       this->result.type = BRW_REGISTER_TYPE_UD;
1762       break;
1763
1764    case ir_unop_i2f:
1765    case ir_unop_i2u:
1766    case ir_unop_u2i:
1767    case ir_unop_u2f:
1768    case ir_unop_f2i:
1769    case ir_unop_f2u:
1770       emit(MOV(result_dst, op[0]));
1771       break;
1772    case ir_unop_b2i:
1773    case ir_unop_b2f:
1774       if (devinfo->gen <= 5) {
1775          resolve_bool_comparison(ir->operands[0], &op[0]);
1776       }
1777       emit(MOV(result_dst, negate(op[0])));
1778       break;
1779    case ir_unop_f2b:
1780       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1781       break;
1782    case ir_unop_i2b:
1783       emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1784       break;
1785
1786    case ir_unop_trunc:
1787       emit(RNDZ(result_dst, op[0]));
1788       break;
1789    case ir_unop_ceil: {
1790          src_reg tmp = src_reg(this, ir->type);
1791          op[0].negate = !op[0].negate;
1792          emit(RNDD(dst_reg(tmp), op[0]));
1793          tmp.negate = true;
1794          emit(MOV(result_dst, tmp));
1795       }
1796       break;
1797    case ir_unop_floor:
1798       inst = emit(RNDD(result_dst, op[0]));
1799       break;
1800    case ir_unop_fract:
1801       inst = emit(FRC(result_dst, op[0]));
1802       break;
1803    case ir_unop_round_even:
1804       emit(RNDE(result_dst, op[0]));
1805       break;
1806
1807    case ir_binop_min:
1808       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1809       break;
1810    case ir_binop_max:
1811       emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1812       break;
1813
1814    case ir_binop_pow:
1815       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1816       break;
1817
1818    case ir_unop_bit_not:
1819       inst = emit(NOT(result_dst, op[0]));
1820       break;
1821    case ir_binop_bit_and:
1822       inst = emit(AND(result_dst, op[0], op[1]));
1823       break;
1824    case ir_binop_bit_xor:
1825       inst = emit(XOR(result_dst, op[0], op[1]));
1826       break;
1827    case ir_binop_bit_or:
1828       inst = emit(OR(result_dst, op[0], op[1]));
1829       break;
1830
1831    case ir_binop_lshift:
1832       inst = emit(SHL(result_dst, op[0], op[1]));
1833       break;
1834
1835    case ir_binop_rshift:
1836       if (ir->type->base_type == GLSL_TYPE_INT)
1837          inst = emit(ASR(result_dst, op[0], op[1]));
1838       else
1839          inst = emit(SHR(result_dst, op[0], op[1]));
1840       break;
1841
1842    case ir_binop_bfm:
1843       emit(BFI1(result_dst, op[0], op[1]));
1844       break;
1845
1846    case ir_binop_ubo_load: {
1847       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1848       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1849       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1850       src_reg offset;
1851
1852       /* Now, load the vector from that offset. */
1853       assert(ir->type->is_vector() || ir->type->is_scalar());
1854
1855       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1856       packed_consts.type = result.type;
1857       src_reg surf_index;
1858
1859       if (const_uniform_block) {
1860          /* The block index is a constant, so just emit the binding table entry
1861           * as an immediate.
1862           */
1863          surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1864                               const_uniform_block->value.u[0]);
1865       } else {
1866          /* The block index is not a constant. Evaluate the index expression
1867           * per-channel and add the base UBO index; we have to select a value
1868           * from any live channel.
1869           */
1870          surf_index = src_reg(this, glsl_type::uint_type);
1871          emit(ADD(dst_reg(surf_index), op[0],
1872                   src_reg(prog_data->base.binding_table.ubo_start)));
1873          surf_index = emit_uniformize(surf_index);
1874
1875          /* Assume this may touch any UBO. It would be nice to provide
1876           * a tighter bound, but the array information is already lowered away.
1877           */
1878          brw_mark_surface_used(&prog_data->base,
1879                                prog_data->base.binding_table.ubo_start +
1880                                shader_prog->NumUniformBlocks - 1);
1881       }
1882
1883       if (const_offset_ir) {
1884          if (devinfo->gen >= 8) {
1885             /* Store the offset in a GRF so we can send-from-GRF. */
1886             offset = src_reg(this, glsl_type::int_type);
1887             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1888          } else {
1889             /* Immediates are fine on older generations since they'll be moved
1890              * to a (potentially fake) MRF at the generator level.
1891              */
1892             offset = src_reg(const_offset / 16);
1893          }
1894       } else {
1895          offset = src_reg(this, glsl_type::uint_type);
1896          emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1897       }
1898
1899       emit_pull_constant_load_reg(dst_reg(packed_consts),
1900                                   surf_index,
1901                                   offset,
1902                                   NULL, NULL /* before_block/inst */);
1903
1904       packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1905       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1906                                             const_offset % 16 / 4,
1907                                             const_offset % 16 / 4,
1908                                             const_offset % 16 / 4);
1909
1910       /* UBO bools are any nonzero int.  We need to convert them to 0/~0. */
1911       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1912          emit(CMP(result_dst, packed_consts, src_reg(0u),
1913                   BRW_CONDITIONAL_NZ));
1914       } else {
1915          emit(MOV(result_dst, packed_consts));
1916       }
1917       break;
1918    }
1919
1920    case ir_binop_vector_extract:
1921       unreachable("should have been lowered by vec_index_to_cond_assign");
1922
1923    case ir_triop_fma:
1924       op[0] = fix_3src_operand(op[0]);
1925       op[1] = fix_3src_operand(op[1]);
1926       op[2] = fix_3src_operand(op[2]);
1927       /* Note that the instruction's argument order is reversed from GLSL
1928        * and the IR.
1929        */
1930       emit(MAD(result_dst, op[2], op[1], op[0]));
1931       break;
1932
1933    case ir_triop_lrp:
1934       emit_lrp(result_dst, op[0], op[1], op[2]);
1935       break;
1936
1937    case ir_triop_csel:
1938       unreachable("already handled above");
1939       break;
1940
1941    case ir_triop_bfi:
1942       op[0] = fix_3src_operand(op[0]);
1943       op[1] = fix_3src_operand(op[1]);
1944       op[2] = fix_3src_operand(op[2]);
1945       emit(BFI2(result_dst, op[0], op[1], op[2]));
1946       break;
1947
1948    case ir_triop_bitfield_extract:
1949       op[0] = fix_3src_operand(op[0]);
1950       op[1] = fix_3src_operand(op[1]);
1951       op[2] = fix_3src_operand(op[2]);
1952       /* Note that the instruction's argument order is reversed from GLSL
1953        * and the IR.
1954        */
1955       emit(BFE(result_dst, op[2], op[1], op[0]));
1956       break;
1957
1958    case ir_triop_vector_insert:
1959       unreachable("should have been lowered by lower_vector_insert");
1960
1961    case ir_quadop_bitfield_insert:
1962       unreachable("not reached: should be handled by "
1963               "bitfield_insert_to_bfm_bfi\n");
1964
1965    case ir_quadop_vector:
1966       unreachable("not reached: should be handled by lower_quadop_vector");
1967
1968    case ir_unop_pack_half_2x16:
1969       emit_pack_half_2x16(result_dst, op[0]);
1970       break;
1971    case ir_unop_unpack_half_2x16:
1972       emit_unpack_half_2x16(result_dst, op[0]);
1973       break;
1974    case ir_unop_unpack_unorm_4x8:
1975       emit_unpack_unorm_4x8(result_dst, op[0]);
1976       break;
1977    case ir_unop_unpack_snorm_4x8:
1978       emit_unpack_snorm_4x8(result_dst, op[0]);
1979       break;
1980    case ir_unop_pack_unorm_4x8:
1981       emit_pack_unorm_4x8(result_dst, op[0]);
1982       break;
1983    case ir_unop_pack_snorm_4x8:
1984       emit_pack_snorm_4x8(result_dst, op[0]);
1985       break;
1986    case ir_unop_pack_snorm_2x16:
1987    case ir_unop_pack_unorm_2x16:
1988    case ir_unop_unpack_snorm_2x16:
1989    case ir_unop_unpack_unorm_2x16:
1990       unreachable("not reached: should be handled by lower_packing_builtins");
1991    case ir_unop_unpack_half_2x16_split_x:
1992    case ir_unop_unpack_half_2x16_split_y:
1993    case ir_binop_pack_half_2x16_split:
1994    case ir_unop_interpolate_at_centroid:
1995    case ir_binop_interpolate_at_sample:
1996    case ir_binop_interpolate_at_offset:
1997       unreachable("not reached: should not occur in vertex shader");
1998    case ir_binop_ldexp:
1999       unreachable("not reached: should be handled by ldexp_to_arith()");
2000    case ir_unop_d2f:
2001    case ir_unop_f2d:
2002    case ir_unop_d2i:
2003    case ir_unop_i2d:
2004    case ir_unop_d2u:
2005    case ir_unop_u2d:
2006    case ir_unop_d2b:
2007    case ir_unop_pack_double_2x32:
2008    case ir_unop_unpack_double_2x32:
2009    case ir_unop_frexp_sig:
2010    case ir_unop_frexp_exp:
2011       unreachable("fp64 todo");
2012    }
2013 }
2014
2015
2016 void
2017 vec4_visitor::visit(ir_swizzle *ir)
2018 {
2019    /* Note that this is only swizzles in expressions, not those on the left
2020     * hand side of an assignment, which do write masking.  See ir_assignment
2021     * for that.
2022     */
2023    const unsigned swz = brw_compose_swizzle(
2024       brw_swizzle_for_size(ir->type->vector_elements),
2025       BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2026
2027    ir->val->accept(this);
2028    this->result = swizzle(this->result, swz);
2029 }
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_variable *ir)
2033 {
2034    const struct glsl_type *type = ir->type;
2035    dst_reg *reg = variable_storage(ir->var);
2036
2037    if (!reg) {
2038       fail("Failed to find variable storage for %s\n", ir->var->name);
2039       this->result = src_reg(brw_null_reg());
2040       return;
2041    }
2042
2043    this->result = src_reg(*reg);
2044
2045    /* System values get their swizzle from the dst_reg writemask */
2046    if (ir->var->data.mode == ir_var_system_value)
2047       return;
2048
2049    if (type->is_scalar() || type->is_vector() || type->is_matrix())
2050       this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2051 }
2052
2053
2054 int
2055 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2056 {
2057    /* Under normal circumstances array elements are stored consecutively, so
2058     * the stride is equal to the size of the array element.
2059     */
2060    return type_size_vec4(ir->type);
2061 }
2062
2063
2064 void
2065 vec4_visitor::visit(ir_dereference_array *ir)
2066 {
2067    ir_constant *constant_index;
2068    src_reg src;
2069    int array_stride = compute_array_stride(ir);
2070
2071    constant_index = ir->array_index->constant_expression_value();
2072
2073    ir->array->accept(this);
2074    src = this->result;
2075
2076    if (constant_index) {
2077       src.reg_offset += constant_index->value.i[0] * array_stride;
2078    } else {
2079       /* Variable index array dereference.  It eats the "vec4" of the
2080        * base of the array and an index that offsets the Mesa register
2081        * index.
2082        */
2083       ir->array_index->accept(this);
2084
2085       src_reg index_reg;
2086
2087       if (array_stride == 1) {
2088          index_reg = this->result;
2089       } else {
2090          index_reg = src_reg(this, glsl_type::int_type);
2091
2092          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2093       }
2094
2095       if (src.reladdr) {
2096          src_reg temp = src_reg(this, glsl_type::int_type);
2097
2098          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2099
2100          index_reg = temp;
2101       }
2102
2103       src.reladdr = ralloc(mem_ctx, src_reg);
2104       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2105    }
2106
2107    /* If the type is smaller than a vec4, replicate the last channel out. */
2108    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2109       src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2110    else
2111       src.swizzle = BRW_SWIZZLE_NOOP;
2112    src.type = brw_type_for_base_type(ir->type);
2113
2114    this->result = src;
2115 }
2116
2117 void
2118 vec4_visitor::visit(ir_dereference_record *ir)
2119 {
2120    unsigned int i;
2121    const glsl_type *struct_type = ir->record->type;
2122    int offset = 0;
2123
2124    ir->record->accept(this);
2125
2126    for (i = 0; i < struct_type->length; i++) {
2127       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2128          break;
2129       offset += type_size_vec4(struct_type->fields.structure[i].type);
2130    }
2131
2132    /* If the type is smaller than a vec4, replicate the last channel out. */
2133    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2134       this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2135    else
2136       this->result.swizzle = BRW_SWIZZLE_NOOP;
2137    this->result.type = brw_type_for_base_type(ir->type);
2138
2139    this->result.reg_offset += offset;
2140 }
2141
2142 /**
2143  * We want to be careful in assignment setup to hit the actual storage
2144  * instead of potentially using a temporary like we might with the
2145  * ir_dereference handler.
2146  */
2147 static dst_reg
2148 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2149 {
2150    /* The LHS must be a dereference.  If the LHS is a variable indexed array
2151     * access of a vector, it must be separated into a series conditional moves
2152     * before reaching this point (see ir_vec_index_to_cond_assign).
2153     */
2154    assert(ir->as_dereference());
2155    ir_dereference_array *deref_array = ir->as_dereference_array();
2156    if (deref_array) {
2157       assert(!deref_array->array->type->is_vector());
2158    }
2159
2160    /* Use the rvalue deref handler for the most part.  We'll ignore
2161     * swizzles in it and write swizzles using writemask, though.
2162     */
2163    ir->accept(v);
2164    return dst_reg(v->result);
2165 }
2166
2167 void
2168 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2169                               const struct glsl_type *type,
2170                               enum brw_predicate predicate)
2171 {
2172    if (type->base_type == GLSL_TYPE_STRUCT) {
2173       for (unsigned int i = 0; i < type->length; i++) {
2174          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2175       }
2176       return;
2177    }
2178
2179    if (type->is_array()) {
2180       for (unsigned int i = 0; i < type->length; i++) {
2181          emit_block_move(dst, src, type->fields.array, predicate);
2182       }
2183       return;
2184    }
2185
2186    if (type->is_matrix()) {
2187       const struct glsl_type *vec_type;
2188
2189       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2190                                          type->vector_elements, 1);
2191
2192       for (int i = 0; i < type->matrix_columns; i++) {
2193          emit_block_move(dst, src, vec_type, predicate);
2194       }
2195       return;
2196    }
2197
2198    assert(type->is_scalar() || type->is_vector());
2199
2200    dst->type = brw_type_for_base_type(type);
2201    src->type = dst->type;
2202
2203    dst->writemask = (1 << type->vector_elements) - 1;
2204
2205    src->swizzle = brw_swizzle_for_size(type->vector_elements);
2206
2207    vec4_instruction *inst = emit(MOV(*dst, *src));
2208    inst->predicate = predicate;
2209
2210    dst->reg_offset++;
2211    src->reg_offset++;
2212 }
2213
2214
2215 /* If the RHS processing resulted in an instruction generating a
2216  * temporary value, and it would be easy to rewrite the instruction to
2217  * generate its result right into the LHS instead, do so.  This ends
2218  * up reliably removing instructions where it can be tricky to do so
2219  * later without real UD chain information.
2220  */
2221 bool
2222 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2223                                      dst_reg dst,
2224                                      src_reg src,
2225                                      vec4_instruction *pre_rhs_inst,
2226                                      vec4_instruction *last_rhs_inst)
2227 {
2228    /* This could be supported, but it would take more smarts. */
2229    if (ir->condition)
2230       return false;
2231
2232    if (pre_rhs_inst == last_rhs_inst)
2233       return false; /* No instructions generated to work with. */
2234
2235    /* Make sure the last instruction generated our source reg. */
2236    if (src.file != GRF ||
2237        src.file != last_rhs_inst->dst.file ||
2238        src.reg != last_rhs_inst->dst.reg ||
2239        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2240        src.reladdr ||
2241        src.abs ||
2242        src.negate ||
2243        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2244       return false;
2245
2246    /* Check that that last instruction fully initialized the channels
2247     * we want to use, in the order we want to use them.  We could
2248     * potentially reswizzle the operands of many instructions so that
2249     * we could handle out of order channels, but don't yet.
2250     */
2251
2252    for (unsigned i = 0; i < 4; i++) {
2253       if (dst.writemask & (1 << i)) {
2254          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2255             return false;
2256
2257          if (BRW_GET_SWZ(src.swizzle, i) != i)
2258             return false;
2259       }
2260    }
2261
2262    /* Success!  Rewrite the instruction. */
2263    last_rhs_inst->dst.file = dst.file;
2264    last_rhs_inst->dst.reg = dst.reg;
2265    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2266    last_rhs_inst->dst.reladdr = dst.reladdr;
2267    last_rhs_inst->dst.writemask &= dst.writemask;
2268
2269    return true;
2270 }
2271
2272 void
2273 vec4_visitor::visit(ir_assignment *ir)
2274 {
2275    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2276    enum brw_predicate predicate = BRW_PREDICATE_NONE;
2277
2278    if (!ir->lhs->type->is_scalar() &&
2279        !ir->lhs->type->is_vector()) {
2280       ir->rhs->accept(this);
2281       src_reg src = this->result;
2282
2283       if (ir->condition) {
2284          emit_bool_to_cond_code(ir->condition, &predicate);
2285       }
2286
2287       /* emit_block_move doesn't account for swizzles in the source register.
2288        * This should be ok, since the source register is a structure or an
2289        * array, and those can't be swizzled.  But double-check to be sure.
2290        */
2291       assert(src.swizzle ==
2292              (ir->rhs->type->is_matrix()
2293               ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2294               : BRW_SWIZZLE_NOOP));
2295
2296       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2297       return;
2298    }
2299
2300    /* Now we're down to just a scalar/vector with writemasks. */
2301    int i;
2302
2303    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2304    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2305
2306    ir->rhs->accept(this);
2307
2308    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2309
2310    int swizzles[4];
2311    int src_chan = 0;
2312
2313    assert(ir->lhs->type->is_vector() ||
2314           ir->lhs->type->is_scalar());
2315    dst.writemask = ir->write_mask;
2316
2317    /* Swizzle a small RHS vector into the channels being written.
2318     *
2319     * glsl ir treats write_mask as dictating how many channels are
2320     * present on the RHS while in our instructions we need to make
2321     * those channels appear in the slots of the vec4 they're written to.
2322     */
2323    for (int i = 0; i < 4; i++)
2324       swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2325
2326    src_reg src = swizzle(this->result,
2327                          BRW_SWIZZLE4(swizzles[0], swizzles[1],
2328                                       swizzles[2], swizzles[3]));
2329
2330    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2331       return;
2332    }
2333
2334    if (ir->condition) {
2335       emit_bool_to_cond_code(ir->condition, &predicate);
2336    }
2337
2338    for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2339       vec4_instruction *inst = emit(MOV(dst, src));
2340       inst->predicate = predicate;
2341
2342       dst.reg_offset++;
2343       src.reg_offset++;
2344    }
2345 }
2346
2347 void
2348 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2349 {
2350    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2351       foreach_in_list(ir_constant, field_value, &ir->components) {
2352          emit_constant_values(dst, field_value);
2353       }
2354       return;
2355    }
2356
2357    if (ir->type->is_array()) {
2358       for (unsigned int i = 0; i < ir->type->length; i++) {
2359          emit_constant_values(dst, ir->array_elements[i]);
2360       }
2361       return;
2362    }
2363
2364    if (ir->type->is_matrix()) {
2365       for (int i = 0; i < ir->type->matrix_columns; i++) {
2366          float *vec = &ir->value.f[i * ir->type->vector_elements];
2367
2368          for (int j = 0; j < ir->type->vector_elements; j++) {
2369             dst->writemask = 1 << j;
2370             dst->type = BRW_REGISTER_TYPE_F;
2371
2372             emit(MOV(*dst, src_reg(vec[j])));
2373          }
2374          dst->reg_offset++;
2375       }
2376       return;
2377    }
2378
2379    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2380
2381    for (int i = 0; i < ir->type->vector_elements; i++) {
2382       if (!(remaining_writemask & (1 << i)))
2383          continue;
2384
2385       dst->writemask = 1 << i;
2386       dst->type = brw_type_for_base_type(ir->type);
2387
2388       /* Find other components that match the one we're about to
2389        * write.  Emits fewer instructions for things like vec4(0.5,
2390        * 1.5, 1.5, 1.5).
2391        */
2392       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2393          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2394             if (ir->value.b[i] == ir->value.b[j])
2395                dst->writemask |= (1 << j);
2396          } else {
2397             /* u, i, and f storage all line up, so no need for a
2398              * switch case for comparing each type.
2399              */
2400             if (ir->value.u[i] == ir->value.u[j])
2401                dst->writemask |= (1 << j);
2402          }
2403       }
2404
2405       switch (ir->type->base_type) {
2406       case GLSL_TYPE_FLOAT:
2407          emit(MOV(*dst, src_reg(ir->value.f[i])));
2408          break;
2409       case GLSL_TYPE_INT:
2410          emit(MOV(*dst, src_reg(ir->value.i[i])));
2411          break;
2412       case GLSL_TYPE_UINT:
2413          emit(MOV(*dst, src_reg(ir->value.u[i])));
2414          break;
2415       case GLSL_TYPE_BOOL:
2416          emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2417          break;
2418       default:
2419          unreachable("Non-float/uint/int/bool constant");
2420       }
2421
2422       remaining_writemask &= ~dst->writemask;
2423    }
2424    dst->reg_offset++;
2425 }
2426
2427 void
2428 vec4_visitor::visit(ir_constant *ir)
2429 {
2430    dst_reg dst = dst_reg(this, ir->type);
2431    this->result = src_reg(dst);
2432
2433    emit_constant_values(&dst, ir);
2434 }
2435
2436 void
2437 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2438 {
2439    ir_dereference *deref = static_cast<ir_dereference *>(
2440       ir->actual_parameters.get_head());
2441    ir_variable *location = deref->variable_referenced();
2442    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2443                           location->data.binding);
2444
2445    /* Calculate the surface offset */
2446    src_reg offset(this, glsl_type::uint_type);
2447    ir_dereference_array *deref_array = deref->as_dereference_array();
2448    if (deref_array) {
2449       deref_array->array_index->accept(this);
2450
2451       src_reg tmp(this, glsl_type::uint_type);
2452       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2453       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2454    } else {
2455       offset = location->data.atomic.offset;
2456    }
2457
2458    /* Emit the appropriate machine instruction */
2459    const char *callee = ir->callee->function_name();
2460    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2461
2462    if (!strcmp("__intrinsic_atomic_read", callee)) {
2463       emit_untyped_surface_read(surf_index, dst, offset);
2464
2465    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2466       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2467                           src_reg(), src_reg());
2468
2469    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2470       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2471                           src_reg(), src_reg());
2472    }
2473
2474    brw_mark_surface_used(stage_prog_data, surf_index);
2475 }
2476
2477 void
2478 vec4_visitor::visit(ir_call *ir)
2479 {
2480    const char *callee = ir->callee->function_name();
2481
2482    if (!strcmp("__intrinsic_atomic_read", callee) ||
2483        !strcmp("__intrinsic_atomic_increment", callee) ||
2484        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2485       visit_atomic_counter_intrinsic(ir);
2486    } else {
2487       unreachable("Unsupported intrinsic.");
2488    }
2489 }
2490
2491 src_reg
2492 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2493                              src_reg coordinate, src_reg sampler)
2494 {
2495    vec4_instruction *inst =
2496       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2497                                     dst_reg(this, glsl_type::uvec4_type));
2498    inst->base_mrf = 2;
2499    inst->src[1] = sampler;
2500
2501    int param_base;
2502
2503    if (devinfo->gen >= 9) {
2504       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2505       vec4_instruction *header_inst = new(mem_ctx)
2506          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2507                           dst_reg(MRF, inst->base_mrf));
2508
2509       emit(header_inst);
2510
2511       inst->mlen = 2;
2512       inst->header_size = 1;
2513       param_base = inst->base_mrf + 1;
2514    } else {
2515       inst->mlen = 1;
2516       param_base = inst->base_mrf;
2517    }
2518
2519    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2520    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2521    int zero_mask = 0xf & ~coord_mask;
2522
2523    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2524             coordinate));
2525
2526    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2527             src_reg(0)));
2528
2529    emit(inst);
2530    return src_reg(inst->dst);
2531 }
2532
2533 bool
2534 vec4_visitor::is_high_sampler(src_reg sampler)
2535 {
2536    if (devinfo->gen < 8 && !devinfo->is_haswell)
2537       return false;
2538
2539    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2540 }
2541
2542 void
2543 vec4_visitor::emit_texture(ir_texture_opcode op,
2544                            dst_reg dest,
2545                            const glsl_type *dest_type,
2546                            src_reg coordinate,
2547                            int coord_components,
2548                            src_reg shadow_comparitor,
2549                            src_reg lod, src_reg lod2,
2550                            src_reg sample_index,
2551                            uint32_t constant_offset,
2552                            src_reg offset_value,
2553                            src_reg mcs,
2554                            bool is_cube_array,
2555                            uint32_t sampler,
2556                            src_reg sampler_reg)
2557 {
2558    enum opcode opcode;
2559    switch (op) {
2560    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2561    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2562    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2563    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2564    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2565    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2566    case ir_tg4: opcode = offset_value.file != BAD_FILE
2567                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2568    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2569    case ir_txb:
2570       unreachable("TXB is not valid for vertex shaders.");
2571    case ir_lod:
2572       unreachable("LOD is not valid for vertex shaders.");
2573    default:
2574       unreachable("Unrecognized tex op");
2575    }
2576
2577    vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2578       opcode, dst_reg(this, dest_type));
2579
2580    inst->offset = constant_offset;
2581
2582    /* The message header is necessary for:
2583     * - Gen4 (always)
2584     * - Gen9+ for selecting SIMD4x2
2585     * - Texel offsets
2586     * - Gather channel selection
2587     * - Sampler indices too large to fit in a 4-bit value.
2588     */
2589    inst->header_size =
2590       (devinfo->gen < 5 || devinfo->gen >= 9 ||
2591        inst->offset != 0 || op == ir_tg4 ||
2592        is_high_sampler(sampler_reg)) ? 1 : 0;
2593    inst->base_mrf = 2;
2594    inst->mlen = inst->header_size + 1; /* always at least one */
2595    inst->dst.writemask = WRITEMASK_XYZW;
2596    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2597
2598    inst->src[1] = sampler_reg;
2599
2600    /* MRF for the first parameter */
2601    int param_base = inst->base_mrf + inst->header_size;
2602
2603    if (op == ir_txs || op == ir_query_levels) {
2604       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2605       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2606    } else {
2607       /* Load the coordinate */
2608       /* FINISHME: gl_clamp_mask and saturate */
2609       int coord_mask = (1 << coord_components) - 1;
2610       int zero_mask = 0xf & ~coord_mask;
2611
2612       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2613                coordinate));
2614
2615       if (zero_mask != 0) {
2616          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2617                   src_reg(0)));
2618       }
2619       /* Load the shadow comparitor */
2620       if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2621          emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2622                           WRITEMASK_X),
2623                   shadow_comparitor));
2624          inst->mlen++;
2625       }
2626
2627       /* Load the LOD info */
2628       if (op == ir_tex || op == ir_txl) {
2629          int mrf, writemask;
2630          if (devinfo->gen >= 5) {
2631             mrf = param_base + 1;
2632             if (shadow_comparitor.file != BAD_FILE) {
2633                writemask = WRITEMASK_Y;
2634                /* mlen already incremented */
2635             } else {
2636                writemask = WRITEMASK_X;
2637                inst->mlen++;
2638             }
2639          } else /* devinfo->gen == 4 */ {
2640             mrf = param_base;
2641             writemask = WRITEMASK_W;
2642          }
2643          lod.swizzle = BRW_SWIZZLE_XXXX;
2644          emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2645       } else if (op == ir_txf) {
2646          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2647       } else if (op == ir_txf_ms) {
2648          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2649                   sample_index));
2650          if (devinfo->gen >= 7) {
2651             /* MCS data is in the first channel of `mcs`, but we need to get it into
2652              * the .y channel of the second vec4 of params, so replicate .x across
2653              * the whole vec4 and then mask off everything except .y
2654              */
2655             mcs.swizzle = BRW_SWIZZLE_XXXX;
2656             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2657                      mcs));
2658          }
2659          inst->mlen++;
2660       } else if (op == ir_txd) {
2661          const brw_reg_type type = lod.type;
2662
2663          if (devinfo->gen >= 5) {
2664             lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2665             lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2666             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2667             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2668             inst->mlen++;
2669
2670             if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2671                lod.swizzle = BRW_SWIZZLE_ZZZZ;
2672                lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2673                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2674                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2675                inst->mlen++;
2676
2677                if (shadow_comparitor.file != BAD_FILE) {
2678                   emit(MOV(dst_reg(MRF, param_base + 2,
2679                                    shadow_comparitor.type, WRITEMASK_Z),
2680                            shadow_comparitor));
2681                }
2682             }
2683          } else /* devinfo->gen == 4 */ {
2684             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2685             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2686             inst->mlen += 2;
2687          }
2688       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2689          if (shadow_comparitor.file != BAD_FILE) {
2690             emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2691                      shadow_comparitor));
2692          }
2693
2694          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2695                   offset_value));
2696          inst->mlen++;
2697       }
2698    }
2699
2700    emit(inst);
2701
2702    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2703     * spec requires layers.
2704     */
2705    if (op == ir_txs && is_cube_array) {
2706       emit_math(SHADER_OPCODE_INT_QUOTIENT,
2707                 writemask(inst->dst, WRITEMASK_Z),
2708                 src_reg(inst->dst), src_reg(6));
2709    }
2710
2711    if (devinfo->gen == 6 && op == ir_tg4) {
2712       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2713    }
2714
2715    swizzle_result(op, dest,
2716                   src_reg(inst->dst), sampler, dest_type);
2717 }
2718
2719 void
2720 vec4_visitor::visit(ir_texture *ir)
2721 {
        /* Lowers one ir_texture node.  The sampler index is resolved first
         * (as an immediate, or as emitted code for non-constant array
         * indexing), then every operand subtree is evaluated into a register,
         * and finally everything is handed to emit_texture().  The value of
         * the texture operation is left in this->result.
         */
2722    uint32_t sampler =
2723       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2724
2725    ir_rvalue *nonconst_sampler_index =
2726       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2727
2728    /* Handle non-constant sampler array indexing */
2729    src_reg sampler_reg;
2730    if (nonconst_sampler_index) {
2731       /* The highest sampler which may be used by this operation is
2732        * the last element of the array. Mark it here, because the generator
2733        * doesn't have enough information to determine the bound.
2734        */
2735       uint32_t array_size = ir->sampler->as_dereference_array()
2736          ->array->type->array_size();
2737
2738       uint32_t max_used = sampler + array_size - 1;
           /* tg4 before gen8 is counted from a separate binding table section. */
2739       if (ir->op == ir_tg4 && devinfo->gen < 8) {
2740          max_used += prog_data->base.binding_table.gather_texture_start;
2741       } else {
2742          max_used += prog_data->base.binding_table.texture_start;
2743       }
2744
2745       brw_mark_surface_used(&prog_data->base, max_used);
2746
2747       /* Emit code to evaluate the actual indexing expression */
2748       nonconst_sampler_index->accept(this);
2749       src_reg temp(this, glsl_type::uint_type);
2750       emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2751       sampler_reg = emit_uniformize(temp);
2752    } else {
2753       /* Single sampler, or constant array index; the indexing expression
2754        * is just an immediate.
2755        */
2756       sampler_reg = src_reg(sampler);
2757    }
2758
2759    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2760     * emitting anything other than setting up the constant result.
2761     */
2762    if (ir->op == ir_tg4) {
2763       ir_constant *chan = ir->lod_info.component->as_constant();
2764       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2765       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2766          dst_reg result(this, ir->type);
2767          this->result = src_reg(result);
2768          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2769          return;
2770       }
2771    }
2772
2773    /* Should be lowered by do_lower_texture_projection */
2774    assert(!ir->projector);
2775
2776    /* Should be lowered */
2777    assert(!ir->offset || !ir->offset->type->is_array());
2778
2779    /* Generate code to compute all the subexpression trees.  This has to be
2780     * done before loading any values into MRFs for the sampler message since
2781     * generating these values may involve SEND messages that need the MRFs.
2782     */
2783    src_reg coordinate;
2784    int coord_components = 0;
2785    if (ir->coordinate) {
2786       coord_components = ir->coordinate->type->vector_elements;
2787       ir->coordinate->accept(this);
2788       coordinate = this->result;
2789    }
2790
2791    src_reg shadow_comparitor;
2792    if (ir->shadow_comparitor) {
2793       ir->shadow_comparitor->accept(this);
2794       shadow_comparitor = this->result;
2795    }
2796
        /* Only a non-constant offset needs a register; a constant one is
         * packed into constant_offset further down.
         */
2797    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2798    src_reg offset_value;
2799    if (has_nonconstant_offset) {
2800       ir->offset->accept(this);
2801       offset_value = src_reg(this->result);
2802    }
2803
        /* Collect the opcode-specific operand(s).  Registers that a case does
         * not assign stay default-constructed.
         */
2804    src_reg lod, lod2, sample_index, mcs;
2805    switch (ir->op) {
2806    case ir_tex:
2807       lod = src_reg(0.0f);
2808       break;
2809    case ir_txf:
2810    case ir_txl:
2811    case ir_txs:
2812       ir->lod_info.lod->accept(this);
2813       lod = this->result;
2814       break;
2815    case ir_query_levels:
2816       lod = src_reg(0);
2817       break;
2818    case ir_txf_ms:
2819       ir->lod_info.sample_index->accept(this);
2820       sample_index = this->result;
2821
           /* MCS data is fetched only on gen7+ and only for samplers flagged
            * in the key's compressed-multisample mask; otherwise zero is used.
            */
2822       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2823          mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2824       else
2825          mcs = src_reg(0u);
2826       break;
2827    case ir_txd:
           /* The two gradients travel in lod (dPdx) and lod2 (dPdy). */
2828       ir->lod_info.grad.dPdx->accept(this);
2829       lod = this->result;
2830
2831       ir->lod_info.grad.dPdy->accept(this);
2832       lod2 = this->result;
2833       break;
2834    case ir_txb:
2835    case ir_lod:
2836    case ir_tg4:
2837       break;
2838    }
2839
2840    uint32_t constant_offset = 0;
2841    if (ir->offset != NULL && !has_nonconstant_offset) {
2842       constant_offset  =
2843          brw_texture_offset(ir->offset->as_constant()->value.i,
2844                             ir->offset->type->vector_elements);
2845    }
2846
2847    /* Stuff the channel select bits in the top of the texture offset */
2848    if (ir->op == ir_tg4)
2849       constant_offset |=
2850          gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2851                          sampler) << 16;
2852
2853    glsl_type const *type = ir->sampler->type;
2854    bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2855       type->sampler_array;
2856
        /* The destination is allocated only now: every accept() call above
         * handed its own value back through this->result.
         */
2857    this->result = src_reg(this, ir->type);
2858    dst_reg dest = dst_reg(this->result);
2859
2860    emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2861                 shadow_comparitor,
2862                 lod, lod2, sample_index,
2863                 constant_offset, offset_value,
2864                 mcs, is_cube_array, sampler, sampler_reg);
2865 }
2866
2867 /**
2868  * Gen6 gather4 workaround for UINT/SINT formats: rescale the UNORM result
2869  * to an integer and, when requested by \p wa, sign-extend it.
2870  */
2871 void
2872 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2873 {
2874    if (wa == 0)
2875       return;
2876
2877    const int bits = (wa & WA_8BIT) ? 8 : 16;
2878    const float unorm_max = (float)((1 << bits) - 1);
2879    dst_reg float_dst = dst;
2880    float_dst.type = BRW_REGISTER_TYPE_F;
2881
2882    /* UNORM -> UINT: scale up, then MOV from the float view into dst. */
2883    emit(MUL(float_dst, src_reg(float_dst), src_reg(unorm_max)));
2884    emit(MOV(dst, src_reg(float_dst)));
2885
2886    if (!(wa & WA_SIGN))
2887       return;
2888
2889    /* Sign-extend: shift the sign bit up to bit 31, then ASR back down. */
2890    emit(SHL(dst, src_reg(dst), src_reg(32 - bits)));
2891    emit(ASR(dst, src_reg(dst), src_reg(32 - bits)));
2892 }
2893
2894 /**
2895  * Map a gather4 component through the sampler's swizzle to a channel index.
2896  */
2897 uint32_t
2898 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2899 {
2900    const int selected = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2901
2902    if (selected == SWIZZLE_X)
2903       return 0;
2904    if (selected == SWIZZLE_Z)
2905       return 2;
2906    if (selected == SWIZZLE_W)
2907       return 3;
2908    if (selected == SWIZZLE_Y) {
2909       /* The gather4 sampler is broken for the green channel on RG32F, so
2910        * samplers flagged in the quirk mask must ask for blue instead.
2911        */
2912       return (key->tex.gather_channel_quirk_mask & (1<<sampler)) ? 2 : 1;
2913    }
2914    unreachable("Not reached"); /* zero, one swizzles handled already */
2915 }
2916
2917 void
2918 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2919                              src_reg orig_val, uint32_t sampler,
2920                              const glsl_type *dest_type)
2921 {
2922    /* Copy the sampler result into dest, applying the key's swizzle. */
2923    const int tex_swizzle = key->tex.swizzles[sampler];
2924    dst_reg out = dest;
2925
2926    if (op == ir_query_levels) {
2927       /* The level count is returned in .w; replicate it. */
2928       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2929       emit(MOV(out, orig_val));
2930       return;
2931    }
2932
2933    /* These cases take the result as-is, with no swizzling. */
2934    const bool passthrough = op == ir_txs || op == ir_tg4 ||
2935                             dest_type == glsl_type::float_type ||
2936                             tex_swizzle == SWIZZLE_NOOP;
2937    if (passthrough) {
2938       emit(MOV(out, orig_val));
2939       return;
2940    }
2941
2942    /* Split the channels into copy / force-zero / force-one writemasks. */
2943    int copy_bits = 0, zero_bits = 0, one_bits = 0;
2944    int from[4] = {0};
2945
2946    for (int chan = 0; chan < 4; chan++) {
2947       const int sel = GET_SWZ(tex_swizzle, chan);
2948       if (sel == SWIZZLE_ZERO) {
2949          zero_bits |= 1 << chan;
2950       } else if (sel == SWIZZLE_ONE) {
2951          one_bits |= 1 << chan;
2952       } else {
2953          copy_bits |= 1 << chan;
2954          from[chan] = sel;
2955       }
2956    }
2957
2958    if (copy_bits) {
2959       orig_val.swizzle = BRW_SWIZZLE4(from[0], from[1], from[2], from[3]);
2960       out.writemask = copy_bits;
2961       emit(MOV(out, orig_val));
2962    }
2963
2964    if (zero_bits) {
2965       out.writemask = zero_bits;
2966       emit(MOV(out, src_reg(0.0f)));
2967    }
2968
2969    if (one_bits) {
2970       out.writemask = one_bits;
2971       emit(MOV(out, src_reg(1.0f)));
2972    }
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_return *)
2977 {
        /* ir_return nodes are never expected to reach this visitor; getting
         * here is a bug.  NOTE(review): presumably returns are lowered away
         * before this backend runs -- confirm which pass guarantees it.
         */
2978    unreachable("not reached");
2979 }
2980
2981 void
2982 vec4_visitor::visit(ir_discard *)
2983 {
        /* ir_discard nodes are never expected to reach this visitor; getting
         * here is a bug.  NOTE(review): presumably because none of the stages
         * compiled through this path can discard -- confirm.
         */
2984    unreachable("not reached");
2985 }
2986
2987 void
2988 vec4_visitor::visit(ir_if *ir)
2989 {
2990    /* Annotations reference the condition rather than the if statement
2991     * itself, which would print the then and else blocks along with it.
2992     */
2993    base_ir = ir->condition;
2994
2995    if (devinfo->gen != 6) {
2996       enum brw_predicate pred;
2997       emit_bool_to_cond_code(ir->condition, &pred);
2998       emit(IF(pred));
2999    } else {
3000       emit_if_gen6(ir);
3001    }
3002
3003    visit_instructions(&ir->then_instructions);
3004
3005    const bool has_else = !ir->else_instructions.is_empty();
3006    if (has_else) {
3007       base_ir = ir->condition;
3008       emit(BRW_OPCODE_ELSE);
3009       visit_instructions(&ir->else_instructions);
3010    }
3011
3012    base_ir = ir->condition;
3013    emit(BRW_OPCODE_ENDIF);
3014 }
3015
3016 void
3017 vec4_visitor::gs_emit_vertex(int stream_id)
3018 {
        /* This implementation must never be called; \p stream_id is unused.
         * NOTE(review): presumably a geometry-shader specific visitor
         * provides the real version -- confirm.
         */
3019    unreachable("not reached");
3020 }
3021
3022 void
3023 vec4_visitor::visit(ir_emit_vertex *)
3024 {
        /* ir_emit_vertex is never expected to reach this visitor; getting
         * here is a bug.  NOTE(review): presumably only a geometry-shader
         * specific visitor handles it -- confirm.
         */
3025    unreachable("not reached");
3026 }
3027
3028 void
3029 vec4_visitor::gs_end_primitive()
3030 {
        /* This implementation must never be called.  NOTE(review): presumably
         * a geometry-shader specific visitor provides the real version --
         * confirm.
         */
3031    unreachable("not reached");
3032 }
3033
3034
3035 void
3036 vec4_visitor::visit(ir_end_primitive *)
3037 {
        /* ir_end_primitive is never expected to reach this visitor; getting
         * here is a bug.  NOTE(review): presumably only a geometry-shader
         * specific visitor handles it -- confirm.
         */
3038    unreachable("not reached");
3039 }
3040
3041 void
3042 vec4_visitor::visit(ir_barrier *)
3043 {
        /* ir_barrier is never expected to reach this visitor; getting here
         * is a bug.  NOTE(review): presumably no stage compiled through this
         * path supports barriers -- confirm.
         */
3044    unreachable("not reached");
3045 }
3046
3047 void
3048 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3049                                   dst_reg dst, src_reg offset,
3050                                   src_reg src0, src_reg src1)
3051 {
3052    /* Build the message payload, one MRF per entry counting up from 0:
3053     * the offset first, then whichever of the two operands are present.
3054     */
3055    const src_reg *const payload[] = { &offset, &src0, &src1 };
3056    unsigned regs_used = 0;
3057
3058    for (unsigned i = 0; i < 3; i++) {
3059       /* The offset is sent unconditionally; operands only if valid. */
3060       if (i != 0 && payload[i]->file == BAD_FILE)
3061          continue;
3062
3063       emit(MOV(brw_writemask(brw_uvec_mrf(8, regs_used, 0), WRITEMASK_X),
3064                *payload[i]));
3065       regs_used++;
3066    }
3067
3068    /* Emit the instruction.  On Ivy Bridge this maps to the normal SIMD8
3069     * untyped atomic message, which is fine since the unused channels
3070     * will be masked out.
3071     */
3072    vec4_instruction *atomic = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3073                                    brw_message_reg(0),
3074                                    src_reg(surf_index), src_reg(atomic_op));
3075
3076    atomic->mlen = regs_used;
3077 }
3078
3079 void
3080 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3081                                         src_reg offset)
3082 {
3083    /* The payload is a single MRF carrying the read offset in .x. */
3084    const dst_reg payload(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X));
3085    emit(MOV(payload, offset));
3086
3087    /* This maps to the normal SIMD8 untyped surface read message, which
3088     * is OK because unused channels will be masked out.
3089     */
3090    vec4_instruction *read = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3091                                  brw_message_reg(0),
3092                                  src_reg(surf_index), src_reg(1));
3093    read->mlen = 1;
3094 }
3095
3096 void
3097 vec4_visitor::emit_ndc_computation()
3098 {
3099    /* NDC is (x/w, y/w, z/w, 1/w) of the position output. */
3100    const src_reg position = src_reg(output_reg[VARYING_SLOT_POS]);
3101    src_reg pos_wwww = position;
3102    pos_wwww.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3103
3104    /* Allocate the NDC register and record it as an output. */
3105    const dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3106    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3107
3108    dst_reg inv_w = ndc;
3109    inv_w.writemask = WRITEMASK_W;
3110    dst_reg xyz = ndc;
3111    xyz.writemask = WRITEMASK_XYZ;
3112
3113    current_annotation = "NDC";
3114    /* First .w = 1 / pos.w, then .xyz = pos.xyz * .w */
3115    emit_math(SHADER_OPCODE_RCP, inv_w, pos_wwww);
3116    emit(MUL(xyz, position, src_reg(inv_w)));
3117 }
3118
3119 void
3120 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3121 {
        /* Writes the PSIZ/flags slot into \p reg.  Three cases:
         *
         * - gen < 6 with a point size, clip distances or the negative-rhw
         *   workaround: the .w dword is assembled in a zeroed temporary,
         *   which is then copied to reg.
         * - gen < 6 otherwise: reg is simply zeroed.
         * - gen >= 6: reg is zeroed, then PSIZ, layer and viewport are copied
         *   into .w, .y and .z respectively, for whichever slots are valid.
         */
3122    if (devinfo->gen < 6 &&
3123        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3124         output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3125         devinfo->has_negative_rhw_bug)) {
3126       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3127       dst_reg header1_w = header1;
3128       header1_w.writemask = WRITEMASK_W;
3129
3130       emit(MOV(header1, 0u));
3131
3132       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3133          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3134
3135          current_annotation = "Point size";
              /* Scale the size by 2^11, then keep only bits [18:8]. */
3136          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3137          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3138       }
3139
3140       if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3141          current_annotation = "Clipping flags";
3142          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3143          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3144
              /* Each group of four distances is compared against 0.0 ("less
               * than"), and the unpacked comparison flags are ORed into the
               * header; the second group is shifted up by 4 bits first.
               */
3145          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3146          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3147          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3148
3149          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3150          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3151          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3152          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3153       }
3154
3155       /* i965 clipping workaround:
3156        * 1) Test for -ve rhw
3157        * 2) If set,
3158        *      set ndc = (0,0,0,0)
3159        *      set ucp[6] = 1
3160        *
3161        * Later, clipping will detect ucp[6] and ensure the primitive is
3162        * clipped against all fixed planes.
3163        */
3164       if (devinfo->has_negative_rhw_bug) {
3165          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3166          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3167          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
              /* Both instructions below are predicated on that comparison. */
3168          vec4_instruction *inst;
3169          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3170          inst->predicate = BRW_PREDICATE_NORMAL;
3171          output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3172          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3173          inst->predicate = BRW_PREDICATE_NORMAL;
3174       }
3175
3176       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3177    } else if (devinfo->gen < 6) {
3178       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3179    } else {
3180       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3181       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3182          dst_reg reg_w = reg;
3183          reg_w.writemask = WRITEMASK_W;
              /* Read PSIZ with reg's own type and a single-component swizzle. */
3184          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3185          reg_as_src.type = reg_w.type;
3186          reg_as_src.swizzle = brw_swizzle_for_size(1);
3187          emit(MOV(reg_w, reg_as_src));
3188       }
3189       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3190          dst_reg reg_y = reg;
3191          reg_y.writemask = WRITEMASK_Y;
3192          reg_y.type = BRW_REGISTER_TYPE_D;
              /* Note: this also retypes the stored output register itself. */
3193          output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3194          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3195       }
3196       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3197          dst_reg reg_z = reg;
3198          reg_z.writemask = WRITEMASK_Z;
3199          reg_z.type = BRW_REGISTER_TYPE_D;
              /* Same as for the layer: the output register is retyped too. */
3200          output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3201          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3202       }
3203    }
3204 }
3205
3206 void
3207 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3208 {
3209    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3210     *
3211     *     "If a linked set of shaders forming the vertex stage contains no
3212     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3213     *     application has requested clipping against user clip planes through
3214     *     the API, then the coordinate written to gl_Position is used for
3215     *     comparison against the user clip planes."
3216     *
3217     * This function is only called if the shader didn't write to
3218     * gl_ClipDistance.  So the planes are tested against gl_ClipVertex when
3219     * that slot is valid, and against gl_Position otherwise.
3220     */
3221    const gl_varying_slot source_slot =
3222       (prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX) ?
3223       VARYING_SLOT_CLIP_VERTEX : VARYING_SLOT_POS;
3224    /* One DP4 per plane, at most four, each landing in its own channel. */
3225    for (int chan = 0; chan < 4; ++chan) {
3226       const int plane = chan + offset;
3227       if (plane >= key->nr_userclip_plane_consts)
3228          break;
3229       reg.writemask = 1 << chan;
3230       emit(DP4(reg, src_reg(output_reg[source_slot]),
3231                src_reg(this->userplane[plane])));
3232    }
3233 }
3234
3235 vec4_instruction *
3236 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3237 {
3238    assert(varying < VARYING_SLOT_MAX);
3239    const dst_reg &output = output_reg[varying];
3240    assert(output.type == reg.type);
3241    current_annotation = output_reg_annotation[varying];
3242    return emit(MOV(reg, src_reg(output))); /* caller may set saturate */
3243 }
3244
3245 void
3246 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3247 {
3248    /* Both ends of the copy are retyped to float before anything else. */
3249    reg.type = BRW_REGISTER_TYPE_F;
3250    output_reg[varying].type = reg.type;
3251
3252    switch (varying) {
3253    case BRW_VARYING_SLOT_PAD:
3254       /* Padding slot: nothing needs to be written. */
3255       break;
3256    case VARYING_SLOT_PSIZ:
3257       /* PSIZ is always in slot 0, and is coupled with other flags. */
3258       current_annotation = "indices, point width, clip flags";
3259       emit_psiz_and_flags(reg);
3260       break;
3261    case BRW_VARYING_SLOT_NDC:
3262    case VARYING_SLOT_POS:
3263       current_annotation =
3264          varying == VARYING_SLOT_POS ? "gl_Position" : "NDC";
3265       emit(MOV(reg, src_reg(output_reg[varying])));
3266       break;
3267    case VARYING_SLOT_EDGE: {
3268       /* This is present when doing unfilled polygons.  We're supposed to
3269        * copy the edge flag from the user-provided vertex array
3270        * (glEdgeFlagPointer), or otherwise from the current value of that
3271        * attribute (starts as 1.0f).  Clipping then uses it to determine
3272        * which edges should be drawn as wireframe.
3273        */
3274       const dst_reg edge_attr(ATTR, VERT_ATTRIB_EDGEFLAG,
3275                               glsl_type::float_type, WRITEMASK_XYZW);
3276       current_annotation = "edge flag";
3277       emit(MOV(reg, src_reg(edge_attr)));
3278       break;
3279    }
3280
3281    case VARYING_SLOT_COL0:
3282    case VARYING_SLOT_COL1:
3283    case VARYING_SLOT_BFC0:
3284    case VARYING_SLOT_BFC1: {
3285       /* These built-in varyings are only supported in compatibility mode,
3286        * and we only support GS in core profile.  So, this must be a vertex
3287        * shader, which makes the key cast below valid.
3288        */
3289       assert(stage == MESA_SHADER_VERTEX);
3290       vec4_instruction *copy = emit_generic_urb_slot(reg, varying);
3291       if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3292          copy->saturate = true;
3293       break;
3294    }
3295
3296    default:
3297       emit_generic_urb_slot(reg, varying);
3298       break;
3299    }
3300 }
3301
3302 static int
3303 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3304 {
3305    /* Only gen6+ has the length restriction handled below. */
3306    if (devinfo->gen < 6)
3307       return mlen;
3308
3309    /* URB data written (does not include the message header reg) must
3310     * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
3311     * section 5.4.3.2.2: URB_INTERLEAVED.
3312     *
3313     * With the header counted in, that means mlen has to be odd.  URB
3314     * entries are allocated on a multiple of 1024 bits, so writing an
3315     * extra 128 bits to make the end align to 256 is no problem.
3316     */
3317    const bool already_odd = (mlen % 2) == 1;
3318    return already_odd ? mlen : mlen + 1;
3319 }
3320
3321
3322 /**
3323  * Generates the VUE payload plus the necessary URB write instructions to
3324  * output it.
3325  *
      * Slots are emitted in vue_map order.  When they do not all fit in the
      * MRFs up to max_usable_mrf, the payload is split over several URB
      * writes, each one starting again right after the header register.
      *
3326  * The VUE layout is documented in Volume 2a.
3327  */
3328 void
3329 vec4_visitor::emit_vertex()
3330 {
3331    /* MRF 0 is reserved for the debugger, so start with message header
3332     * in MRF 1.
3333     */
3334    int base_mrf = 1;
3335    int mrf = base_mrf;
3336    /* In the process of generating our URB write message contents, we
3337     * may need to unspill a register or load from an array.  Those
3338     * reads would use MRFs 14-15.
3339     */
3340    int max_usable_mrf = 13;
3341
3342    /* The following assertion verifies that max_usable_mrf causes an
3343     * even-numbered amount of URB write data, which will meet gen6's
3344     * requirements for length alignment.
3345     */
3346    assert ((max_usable_mrf - base_mrf) % 2 == 0);
3347
3348    /* First mrf is the g0-based message header containing URB handles and
3349     * such.
3350     */
3351    emit_urb_write_header(mrf++);
3352
        /* NDC is only computed before gen6. */
3353    if (devinfo->gen < 6) {
3354       emit_ndc_computation();
3355    }
3356
3357    /* Lower legacy ff and ClipVertex clipping to clip distances */
3358    if (key->nr_userclip_plane_consts > 0) {
3359       current_annotation = "user clip distances";
3360
3361       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3362       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3363
           /* Planes 0-3 go to the first register, planes 4-7 to the second. */
3364       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3365       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3366    }
3367
3368    /* We may need to split this up into several URB writes, so do them in a
3369     * loop.
3370     */
3371    int slot = 0;
3372    bool complete = false;
3373    do {
3374       /* URB offset is in URB row increments, and each of our MRFs is half of
3375        * one of those, since we're doing interleaved writes.
3376        */
3377       int offset = slot / 2;
3378
           /* Data always restarts in the register right after the header. */
3379       mrf = base_mrf + 1;
3380       for (; slot < prog_data->vue_map.num_slots; ++slot) {
3381          emit_urb_slot(dst_reg(MRF, mrf++),
3382                        prog_data->vue_map.slot_to_varying[slot]);
3383
3384          /* If this was max_usable_mrf, we can't fit anything more into this
3385           * URB WRITE.
3386           */
3387          if (mrf > max_usable_mrf) {
                 /* The break skips the loop's ++slot, so advance by hand. */
3388             slot++;
3389             break;
3390          }
3391       }
3392
3393       complete = slot >= prog_data->vue_map.num_slots;
3394       current_annotation = "URB write";
3395       vec4_instruction *inst = emit_urb_write_opcode(complete);
3396       inst->base_mrf = base_mrf;
           /* mrf - base_mrf counts the header plus the data registers. */
3397       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
           /* Added on top of whatever offset the instruction already has. */
3398       inst->offset += offset;
3399    } while(!complete);
3400 }
3401
3402
3403 src_reg
3404 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3405                                  src_reg *reladdr, int reg_offset)
3406 {
3407    /* Scratch values are stored interleaved like our vertex data, so the
3408     * vec4 index is scaled by 2.  Pre-gen6, the message header additionally
3409     * takes byte offsets rather than vec4 (16-byte) units.
3410     */
3411    const bool byte_units = devinfo->gen < 6;
3412    const int scale = byte_units ? 2 * 16 : 2;
3413
3414    /* Constant offset: fold the scaling into an immediate. */
3415    if (!reladdr)
3416       return src_reg(reg_offset * scale);
3417
3418    /* Indirect offset: compute (reladdr + reg_offset) * scale in a new
3419     * temporary, with the arithmetic placed just before \p inst.
3420     */
3421    src_reg scaled = src_reg(this, glsl_type::int_type);
3422    const dst_reg scaled_dst = dst_reg(scaled);
3423
3424    emit_before(block, inst,
3425                ADD(scaled_dst, *reladdr, src_reg(reg_offset)));
3426    emit_before(block, inst,
3427                MUL(scaled_dst, scaled, src_reg(scale)));
3428
3429    return scaled;
3430 }
3431
3432 src_reg
3433 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3434                                        src_reg *reladdr, int reg_offset)
3435 {
3436    /* Pre-gen6, the message header uses byte offsets instead of vec4
3437     * (16-byte) offset units.
3438     */
3439    const bool byte_units = devinfo->gen < 6;
3440
3441    if (reladdr != NULL) {
3442       /* Indirect: reladdr + reg_offset, computed ahead of inst. */
3443       src_reg sum = src_reg(this, glsl_type::int_type);
3444       emit_before(block, inst,
3445                   ADD(dst_reg(sum), *reladdr, src_reg(reg_offset)));
3446       if (byte_units)
3447          emit_before(block, inst, MUL(dst_reg(sum), sum, src_reg(16)));
3448       return sum;
3449    }
3450
3451    if (devinfo->gen >= 8) {
3452       /* Store the offset in a GRF so we can send-from-GRF. */
3453       src_reg in_grf = src_reg(this, glsl_type::int_type);
3454       emit_before(block, inst, MOV(dst_reg(in_grf), src_reg(reg_offset)));
3455       return in_grf;
3456    }
3457
3458    return src_reg(reg_offset * (byte_units ? 16 : 1));
3459 }
3460
3461 /**
3462  * Loads the value named by @orig_src from scratch space into @temp, with
3463  * the load (and any offset arithmetic) inserted before @inst.
3464  *
3465  * @base_offset is measured in 32-byte units (the size of a register).
3466  */
3467 void
3468 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3469                                 dst_reg temp, src_reg orig_src,
3470                                 int base_offset)
3471 {
3472    /* Scratch location = the given base plus the source's own offset. */
3473    const src_reg scratch_index =
3474       get_scratch_offset(block, inst, orig_src.reladdr,
3475                          base_offset + orig_src.reg_offset);
3476    emit_before(block, inst, SCRATCH_READ(temp, scratch_index));
3477 }
3478
3479 /**
3480  * Emits an instruction after @inst to store the value that @inst writes
3481  * to scratch space at @base_offset, and redirects @inst to a temporary.
3482  *
3483  * @base_offset is measured in 32-byte units (the size of a register).
3484  */
3485 void
3486 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3487                                  int base_offset)
3488 {
3489    int reg_offset = base_offset + inst->dst.reg_offset;
3490    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3491                                       reg_offset);
3492
3493    /* Create a temporary register to store *inst's result in.
3494     *
3495     * We have to be careful in MOVing from our temporary result register in
3496     * the scratch write.  If we swizzle from channels of the temporary that
3497     * weren't initialized, it will confuse live interval analysis, which will
3498     * make spilling fail to make progress.
3499     */
3500    const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3501                                        inst->dst.type),
3502                                 brw_swizzle_for_mask(inst->dst.writemask));
        /* The write's own destination is built on g0 and carries inst's
         * writemask.  NOTE(review): presumably only the writemask matters
         * here, not the register -- confirm.
         */
3503    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3504                                        inst->dst.writemask));
3505    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
        /* The write inherits inst's predicate, except when inst is a SEL.
         * NOTE(review): presumably because a SEL writes its destination
         * whichever way the predicate goes -- confirm.
         */
3506    if (inst->opcode != BRW_OPCODE_SEL)
3507       write->predicate = inst->predicate;
3508    write->ir = inst->ir;
3509    write->annotation = inst->annotation;
3510    inst->insert_after(block, write);
3511
        /* Finally retarget inst at the temporary; the write inserted after it
         * is what stores that temporary to scratch.
         */
3512    inst->dst.file = temp.file;
3513    inst->dst.reg = temp.reg;
3514    inst->dst.reg_offset = temp.reg_offset;
3515    inst->dst.reladdr = NULL;
3516 }
3517
3518 /**
3519  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3520  * adds the scratch read(s) before \p inst. Nested reladdr chains are
3521  * handled recursively: each level gets its own scratch load and the
3522  * reladdr reference is rewritten to use the loaded value.
3523  *
3524  * \return \p src if it did not require a scratch load, otherwise, the
3525  * register holding the result of the scratch load that the caller should
3526  * use to rewrite src.
3527  */
3528 src_reg
3529 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3530                                    vec4_instruction *inst, src_reg src)
3531 {
3532    /* Innermost first: recurse so the relative address is itself resolved
3533     * (and rewritten in place) before src is looked at.
3534     */
3535    if (src.reladdr != NULL)
3536       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3537                                           *src.reladdr);
3538
3539    /* Nothing more to do unless src itself was moved to scratch. */
3540    if (src.file != GRF || scratch_loc[src.reg] == -1)
3541       return src;
3542    /* Load it into a fresh temporary and point src at that instead. */
3543    const dst_reg loaded = dst_reg(this, glsl_type::vec4_type);
3544    emit_scratch_read(block, inst, loaded, src, scratch_loc[src.reg]);
3545    src.reg = loaded.reg;
3546    src.reg_offset = loaded.reg_offset;
3547    src.reladdr = NULL;
3548    return src;
3549 }
3550
3551 /**
3552  * We can't generally support array access in GRF space, because a
3553  * single instruction's destination can only span 2 contiguous
3554  * registers.  So, we send all GRF arrays that get variable index
3555  * access to scratch space.
3556  */
3557 void
3558 vec4_visitor::move_grf_array_access_to_scratch()
3559 {
3560    int scratch_loc[this->alloc.count];
3561    memset(scratch_loc, -1, sizeof(scratch_loc));
3562
3563    /* First, calculate the set of virtual GRFs that need to be punted
3564     * to scratch due to having any array access on them, and where in
3565     * scratch.
3566     */
3567    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3568       if (inst->dst.file == GRF && inst->dst.reladdr) {
3569          if (scratch_loc[inst->dst.reg] == -1) {
3570             scratch_loc[inst->dst.reg] = last_scratch;
3571             last_scratch += this->alloc.sizes[inst->dst.reg];
3572          }
3573
3574          for (src_reg *iter = inst->dst.reladdr;
3575               iter->reladdr;
3576               iter = iter->reladdr) {
3577             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3578                scratch_loc[iter->reg] = last_scratch;
3579                last_scratch += this->alloc.sizes[iter->reg];
3580             }
3581          }
3582       }
3583
3584       for (int i = 0 ; i < 3; i++) {
3585          for (src_reg *iter = &inst->src[i];
3586               iter->reladdr;
3587               iter = iter->reladdr) {
3588             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3589                scratch_loc[iter->reg] = last_scratch;
3590                last_scratch += this->alloc.sizes[iter->reg];
3591             }
3592          }
3593       }
3594    }
3595
3596    /* Now, for anything that will be accessed through scratch, rewrite
3597     * it to load/store.  Note that this is a _safe list walk, because
3598     * we may generate a new scratch_write instruction after the one
3599     * we're processing.
3600     */
3601    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3602       /* Set up the annotation tracking for new generated instructions. */
3603       base_ir = inst->ir;
3604       current_annotation = inst->annotation;
3605
3606       /* First handle scratch access on the dst. Notice we have to handle
3607        * the case where the dst's reladdr also points to scratch space.
3608        */
3609       if (inst->dst.reladdr)
3610          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3611                                                    *inst->dst.reladdr);
3612
3613       /* Now that we have handled any (possibly recursive) reladdr scratch
3614        * accesses for dst we can safely do the scratch write for dst itself
3615        */
3616       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3617          emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3618
3619       /* Now handle scratch access on any src. In this case, since inst->src[i]
3620        * already is a src_reg, we can just call emit_resolve_reladdr with
3621        * inst->src[i] and it will take care of handling scratch loads for
3622        * both src and src.reladdr (recursively).
3623        */
3624       for (int i = 0 ; i < 3; i++) {
3625          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3626                                              inst->src[i]);
3627       }
3628    }
3629 }
3630
3631 /**
3632  * Emits an instruction before @inst to load the value named by @orig_src
3633  * from the pull constant buffer (surface) at @base_offset to @temp.
3634  */
3635 void
3636 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3637                                       dst_reg temp, src_reg orig_src,
3638                                       int base_offset)
3639 {
3640    int reg_offset = base_offset + orig_src.reg_offset;
3641    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3642    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3643                                              reg_offset);
3644
3645    emit_pull_constant_load_reg(temp,
3646                                index,
3647                                offset,
3648                                block, inst);
3649 }
3650
3651 /**
3652  * Implements array access of uniforms by inserting a
3653  * PULL_CONSTANT_LOAD instruction.
3654  *
3655  * Unlike temporary GRF array access (where we don't support it due to
3656  * the difficulty of doing relative addressing on instruction
3657  * destinations), we could potentially do array access of uniforms
3658  * that were loaded in GRF space as push constants.  In real-world
3659  * usage we've seen, though, the arrays being used are always larger
3660  * than we could load as push constants, so just always move all
3661  * uniform array access out to a pull constant buffer.
3662  */
3663 void
3664 vec4_visitor::move_uniform_array_access_to_pull_constants()
3665 {
3666    int pull_constant_loc[this->uniforms];
3667    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3668    bool nested_reladdr;
3669
3670    /* Walk through and find array access of uniforms.  Put a copy of that
3671     * uniform in the pull constant buffer.
3672     *
3673     * Note that we don't move constant-indexed accesses to arrays.  No
3674     * testing has been done of the performance impact of this choice.
3675     */
3676    do {
3677       nested_reladdr = false;
3678
3679       foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3680          for (int i = 0 ; i < 3; i++) {
3681             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3682                continue;
3683
3684             int uniform = inst->src[i].reg;
3685
3686             if (inst->src[i].reladdr->reladdr)
3687                nested_reladdr = true;  /* will need another pass */
3688
3689             /* If this array isn't already present in the pull constant buffer,
3690              * add it.
3691              */
3692             if (pull_constant_loc[uniform] == -1) {
3693                const gl_constant_value **values =
3694                   &stage_prog_data->param[uniform * 4];
3695
3696                pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3697
3698                assert(uniform < uniform_array_size);
3699                for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3700                   stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3701                      = values[j];
3702                }
3703             }
3704
3705             /* Set up the annotation tracking for new generated instructions. */
3706             base_ir = inst->ir;
3707             current_annotation = inst->annotation;
3708
3709             dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3710
3711             emit_pull_constant_load(block, inst, temp, inst->src[i],
3712                                     pull_constant_loc[uniform]);
3713
3714             inst->src[i].file = temp.file;
3715             inst->src[i].reg = temp.reg;
3716             inst->src[i].reg_offset = temp.reg_offset;
3717             inst->src[i].reladdr = NULL;
3718          }
3719       }
3720    } while (nested_reladdr);
3721
3722    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3723     * no need to track them as larger-than-vec4 objects.  This will be
3724     * relied on in cutting out unused uniform vectors from push
3725     * constants.
3726     */
3727    split_uniform_registers();
3728 }
3729
3730 void
3731 vec4_visitor::resolve_ud_negate(src_reg *reg)
3732 {
3733    if (reg->type != BRW_REGISTER_TYPE_UD ||
3734        !reg->negate)
3735       return;
3736
3737    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3738    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3739    *reg = temp;
3740 }
3741
3742 /**
3743  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3744  *
3745  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3746  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3747  */
3748 void
3749 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3750 {
3751    assert(devinfo->gen <= 5);
3752
3753    if (!rvalue->type->is_boolean())
3754       return;
3755
3756    src_reg and_result = src_reg(this, rvalue->type);
3757    src_reg neg_result = src_reg(this, rvalue->type);
3758    emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3759    emit(MOV(dst_reg(neg_result), negate(and_result)));
3760    *reg = neg_result;
3761 }
3762
3763 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3764                            void *log_data,
3765                            struct gl_program *prog,
3766                            const struct brw_vue_prog_key *key,
3767                            struct brw_vue_prog_data *prog_data,
3768                            struct gl_shader_program *shader_prog,
3769                            gl_shader_stage stage,
3770                            void *mem_ctx,
3771                            bool no_spills,
3772                            int shader_time_index)
3773    : backend_shader(compiler, log_data, mem_ctx,
3774                     shader_prog, prog, &prog_data->base, stage),
3775      key(key),
3776      prog_data(prog_data),
3777      sanity_param_count(0),
3778      fail_msg(NULL),
3779      first_non_payload_grf(0),
3780      need_all_constants_in_pull_buffer(false),
3781      no_spills(no_spills),
3782      shader_time_index(shader_time_index),
3783      last_scratch(0)
3784 {
3785    this->failed = false;
3786
3787    this->base_ir = NULL;
3788    this->current_annotation = NULL;
3789    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3790
3791    this->variable_ht = hash_table_ctor(0,
3792                                        hash_table_pointer_hash,
3793                                        hash_table_pointer_compare);
3794
3795    this->virtual_grf_start = NULL;
3796    this->virtual_grf_end = NULL;
3797    this->live_intervals = NULL;
3798
3799    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3800
3801    this->uniforms = 0;
3802
3803    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3804     * at least one. See setup_uniforms() in brw_vec4.cpp.
3805     */
3806    this->uniform_array_size = 1;
3807    if (prog_data) {
3808       this->uniform_array_size =
3809          MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3810    }
3811
3812    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3813    this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3814 }
3815
3816 vec4_visitor::~vec4_visitor()
3817 {
3818    hash_table_dtor(this->variable_ht);
3819 }
3820
3821
3822 void
3823 vec4_visitor::fail(const char *format, ...)
3824 {
3825    va_list va;
3826    char *msg;
3827
3828    if (failed)
3829       return;
3830
3831    failed = true;
3832
3833    va_start(va, format);
3834    msg = ralloc_vasprintf(mem_ctx, format, va);
3835    va_end(va);
3836    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3837
3838    this->fail_msg = msg;
3839
3840    if (debug_enabled) {
3841       fprintf(stderr, "%s",  msg);
3842    }
3843 }
3844
3845 } /* namespace brw */