src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
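/**
 * @file brw_vec4_visitor.cpp
 *
 * Walks the GLSL IR of a shader and emits the vec4 backend IR
 * (vec4_instruction) consumed by the i965 vertex/geometry shader compiler.
 */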
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->target = 0;
47 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
48 this->shadow_compare = false;
49 this->ir = NULL;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_present = false;
52 this->mlen = 0;
53 this->base_mrf = 0;
54 this->offset = 0;
55 this->annotation = NULL;
56 }
57
58 vec4_instruction *
59 vec4_visitor::emit(vec4_instruction *inst)
60 {
61 inst->ir = this->base_ir;
62 inst->annotation = this->current_annotation;
63
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
83 const src_reg &src1, const src_reg &src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
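/* Instantiate convenience constructors (NOT, MOV, ADD, ...) that build a
 * vec4_instruction without emitting it; callers typically wrap them in
 * emit(), e.g. emit(MOV(dst, src)).
 */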
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(enum brw_predicate predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1,
199 enum brw_conditional_mod condition)
200 {
201 assert(brw->gen == 6);
202
203 vec4_instruction *inst;
204
205 resolve_ud_negate(&src0);
206 resolve_ud_negate(&src1);
207
208 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
209 src0, src1);
210 inst->conditional_mod = condition;
211
212 return inst;
213 }
214
215 /**
216 * CMP: Sets the low bit of the destination channels with the result
217 * of the comparison, while the upper bits are undefined, and updates
218 * the flag register with the packed 16 bits of the result.
219 */
220 vec4_instruction *
221 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
222 enum brw_conditional_mod condition)
223 {
224 vec4_instruction *inst;
225
226 /* Take the instruction:
227 *
228 * CMP null<d> src0<f> src1<f>
229 *
230 * Original gen4 does type conversion to the destination type before
231 * comparison, producing garbage results for floating point comparisons.
232 *
233 * The destination type doesn't matter on newer generations, so we set the
234 * type to match src0 so we can compact the instruction.
235 */
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 vec4_instruction *
250 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
251 {
252 vec4_instruction *inst;
253
254 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
255 dst, index);
256 inst->base_mrf = 14;
257 inst->mlen = 2;
258
259 return inst;
260 }
261
262 vec4_instruction *
263 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
264 const src_reg &index)
265 {
266 vec4_instruction *inst;
267
268 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
269 dst, src, index);
270 inst->base_mrf = 13;
271 inst->mlen = 3;
272
273 return inst;
274 }
275
276 void
277 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
278 {
279 static enum opcode dot_opcodes[] = {
280 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
281 };
282
283 emit(dot_opcodes[elements - 2], dst, src0, src1);
284 }
285
286 src_reg
287 vec4_visitor::fix_3src_operand(src_reg src)
288 {
289 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
290 * able to use vertical stride of zero to replicate the vec4 uniform, like
291 *
292 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
293 *
294 * But you can't, since vertical stride is always four in three-source
295 * instructions. Instead, insert a MOV instruction to do the replication so
296 * that the three-source instruction can consume it.
297 */
298
299 /* The MOV is only needed if the source is a uniform or immediate. */
300 if (src.file != UNIFORM && src.file != IMM)
301 return src;
302
303 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
309 return src_reg(expanded);
310 }
311
312 src_reg
313 vec4_visitor::fix_math_operand(src_reg src)
314 {
315 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
316 return src;
317
318 /* The gen6 math instruction ignores the source modifiers --
319 * swizzle, abs, negate, and at least some parts of the register
320 * region description.
321 *
322 * Rather than trying to enumerate all these cases, *always* expand the
323 * operand to a temp GRF for gen6.
324 *
325 * For gen7, keep the operand as-is, except if immediate, which gen7 still
326 * can't use.
327 */
328
329 if (brw->gen == 7 && src.file != IMM)
330 return src;
331
332 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
333 expanded.type = src.type;
334 emit(MOV(expanded, src));
335 return src_reg(expanded);
336 }
337
338 void
339 vec4_visitor::emit_math(enum opcode opcode,
340 const dst_reg &dst,
341 const src_reg &src0, const src_reg &src1)
342 {
343 vec4_instruction *math =
344 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
345
346 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
347 /* MATH on Gen6 must be align1, so we can't do writemasks. */
348 math->dst = dst_reg(this, glsl_type::vec4_type);
349 math->dst.type = dst.type;
350 emit(MOV(dst, src_reg(math->dst)));
351 } else if (brw->gen < 6) {
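      /* Pre-gen6 math is a send to the shared math unit, so the operands
       * live in MRFs starting at base_mrf; mlen is one register per operand.
       */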
352 math->base_mrf = 1;
353 math->mlen = src1.file == BAD_FILE ? 1 : 2;
354 }
355 }
356
357 void
358 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
359 {
360 if (brw->gen < 7) {
361 unreachable("ir_unop_pack_half_2x16 should be lowered");
362 }
363
364 assert(dst.type == BRW_REGISTER_TYPE_UD);
365 assert(src0.type == BRW_REGISTER_TYPE_F);
366
367 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
368 *
369 * Because this instruction does not have a 16-bit floating-point type,
370 * the destination data type must be Word (W).
371 *
372 * The destination must be DWord-aligned and specify a horizontal stride
373 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
374 * each destination channel and the upper word is not modified.
375 *
376 * The above restriction implies that the f32to16 instruction must use
377 * align1 mode, because only in align1 mode is it possible to specify
378 * horizontal stride. We choose here to defy the hardware docs and emit
379 * align16 instructions.
380 *
381 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
382 * instructions. I was partially successful in that the code passed all
383 * tests. However, the code was dubiously correct and fragile, and the
384 * tests were not harsh enough to probe that frailty. Not trusting the
385 * code, I chose instead to remain in align16 mode in defiance of the hw
386 * docs).
387 *
388 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
389 * simulator, emitting a f32to16 in align16 mode with UD as destination
390 * data type is safe. The behavior differs from that specified in the PRM
391 * in that the upper word of each destination channel is cleared to 0.
392 */
393
394 dst_reg tmp_dst(this, glsl_type::uvec2_type);
395 src_reg tmp_src(tmp_dst);
396
397 #if 0
398 /* Verify the undocumented behavior on which the following instructions
399 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
400 * then the result of the bit-or instruction below will be incorrect.
401 *
402 * You should inspect the disasm output in order to verify that the MOV is
403 * not optimized away.
404 */
405 emit(MOV(tmp_dst, src_reg(0x12345678u)));
406 #endif
407
408 /* Give tmp the form below, where "." means untouched.
409 *
410 * w z y x w z y x
411 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
412 *
413 * That the upper word of each write-channel be 0 is required for the
414 * following bit-shift and bit-or instructions to work. Note that this
415 * relies on the undocumented hardware behavior mentioned above.
416 */
417 tmp_dst.writemask = WRITEMASK_XY;
418 emit(F32TO16(tmp_dst, src0));
419
420 /* Give the write-channels of dst the form:
421 * 0xhhhh0000
422 */
423 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
424 emit(SHL(dst, tmp_src, src_reg(16u)));
425
426 /* Finally, give the write-channels of dst the form of packHalf2x16's
427 * output:
428 * 0xhhhhllll
429 */
430 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
431 emit(OR(dst, src_reg(dst), tmp_src));
432 }
433
434 void
435 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
436 {
437 if (brw->gen < 7) {
438 unreachable("ir_unop_unpack_half_2x16 should be lowered");
439 }
440
441 assert(dst.type == BRW_REGISTER_TYPE_F);
442 assert(src0.type == BRW_REGISTER_TYPE_UD);
443
444 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
445 *
446 * Because this instruction does not have a 16-bit floating-point type,
447 * the source data type must be Word (W). The destination type must be
448 * F (Float).
449 *
450 * To use W as the source data type, we must adjust horizontal strides,
451 * which is only possible in align1 mode. All my [chadv] attempts at
452 * emitting align1 instructions for unpackHalf2x16 failed to pass the
453 * Piglit tests, so I gave up.
454 *
455 * I've verified that, on gen7 hardware and the simulator, it is safe to
456 * emit f16to32 in align16 mode with UD as source data type.
457 */
458
459 dst_reg tmp_dst(this, glsl_type::uvec2_type);
460 src_reg tmp_src(tmp_dst);
461
462 tmp_dst.writemask = WRITEMASK_X;
463 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
464
465 tmp_dst.writemask = WRITEMASK_Y;
466 emit(SHR(tmp_dst, src0, src_reg(16u)));
467
468 dst.writemask = WRITEMASK_XY;
469 emit(F16TO32(dst, tmp_src));
470 }
471
472 void
473 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
474 {
475 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
476 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
477 * is not suitable to generate the shift values, but we can use the packed
478 * vector float and a type-converting MOV.
479 */
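   /* In the 8-bit vector-float (VF) immediate encoding, the bytes
    * 0x00, 0x60, 0x70 and 0x78 are 0.0, 8.0, 16.0 and 24.0, so the
    * type-converting MOV below yields the shift counts <0, 8, 16, 24>.
    */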
480 dst_reg shift(this, glsl_type::uvec4_type);
481 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
482
483 dst_reg shifted(this, glsl_type::uvec4_type);
484 src0.swizzle = BRW_SWIZZLE_XXXX;
485 emit(SHR(shifted, src0, src_reg(shift)));
486
487 shifted.type = BRW_REGISTER_TYPE_UB;
488 dst_reg f(this, glsl_type::vec4_type);
489 emit(MOV(f, src_reg(shifted)));
490
491 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
492 }
493
494 void
495 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
496 {
497 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
498 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
499 * is not suitable to generate the shift values, but we can use the packed
500 * vector float and a type-converting MOV.
501 */
502 dst_reg shift(this, glsl_type::uvec4_type);
503 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
504
505 dst_reg shifted(this, glsl_type::uvec4_type);
506 src0.swizzle = BRW_SWIZZLE_XXXX;
507 emit(SHR(shifted, src0, src_reg(shift)));
508
509 shifted.type = BRW_REGISTER_TYPE_B;
510 dst_reg f(this, glsl_type::vec4_type);
511 emit(MOV(f, src_reg(shifted)));
512
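   /* Scale the sign-extended bytes by 1/127 and clamp to [-1, 1], matching
    * the GLSL unpackSnorm4x8() rules.
    */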
513 dst_reg scaled(this, glsl_type::vec4_type);
514 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
515
516 dst_reg max(this, glsl_type::vec4_type);
517 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
518 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
519 }
520
521 void
522 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
523 {
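   /* packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to
    * nearest even, convert to unsigned, and pack the four bytes into a dword.
    */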
524 dst_reg saturated(this, glsl_type::vec4_type);
525 vec4_instruction *inst = emit(MOV(saturated, src0));
526 inst->saturate = true;
527
528 dst_reg scaled(this, glsl_type::vec4_type);
529 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
530
531 dst_reg rounded(this, glsl_type::vec4_type);
532 emit(RNDE(rounded, src_reg(scaled)));
533
534 dst_reg u(this, glsl_type::uvec4_type);
535 emit(MOV(u, src_reg(rounded)));
536
537 src_reg bytes(u);
538 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
539 }
540
541 void
542 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg max(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
546
547 dst_reg min(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
549
550 dst_reg scaled(this, glsl_type::vec4_type);
551 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
552
553 dst_reg rounded(this, glsl_type::vec4_type);
554 emit(RNDE(rounded, src_reg(scaled)));
555
556 dst_reg i(this, glsl_type::ivec4_type);
557 emit(MOV(i, src_reg(rounded)));
558
559 src_reg bytes(i);
560 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
561 }
562
563 void
564 vec4_visitor::visit_instructions(const exec_list *list)
565 {
566 foreach_in_list(ir_instruction, ir, list) {
567 base_ir = ir;
568 ir->accept(this);
569 }
570 }
571
572
573 static int
574 type_size(const struct glsl_type *type)
575 {
576 unsigned int i;
577 int size;
578
579 switch (type->base_type) {
580 case GLSL_TYPE_UINT:
581 case GLSL_TYPE_INT:
582 case GLSL_TYPE_FLOAT:
583 case GLSL_TYPE_BOOL:
584 if (type->is_matrix()) {
585 return type->matrix_columns;
586 } else {
587 /* Regardless of size of vector, it gets a vec4. This is bad
588 * packing for things like floats, but otherwise arrays become a
589 * mess. Hopefully a later pass over the code can pack scalars
590 * down if appropriate.
591 */
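         /* For example, a float, a vec2 and a vec4 each occupy one vec4
          * slot here, while a mat3 above returns three (one per column).
          */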
592 return 1;
593 }
594 case GLSL_TYPE_ARRAY:
595 assert(type->length > 0);
596 return type_size(type->fields.array) * type->length;
597 case GLSL_TYPE_STRUCT:
598 size = 0;
599 for (i = 0; i < type->length; i++) {
600 size += type_size(type->fields.structure[i].type);
601 }
602 return size;
603 case GLSL_TYPE_SAMPLER:
604 /* Samplers take up no register space, since they're baked in at
605 * link time.
606 */
607 return 0;
608 case GLSL_TYPE_ATOMIC_UINT:
609 return 0;
610 case GLSL_TYPE_IMAGE:
611 case GLSL_TYPE_VOID:
612 case GLSL_TYPE_ERROR:
613 case GLSL_TYPE_INTERFACE:
614 unreachable("not reached");
615 }
616
617 return 0;
618 }
619
620 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->alloc.allocate(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->swizzle = BRW_SWIZZLE_NOOP;
629 } else {
630 this->swizzle = swizzle_for_size(type->vector_elements);
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
637 {
638 assert(size > 0);
639
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(type_size(type) * size);
644
645 this->swizzle = BRW_SWIZZLE_NOOP;
646
647 this->type = brw_type_for_base_type(type);
648 }
649
650 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
651 {
652 init();
653
654 this->file = GRF;
655 this->reg = v->alloc.allocate(type_size(type));
656
657 if (type->is_array() || type->is_record()) {
658 this->writemask = WRITEMASK_XYZW;
659 } else {
660 this->writemask = (1 << type->vector_elements) - 1;
661 }
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 /* Our support for uniforms is piggy-backed on the struct
667 * gl_fragment_program, because that's where the values actually
668 * get stored, rather than in some global gl_shader_program uniform
669 * store.
670 */
671 void
672 vec4_visitor::setup_uniform_values(ir_variable *ir)
673 {
674 int namelen = strlen(ir->name);
675
676 /* The data for our (non-builtin) uniforms is stored in a series of
677 * gl_uniform_driver_storage structs for each subcomponent that
678 * glGetUniformLocation() could name. We know it's been set up in the same
679 * order we'd walk the type, so walk the list of storage and find anything
680 * with our name, or the prefix of a component that starts with our name.
681 */
682 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
683 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
684
685 if (strncmp(ir->name, storage->name, namelen) != 0 ||
686 (storage->name[namelen] != 0 &&
687 storage->name[namelen] != '.' &&
688 storage->name[namelen] != '[')) {
689 continue;
690 }
691
692 gl_constant_value *components = storage->storage;
693 unsigned vector_count = (MAX2(storage->array_elements, 1) *
694 storage->type->matrix_columns);
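      /* One vec4 slot per matrix column of each array element; e.g. a
       * mat4[2] uniform contributes eight vectors.
       */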
695
696 for (unsigned s = 0; s < vector_count; s++) {
697 assert(uniforms < uniform_array_size);
698 uniform_vector_size[uniforms] = storage->type->vector_elements;
699
700 int i;
701 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
702 stage_prog_data->param[uniforms * 4 + i] = components;
703 components++;
704 }
705 for (; i < 4; i++) {
706 static gl_constant_value zero = { 0.0 };
707 stage_prog_data->param[uniforms * 4 + i] = &zero;
708 }
709
710 uniforms++;
711 }
712 }
713 }
714
715 void
716 vec4_visitor::setup_uniform_clipplane_values()
717 {
718 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
719
720 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
721 assert(this->uniforms < uniform_array_size);
722 this->uniform_vector_size[this->uniforms] = 4;
723 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
724 this->userplane[i].type = BRW_REGISTER_TYPE_F;
725 for (int j = 0; j < 4; ++j) {
726 stage_prog_data->param[this->uniforms * 4 + j] =
727 (gl_constant_value *) &clip_planes[i][j];
728 }
729 ++this->uniforms;
730 }
731 }
732
733 /* Our support for builtin uniforms is even scarier than non-builtin.
734 * It sits on top of the PROG_STATE_VAR parameters that are
735 * automatically updated from GL context state.
736 */
737 void
738 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
739 {
740 const ir_state_slot *const slots = ir->get_state_slots();
741 assert(slots != NULL);
742
743 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
744 /* This state reference has already been setup by ir_to_mesa,
745 * but we'll get the same index back here. We can reference
746 * ParameterValues directly, since unlike brw_fs.cpp, we never
747 * add new state references during compile.
748 */
749 int index = _mesa_add_state_reference(this->prog->Parameters,
750 (gl_state_index *)slots[i].tokens);
751 gl_constant_value *values =
752 &this->prog->Parameters->ParameterValues[index][0];
753
754 assert(this->uniforms < uniform_array_size);
755 this->uniform_vector_size[this->uniforms] = 0;
756 /* Add each of the unique swizzled channels of the element.
757 * This will end up matching the size of the glsl_type of this field.
758 */
759 int last_swiz = -1;
760 for (unsigned int j = 0; j < 4; j++) {
761 int swiz = GET_SWZ(slots[i].swizzle, j);
762 last_swiz = swiz;
763
764 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
765 assert(this->uniforms < uniform_array_size);
766 if (swiz <= last_swiz)
767 this->uniform_vector_size[this->uniforms]++;
768 }
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
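/**
 * Evaluate a boolean rvalue and leave its result in the flag register,
 * folding comparisons directly into the flag-writing instruction where
 * possible. *predicate is set to the predicate the caller should use
 * (BRW_PREDICATE_NORMAL, or an ALL4H/ANY4H variant for vector comparisons).
 */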
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (brw->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (brw->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (brw->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (brw->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (brw->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (brw->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (brw->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (brw->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (brw->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
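/**
 * Try to fold an ir_binop_add whose other operand is a multiply into a
 * single MAD (dst = src1 * src2 + src0). Only applies to float operands
 * on gen6+, where three-source instructions exist.
 */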
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (brw->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul = ir->operands[1];
1159 ir_expression *mul = ir->operands[0]->as_expression();
1160
1161 if (!mul || mul->operation != ir_binop_mul) {
1162 nonmul = ir->operands[0];
1163 mul = ir->operands[1]->as_expression();
1164
1165 if (!mul || mul->operation != ir_binop_mul)
1166 return false;
1167 }
1168
1169 nonmul->accept(this);
1170 src_reg src0 = fix_3src_operand(this->result);
1171
1172 mul->operands[0]->accept(this);
1173 src_reg src1 = fix_3src_operand(this->result);
1174
1175 mul->operands[1]->accept(this);
1176 src_reg src2 = fix_3src_operand(this->result);
1177
1178 this->result = src_reg(this, ir->type);
1179 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1180
1181 return true;
1182 }
1183
1184 bool
1185 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1186 {
1187 /* This optimization relies on CMP setting the destination to 0 when
1188 * false. Early hardware only sets the least significant bit, and
1189 * leaves the other bits undefined. So we can't use it.
1190 */
1191 if (brw->gen < 6)
1192 return false;
1193
1194 ir_expression *const cmp = ir->operands[0]->as_expression();
1195
1196 if (cmp == NULL)
1197 return false;
1198
1199 switch (cmp->operation) {
1200 case ir_binop_less:
1201 case ir_binop_greater:
1202 case ir_binop_lequal:
1203 case ir_binop_gequal:
1204 case ir_binop_equal:
1205 case ir_binop_nequal:
1206 break;
1207
1208 default:
1209 return false;
1210 }
1211
1212 cmp->operands[0]->accept(this);
1213 const src_reg cmp_src0 = this->result;
1214
1215 cmp->operands[1]->accept(this);
1216 const src_reg cmp_src1 = this->result;
1217
1218 this->result = src_reg(this, ir->type);
1219
1220 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1221 brw_conditional_for_comparison(cmp->operation)));
1222
1223 /* If the comparison is false, this->result will just happen to be zero.
1224 */
1225 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1226 this->result, src_reg(1.0f));
1227 inst->predicate = BRW_PREDICATE_NORMAL;
1228 inst->predicate_inverse = true;
1229
1230 return true;
1231 }
1232
1233 void
1234 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1235 src_reg src0, src_reg src1)
1236 {
1237 vec4_instruction *inst;
1238
1239 if (brw->gen >= 6) {
1240 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1241 inst->conditional_mod = conditionalmod;
1242 } else {
1243 emit(CMP(dst, src0, src1, conditionalmod));
1244
1245 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1246 inst->predicate = BRW_PREDICATE_NORMAL;
1247 }
1248 }
1249
1250 void
1251 vec4_visitor::emit_lrp(const dst_reg &dst,
1252 const src_reg &x, const src_reg &y, const src_reg &a)
1253 {
1254 if (brw->gen >= 6) {
1255 /* Note that the instruction's argument order is reversed from GLSL
1256 * and the IR.
1257 */
1258 emit(LRP(dst,
1259 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1260 } else {
1261 /* Earlier generations don't support three source operations, so we
1262 * need to emit x*(1-a) + y*a.
1263 */
1264 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1265 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1266 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1267 y_times_a.writemask = dst.writemask;
1268 one_minus_a.writemask = dst.writemask;
1269 x_times_one_minus_a.writemask = dst.writemask;
1270
1271 emit(MUL(y_times_a, y, a));
1272 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1273 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1274 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1275 }
1276 }
1277
1278 void
1279 vec4_visitor::visit(ir_expression *ir)
1280 {
1281 unsigned int operand;
1282 src_reg op[Elements(ir->operands)];
1283 vec4_instruction *inst;
1284
1285 if (ir->operation == ir_binop_add) {
1286 if (try_emit_mad(ir))
1287 return;
1288 }
1289
1290 if (ir->operation == ir_unop_b2f) {
1291 if (try_emit_b2f_of_compare(ir))
1292 return;
1293 }
1294
1295 /* Storage for our result. Ideally for an assignment we'd be using
1296 * the actual storage for the result here, instead.
1297 */
1298 dst_reg result_dst(this, ir->type);
1299 src_reg result_src(result_dst);
1300
1301 if (ir->operation == ir_triop_csel) {
1302 ir->operands[1]->accept(this);
1303 op[1] = this->result;
1304 ir->operands[2]->accept(this);
1305 op[2] = this->result;
1306
1307 enum brw_predicate predicate;
1308 emit_bool_to_cond_code(ir->operands[0], &predicate);
1309 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1310 inst->predicate = predicate;
1311 this->result = result_src;
1312 return;
1313 }
1314
1315 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1316 this->result.file = BAD_FILE;
1317 ir->operands[operand]->accept(this);
1318 if (this->result.file == BAD_FILE) {
1319 fprintf(stderr, "Failed to get tree for expression operand:\n");
1320 ir->operands[operand]->fprint(stderr);
1321 exit(1);
1322 }
1323 op[operand] = this->result;
1324
1325 /* Matrix expression operands should have been broken down to vector
1326 * operations already.
1327 */
1328 assert(!ir->operands[operand]->type->is_matrix());
1329 }
1330
1331 /* If nothing special happens, this is the result. */
1332 this->result = result_src;
1333
1334 switch (ir->operation) {
1335 case ir_unop_logic_not:
1336 emit(NOT(result_dst, op[0]));
1337 break;
1338 case ir_unop_neg:
1339 op[0].negate = !op[0].negate;
1340 emit(MOV(result_dst, op[0]));
1341 break;
1342 case ir_unop_abs:
1343 op[0].abs = true;
1344 op[0].negate = false;
1345 emit(MOV(result_dst, op[0]));
1346 break;
1347
1348 case ir_unop_sign:
1349 if (ir->type->is_float()) {
1350 /* AND(val, 0x80000000) gives the sign bit.
1351 *
1352 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1353 * zero.
1354 */
1355 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1356
1357 op[0].type = BRW_REGISTER_TYPE_UD;
1358 result_dst.type = BRW_REGISTER_TYPE_UD;
1359 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1360
1361 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1362 inst->predicate = BRW_PREDICATE_NORMAL;
1363
1364 this->result.type = BRW_REGISTER_TYPE_F;
1365 } else {
1366 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1367 * -> non-negative val generates 0x00000000.
1368 * Predicated OR sets 1 if val is positive.
1369 */
1370 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1371
1372 emit(ASR(result_dst, op[0], src_reg(31)));
1373
1374 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1375 inst->predicate = BRW_PREDICATE_NORMAL;
1376 }
1377 break;
1378
1379 case ir_unop_rcp:
1380 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1381 break;
1382
1383 case ir_unop_exp2:
1384 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1385 break;
1386 case ir_unop_log2:
1387 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1388 break;
1389 case ir_unop_exp:
1390 case ir_unop_log:
1391 unreachable("not reached: should be handled by ir_explog_to_explog2");
1392 case ir_unop_sin:
1393 case ir_unop_sin_reduced:
1394 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1395 break;
1396 case ir_unop_cos:
1397 case ir_unop_cos_reduced:
1398 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1399 break;
1400
1401 case ir_unop_dFdx:
1402 case ir_unop_dFdx_coarse:
1403 case ir_unop_dFdx_fine:
1404 case ir_unop_dFdy:
1405 case ir_unop_dFdy_coarse:
1406 case ir_unop_dFdy_fine:
1407 unreachable("derivatives not valid in vertex shader");
1408
1409 case ir_unop_bitfield_reverse:
1410 emit(BFREV(result_dst, op[0]));
1411 break;
1412 case ir_unop_bit_count:
1413 emit(CBIT(result_dst, op[0]));
1414 break;
1415 case ir_unop_find_msb: {
1416 src_reg temp = src_reg(this, glsl_type::uint_type);
1417
1418 inst = emit(FBH(dst_reg(temp), op[0]));
1419 inst->dst.writemask = WRITEMASK_XYZW;
1420
1421 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1422 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1423 * subtract the result from 31 to convert the MSB count into an LSB count.
1424 */
1425
1426 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1427 temp.swizzle = BRW_SWIZZLE_NOOP;
1428 emit(MOV(result_dst, temp));
1429
1430 src_reg src_tmp = src_reg(result_dst);
1431 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1432
1433 src_tmp.negate = true;
1434 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1435 inst->predicate = BRW_PREDICATE_NORMAL;
1436 break;
1437 }
1438 case ir_unop_find_lsb:
1439 emit(FBL(result_dst, op[0]));
1440 break;
1441 case ir_unop_saturate:
1442 inst = emit(MOV(result_dst, op[0]));
1443 inst->saturate = true;
1444 break;
1445
1446 case ir_unop_noise:
1447 unreachable("not reached: should be handled by lower_noise");
1448
1449 case ir_binop_add:
1450 emit(ADD(result_dst, op[0], op[1]));
1451 break;
1452 case ir_binop_sub:
1453 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1454
1455 case ir_binop_mul:
1456 if (brw->gen < 8 && ir->type->is_integer()) {
1457 /* For integer multiplication, the MUL uses the low 16 bits of one of
1458 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1459 * accumulates in the contribution of the upper 16 bits of that
1460 * operand. If we can determine that one of the args is in the low
1461 * 16 bits, though, we can just emit a single MUL.
1462 */
1463 if (ir->operands[0]->is_uint16_constant()) {
1464 if (brw->gen < 7)
1465 emit(MUL(result_dst, op[0], op[1]));
1466 else
1467 emit(MUL(result_dst, op[1], op[0]));
1468 } else if (ir->operands[1]->is_uint16_constant()) {
1469 if (brw->gen < 7)
1470 emit(MUL(result_dst, op[1], op[0]));
1471 else
1472 emit(MUL(result_dst, op[0], op[1]));
1473 } else {
1474 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1475
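            /* Neither operand is known to fit in 16 bits, so do the full
             * 32x32 multiply: MUL and MACH together compute the product,
             * and the low 32 bits are read back from the accumulator.
             */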
1476 emit(MUL(acc, op[0], op[1]));
1477 emit(MACH(dst_null_d(), op[0], op[1]));
1478 emit(MOV(result_dst, src_reg(acc)));
1479 }
1480 } else {
1481 emit(MUL(result_dst, op[0], op[1]));
1482 }
1483 break;
1484 case ir_binop_imul_high: {
1485 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1486
1487 emit(MUL(acc, op[0], op[1]));
1488 emit(MACH(result_dst, op[0], op[1]));
1489 break;
1490 }
1491 case ir_binop_div:
1492 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1493 assert(ir->type->is_integer());
1494 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1495 break;
1496 case ir_binop_carry: {
1497 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1498
1499 emit(ADDC(dst_null_ud(), op[0], op[1]));
1500 emit(MOV(result_dst, src_reg(acc)));
1501 break;
1502 }
1503 case ir_binop_borrow: {
1504 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1505
1506 emit(SUBB(dst_null_ud(), op[0], op[1]));
1507 emit(MOV(result_dst, src_reg(acc)));
1508 break;
1509 }
1510 case ir_binop_mod:
1511 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1512 assert(ir->type->is_integer());
1513 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1514 break;
1515
1516 case ir_binop_less:
1517 case ir_binop_greater:
1518 case ir_binop_lequal:
1519 case ir_binop_gequal:
1520 case ir_binop_equal:
1521 case ir_binop_nequal: {
1522 if (brw->gen <= 5) {
1523 resolve_bool_comparison(ir->operands[0], &op[0]);
1524 resolve_bool_comparison(ir->operands[1], &op[1]);
1525 }
1526 emit(CMP(result_dst, op[0], op[1],
1527 brw_conditional_for_comparison(ir->operation)));
1528 break;
1529 }
1530
1531 case ir_binop_all_equal:
1532 /* "==" operator producing a scalar boolean. */
1533 if (ir->operands[0]->type->is_vector() ||
1534 ir->operands[1]->type->is_vector()) {
1535 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1536 emit(MOV(result_dst, src_reg(0)));
1537 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1538 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1539 } else {
1540 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1541 }
1542 break;
1543 case ir_binop_any_nequal:
1544 /* "!=" operator producing a scalar boolean. */
1545 if (ir->operands[0]->type->is_vector() ||
1546 ir->operands[1]->type->is_vector()) {
1547 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1548
1549 emit(MOV(result_dst, src_reg(0)));
1550 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1551 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1552 } else {
1553 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1554 }
1555 break;
1556
1557 case ir_unop_any:
1558 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1559 emit(MOV(result_dst, src_reg(0)));
1560
1561 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1562 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1563 break;
1564
1565 case ir_binop_logic_xor:
1566 emit(XOR(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_logic_or:
1570 emit(OR(result_dst, op[0], op[1]));
1571 break;
1572
1573 case ir_binop_logic_and:
1574 emit(AND(result_dst, op[0], op[1]));
1575 break;
1576
1577 case ir_binop_dot:
1578 assert(ir->operands[0]->type->is_vector());
1579 assert(ir->operands[0]->type == ir->operands[1]->type);
1580 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1581 break;
1582
1583 case ir_unop_sqrt:
1584 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1585 break;
1586 case ir_unop_rsq:
1587 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1588 break;
1589
1590 case ir_unop_bitcast_i2f:
1591 case ir_unop_bitcast_u2f:
1592 this->result = op[0];
1593 this->result.type = BRW_REGISTER_TYPE_F;
1594 break;
1595
1596 case ir_unop_bitcast_f2i:
1597 this->result = op[0];
1598 this->result.type = BRW_REGISTER_TYPE_D;
1599 break;
1600
1601 case ir_unop_bitcast_f2u:
1602 this->result = op[0];
1603 this->result.type = BRW_REGISTER_TYPE_UD;
1604 break;
1605
1606 case ir_unop_i2f:
1607 case ir_unop_i2u:
1608 case ir_unop_u2i:
1609 case ir_unop_u2f:
1610 case ir_unop_f2i:
1611 case ir_unop_f2u:
1612 emit(MOV(result_dst, op[0]));
1613 break;
1614 case ir_unop_b2i:
1615 emit(AND(result_dst, op[0], src_reg(1)));
1616 break;
1617 case ir_unop_b2f:
1618 if (brw->gen <= 5) {
1619 resolve_bool_comparison(ir->operands[0], &op[0]);
1620 }
1621 op[0].type = BRW_REGISTER_TYPE_D;
1622 result_dst.type = BRW_REGISTER_TYPE_D;
1623 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1624 result_dst.type = BRW_REGISTER_TYPE_F;
1625 break;
1626 case ir_unop_f2b:
1627 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1628 break;
1629 case ir_unop_i2b:
1630 emit(AND(result_dst, op[0], src_reg(1)));
1631 break;
1632
1633 case ir_unop_trunc:
1634 emit(RNDZ(result_dst, op[0]));
1635 break;
1636 case ir_unop_ceil: {
1637 src_reg tmp = src_reg(this, ir->type);
1638 op[0].negate = !op[0].negate;
1639 emit(RNDD(dst_reg(tmp), op[0]));
1640 tmp.negate = true;
1641 emit(MOV(result_dst, tmp));
1642 }
1643 break;
1644 case ir_unop_floor:
1645 inst = emit(RNDD(result_dst, op[0]));
1646 break;
1647 case ir_unop_fract:
1648 inst = emit(FRC(result_dst, op[0]));
1649 break;
1650 case ir_unop_round_even:
1651 emit(RNDE(result_dst, op[0]));
1652 break;
1653
1654 case ir_binop_min:
1655 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1656 break;
1657 case ir_binop_max:
1658 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1659 break;
1660
1661 case ir_binop_pow:
1662 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1663 break;
1664
1665 case ir_unop_bit_not:
1666 inst = emit(NOT(result_dst, op[0]));
1667 break;
1668 case ir_binop_bit_and:
1669 inst = emit(AND(result_dst, op[0], op[1]));
1670 break;
1671 case ir_binop_bit_xor:
1672 inst = emit(XOR(result_dst, op[0], op[1]));
1673 break;
1674 case ir_binop_bit_or:
1675 inst = emit(OR(result_dst, op[0], op[1]));
1676 break;
1677
1678 case ir_binop_lshift:
1679 inst = emit(SHL(result_dst, op[0], op[1]));
1680 break;
1681
1682 case ir_binop_rshift:
1683 if (ir->type->base_type == GLSL_TYPE_INT)
1684 inst = emit(ASR(result_dst, op[0], op[1]));
1685 else
1686 inst = emit(SHR(result_dst, op[0], op[1]));
1687 break;
1688
1689 case ir_binop_bfm:
1690 emit(BFI1(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_ubo_load: {
1694 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1695 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1696 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1697 src_reg offset;
1698
1699 /* Now, load the vector from that offset. */
1700 assert(ir->type->is_vector() || ir->type->is_scalar());
1701
1702 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1703 packed_consts.type = result.type;
1704 src_reg surf_index;
1705
1706 if (const_uniform_block) {
1707 /* The block index is a constant, so just emit the binding table entry
1708 * as an immediate.
1709 */
1710 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1711 const_uniform_block->value.u[0]);
1712 } else {
1713 /* The block index is not a constant. Evaluate the index expression
1714 * per-channel and add the base UBO index; the generator will select
1715 * a value from any live channel.
1716 */
1717 surf_index = src_reg(this, glsl_type::uint_type);
1718 emit(ADD(dst_reg(surf_index), op[0],
1719 src_reg(prog_data->base.binding_table.ubo_start)));
1720
1721 /* Assume this may touch any UBO. It would be nice to provide
1722 * a tighter bound, but the array information is already lowered away.
1723 */
1724 brw_mark_surface_used(&prog_data->base,
1725 prog_data->base.binding_table.ubo_start +
1726 shader_prog->NumUniformBlocks - 1);
1727 }
1728
1729 if (const_offset_ir) {
1730 if (brw->gen >= 8) {
1731 /* Store the offset in a GRF so we can send-from-GRF. */
1732 offset = src_reg(this, glsl_type::int_type);
1733 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1734 } else {
1735 /* Immediates are fine on older generations since they'll be moved
1736 * to a (potentially fake) MRF at the generator level.
1737 */
1738 offset = src_reg(const_offset / 16);
1739 }
1740 } else {
1741 offset = src_reg(this, glsl_type::uint_type);
1742 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1743 }
1744
1745 if (brw->gen >= 7) {
1746 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1747 grf_offset.type = offset.type;
1748
1749 emit(MOV(grf_offset, offset));
1750
1751 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 src_reg(grf_offset)));
1755 } else {
1756 vec4_instruction *pull =
1757 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1758 dst_reg(packed_consts),
1759 surf_index,
1760 offset));
1761 pull->base_mrf = 14;
1762 pull->mlen = 1;
1763 }
1764
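      /* The pull constant load fetches an aligned 16-byte vec4, so shift
       * the swizzle by the dword offset of const_offset within that vec4.
       */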
1765 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1766 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1767 const_offset % 16 / 4,
1768 const_offset % 16 / 4,
1769 const_offset % 16 / 4);
1770
1771 /* UBO bools are any nonzero int. We need to convert them to use the
1772 * value of true stored in ctx->Const.UniformBooleanTrue.
1773 */
1774 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1775 emit(CMP(result_dst, packed_consts, src_reg(0u),
1776 BRW_CONDITIONAL_NZ));
1777 } else {
1778 emit(MOV(result_dst, packed_consts));
1779 }
1780 break;
1781 }
1782
1783 case ir_binop_vector_extract:
1784 unreachable("should have been lowered by vec_index_to_cond_assign");
1785
1786 case ir_triop_fma:
1787 op[0] = fix_3src_operand(op[0]);
1788 op[1] = fix_3src_operand(op[1]);
1789 op[2] = fix_3src_operand(op[2]);
1790 /* Note that the instruction's argument order is reversed from GLSL
1791 * and the IR.
1792 */
1793 emit(MAD(result_dst, op[2], op[1], op[0]));
1794 break;
1795
1796 case ir_triop_lrp:
1797 emit_lrp(result_dst, op[0], op[1], op[2]);
1798 break;
1799
1800 case ir_triop_csel:
1801 unreachable("already handled above");
1802 break;
1803
1804 case ir_triop_bfi:
1805 op[0] = fix_3src_operand(op[0]);
1806 op[1] = fix_3src_operand(op[1]);
1807 op[2] = fix_3src_operand(op[2]);
1808 emit(BFI2(result_dst, op[0], op[1], op[2]));
1809 break;
1810
1811 case ir_triop_bitfield_extract:
1812 op[0] = fix_3src_operand(op[0]);
1813 op[1] = fix_3src_operand(op[1]);
1814 op[2] = fix_3src_operand(op[2]);
1815 /* Note that the instruction's argument order is reversed from GLSL
1816 * and the IR.
1817 */
1818 emit(BFE(result_dst, op[2], op[1], op[0]));
1819 break;
1820
1821 case ir_triop_vector_insert:
1822 unreachable("should have been lowered by lower_vector_insert");
1823
1824 case ir_quadop_bitfield_insert:
1825 unreachable("not reached: should be handled by "
1826 "bitfield_insert_to_bfm_bfi\n");
1827
1828 case ir_quadop_vector:
1829 unreachable("not reached: should be handled by lower_quadop_vector");
1830
1831 case ir_unop_pack_half_2x16:
1832 emit_pack_half_2x16(result_dst, op[0]);
1833 break;
1834 case ir_unop_unpack_half_2x16:
1835 emit_unpack_half_2x16(result_dst, op[0]);
1836 break;
1837 case ir_unop_unpack_unorm_4x8:
1838 emit_unpack_unorm_4x8(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_snorm_4x8:
1841 emit_unpack_snorm_4x8(result_dst, op[0]);
1842 break;
1843 case ir_unop_pack_unorm_4x8:
1844 emit_pack_unorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_pack_snorm_4x8:
1847 emit_pack_snorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_snorm_2x16:
1850 case ir_unop_pack_unorm_2x16:
1851 case ir_unop_unpack_snorm_2x16:
1852 case ir_unop_unpack_unorm_2x16:
1853 unreachable("not reached: should be handled by lower_packing_builtins");
1854 case ir_unop_unpack_half_2x16_split_x:
1855 case ir_unop_unpack_half_2x16_split_y:
1856 case ir_binop_pack_half_2x16_split:
1857 case ir_unop_interpolate_at_centroid:
1858 case ir_binop_interpolate_at_sample:
1859 case ir_binop_interpolate_at_offset:
1860 unreachable("not reached: should not occur in vertex shader");
1861 case ir_binop_ldexp:
1862 unreachable("not reached: should be handled by ldexp_to_arith()");
1863 }
1864 }
1865
1866
1867 void
1868 vec4_visitor::visit(ir_swizzle *ir)
1869 {
1870 src_reg src;
1871 int i = 0;
1872 int swizzle[4];
1873
1874 /* Note that this is only swizzles in expressions, not those on the left
1875 * hand side of an assignment, which do write masking. See ir_assignment
1876 * for that.
1877 */
1878
1879 ir->val->accept(this);
1880 src = this->result;
1881 assert(src.file != BAD_FILE);
1882
1883 for (i = 0; i < ir->type->vector_elements; i++) {
1884 switch (i) {
1885 case 0:
1886 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1887 break;
1888 case 1:
1889 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1890 break;
1891 case 2:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1893 break;
1894 case 3:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1896 break;
1897 }
1898 }
1899 for (; i < 4; i++) {
1900 /* Replicate the last channel out. */
1901 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1902 }
1903
1904 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1905
1906 this->result = src;
1907 }
1908
1909 void
1910 vec4_visitor::visit(ir_dereference_variable *ir)
1911 {
1912 const struct glsl_type *type = ir->type;
1913 dst_reg *reg = variable_storage(ir->var);
1914
1915 if (!reg) {
1916 fail("Failed to find variable storage for %s\n", ir->var->name);
1917 this->result = src_reg(brw_null_reg());
1918 return;
1919 }
1920
1921 this->result = src_reg(*reg);
1922
1923 /* System values get their swizzle from the dst_reg writemask */
1924 if (ir->var->data.mode == ir_var_system_value)
1925 return;
1926
1927 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1928 this->result.swizzle = swizzle_for_size(type->vector_elements);
1929 }
1930
1931
1932 int
1933 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1934 {
1935 /* Under normal circumstances array elements are stored consecutively, so
1936 * the stride is equal to the size of the array element.
1937 */
1938 return type_size(ir->type);
1939 }
1940
1941
1942 void
1943 vec4_visitor::visit(ir_dereference_array *ir)
1944 {
1945 ir_constant *constant_index;
1946 src_reg src;
1947 int array_stride = compute_array_stride(ir);
1948
1949 constant_index = ir->array_index->constant_expression_value();
1950
1951 ir->array->accept(this);
1952 src = this->result;
1953
1954 if (constant_index) {
1955 src.reg_offset += constant_index->value.i[0] * array_stride;
1956 } else {
1957 /* Variable index array dereference. It eats the "vec4" of the
1958 * base of the array and an index that offsets the Mesa register
1959 * index.
1960 */
1961 ir->array_index->accept(this);
1962
1963 src_reg index_reg;
1964
1965 if (array_stride == 1) {
1966 index_reg = this->result;
1967 } else {
1968 index_reg = src_reg(this, glsl_type::int_type);
1969
1970 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1971 }
1972
1973 if (src.reladdr) {
1974 src_reg temp = src_reg(this, glsl_type::int_type);
1975
1976 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1977
1978 index_reg = temp;
1979 }
1980
1981 src.reladdr = ralloc(mem_ctx, src_reg);
1982 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1983 }
1984
1985 /* If the type is smaller than a vec4, replicate the last channel out. */
1986 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1987 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1988 else
1989 src.swizzle = BRW_SWIZZLE_NOOP;
1990 src.type = brw_type_for_base_type(ir->type);
1991
1992 this->result = src;
1993 }
1994
1995 void
1996 vec4_visitor::visit(ir_dereference_record *ir)
1997 {
1998 unsigned int i;
1999 const glsl_type *struct_type = ir->record->type;
2000 int offset = 0;
2001
2002 ir->record->accept(this);
2003
2004 for (i = 0; i < struct_type->length; i++) {
2005 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2006 break;
2007 offset += type_size(struct_type->fields.structure[i].type);
2008 }
2009
2010 /* If the type is smaller than a vec4, replicate the last channel out. */
2011 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2012 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2013 else
2014 this->result.swizzle = BRW_SWIZZLE_NOOP;
2015 this->result.type = brw_type_for_base_type(ir->type);
2016
2017 this->result.reg_offset += offset;
2018 }
2019
2020 /**
2021 * We want to be careful in assignment setup to hit the actual storage
2022 * instead of potentially using a temporary like we might with the
2023 * ir_dereference handler.
2024 */
2025 static dst_reg
2026 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2027 {
2028 /* The LHS must be a dereference. If the LHS is a variable indexed array
2029 * access of a vector, it must be separated into a series of conditional moves
2030 * before reaching this point (see ir_vec_index_to_cond_assign).
2031 */
2032 assert(ir->as_dereference());
2033 ir_dereference_array *deref_array = ir->as_dereference_array();
2034 if (deref_array) {
2035 assert(!deref_array->array->type->is_vector());
2036 }
2037
2038 /* Use the rvalue deref handler for the most part. We'll ignore
2039 * swizzles in it and write swizzles using writemask, though.
2040 */
2041 ir->accept(v);
2042 return dst_reg(v->result);
2043 }
2044
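/**
 * Copy a whole aggregate value (struct, array, or matrix), one vec4 at a
 * time, applying the given predicate to every MOV.  dst and src are
 * advanced as the copy proceeds.
 */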
2045 void
2046 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2047 const struct glsl_type *type,
2048 enum brw_predicate predicate)
2049 {
2050 if (type->base_type == GLSL_TYPE_STRUCT) {
2051 for (unsigned int i = 0; i < type->length; i++) {
2052 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2053 }
2054 return;
2055 }
2056
2057 if (type->is_array()) {
2058 for (unsigned int i = 0; i < type->length; i++) {
2059 emit_block_move(dst, src, type->fields.array, predicate);
2060 }
2061 return;
2062 }
2063
2064 if (type->is_matrix()) {
2065 const struct glsl_type *vec_type;
2066
2067 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2068 type->vector_elements, 1);
2069
2070 for (int i = 0; i < type->matrix_columns; i++) {
2071 emit_block_move(dst, src, vec_type, predicate);
2072 }
2073 return;
2074 }
2075
2076 assert(type->is_scalar() || type->is_vector());
2077
2078 dst->type = brw_type_for_base_type(type);
2079 src->type = dst->type;
2080
2081 dst->writemask = (1 << type->vector_elements) - 1;
2082
2083 src->swizzle = swizzle_for_size(type->vector_elements);
2084
2085 vec4_instruction *inst = emit(MOV(*dst, *src));
2086 inst->predicate = predicate;
2087
2088 dst->reg_offset++;
2089 src->reg_offset++;
2090 }
2091
2092
2093 /* If the RHS processing resulted in an instruction generating a
2094 * temporary value, and it would be easy to rewrite the instruction to
2095 * generate its result right into the LHS instead, do so. This ends
2096 * up reliably removing instructions where it can be tricky to do so
2097 * later without real UD chain information.
2098 */
2099 bool
2100 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2101 dst_reg dst,
2102 src_reg src,
2103 vec4_instruction *pre_rhs_inst,
2104 vec4_instruction *last_rhs_inst)
2105 {
2106 /* This could be supported, but it would take more smarts. */
2107 if (ir->condition)
2108 return false;
2109
2110 if (pre_rhs_inst == last_rhs_inst)
2111 return false; /* No instructions generated to work with. */
2112
2113 /* Make sure the last instruction generated our source reg. */
2114 if (src.file != GRF ||
2115 src.file != last_rhs_inst->dst.file ||
2116 src.reg != last_rhs_inst->dst.reg ||
2117 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2118 src.reladdr ||
2119 src.abs ||
2120 src.negate ||
2121 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2122 return false;
2123
2124 /* Check that the last instruction fully initialized the channels
2125 * we want to use, in the order we want to use them. We could
2126 * potentially reswizzle the operands of many instructions so that
2127 * we could handle out of order channels, but don't yet.
2128 */
2129
2130 for (unsigned i = 0; i < 4; i++) {
2131 if (dst.writemask & (1 << i)) {
2132 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2133 return false;
2134
2135 if (BRW_GET_SWZ(src.swizzle, i) != i)
2136 return false;
2137 }
2138 }
2139
2140 /* Success! Rewrite the instruction. */
2141 last_rhs_inst->dst.file = dst.file;
2142 last_rhs_inst->dst.reg = dst.reg;
2143 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2144 last_rhs_inst->dst.reladdr = dst.reladdr;
2145 last_rhs_inst->dst.writemask &= dst.writemask;
2146
2147 return true;
2148 }
2149
2150 void
2151 vec4_visitor::visit(ir_assignment *ir)
2152 {
2153 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2154 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2155
2156 if (!ir->lhs->type->is_scalar() &&
2157 !ir->lhs->type->is_vector()) {
2158 ir->rhs->accept(this);
2159 src_reg src = this->result;
2160
2161 if (ir->condition) {
2162 emit_bool_to_cond_code(ir->condition, &predicate);
2163 }
2164
2165 /* emit_block_move doesn't account for swizzles in the source register.
2166 * This should be ok, since the source register is a structure or an
2167 * array, and those can't be swizzled. But double-check to be sure.
2168 */
2169 assert(src.swizzle ==
2170 (ir->rhs->type->is_matrix()
2171 ? swizzle_for_size(ir->rhs->type->vector_elements)
2172 : BRW_SWIZZLE_NOOP));
2173
2174 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2175 return;
2176 }
2177
2178 /* Now we're down to just a scalar/vector with writemasks. */
2179 int i;
2180
2181 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2182 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2183
2184 ir->rhs->accept(this);
2185
2186 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2187
2188 src_reg src = this->result;
2189
2190 int swizzles[4];
2191 int first_enabled_chan = 0;
2192 int src_chan = 0;
2193
2194 assert(ir->lhs->type->is_vector() ||
2195 ir->lhs->type->is_scalar());
2196 dst.writemask = ir->write_mask;
2197
2198 for (int i = 0; i < 4; i++) {
2199 if (dst.writemask & (1 << i)) {
2200 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2201 break;
2202 }
2203 }
2204
2205 /* Swizzle a small RHS vector into the channels being written.
2206 *
2207 * GLSL IR treats write_mask as dictating how many channels are
2208 * present on the RHS, while in our instructions we need to make
2209 * those channels appear in the slots of the vec4 they're written to.
2210 */
2211 for (int i = 0; i < 4; i++) {
2212 if (dst.writemask & (1 << i))
2213 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2214 else
2215 swizzles[i] = first_enabled_chan;
2216 }
2217 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2218 swizzles[2], swizzles[3]);
2219
2220 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2221 return;
2222 }
2223
2224 if (ir->condition) {
2225 emit_bool_to_cond_code(ir->condition, &predicate);
2226 }
2227
2228 for (i = 0; i < type_size(ir->lhs->type); i++) {
2229 vec4_instruction *inst = emit(MOV(dst, src));
2230 inst->predicate = predicate;
2231
2232 dst.reg_offset++;
2233 src.reg_offset++;
2234 }
2235 }
2236
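/**
 * Emit the immediate MOVs that write an ir_constant's value into *dst,
 * recursing through structs, arrays, and matrices.  Components with
 * identical values are coalesced into a single writemasked MOV.
 */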
2237 void
2238 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2239 {
2240 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2241 foreach_in_list(ir_constant, field_value, &ir->components) {
2242 emit_constant_values(dst, field_value);
2243 }
2244 return;
2245 }
2246
2247 if (ir->type->is_array()) {
2248 for (unsigned int i = 0; i < ir->type->length; i++) {
2249 emit_constant_values(dst, ir->array_elements[i]);
2250 }
2251 return;
2252 }
2253
2254 if (ir->type->is_matrix()) {
2255 for (int i = 0; i < ir->type->matrix_columns; i++) {
2256 float *vec = &ir->value.f[i * ir->type->vector_elements];
2257
2258 for (int j = 0; j < ir->type->vector_elements; j++) {
2259 dst->writemask = 1 << j;
2260 dst->type = BRW_REGISTER_TYPE_F;
2261
2262 emit(MOV(*dst, src_reg(vec[j])));
2263 }
2264 dst->reg_offset++;
2265 }
2266 return;
2267 }
2268
2269 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2270
2271 for (int i = 0; i < ir->type->vector_elements; i++) {
2272 if (!(remaining_writemask & (1 << i)))
2273 continue;
2274
2275 dst->writemask = 1 << i;
2276 dst->type = brw_type_for_base_type(ir->type);
2277
2278 /* Find other components that match the one we're about to
2279 * write. Emits fewer instructions for things like vec4(0.5,
2280 * 1.5, 1.5, 1.5).
2281 */
2282 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2283 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2284 if (ir->value.b[i] == ir->value.b[j])
2285 dst->writemask |= (1 << j);
2286 } else {
2287 /* u, i, and f storage all line up, so no need for a
2288 * switch case for comparing each type.
2289 */
2290 if (ir->value.u[i] == ir->value.u[j])
2291 dst->writemask |= (1 << j);
2292 }
2293 }
2294
2295 switch (ir->type->base_type) {
2296 case GLSL_TYPE_FLOAT:
2297 emit(MOV(*dst, src_reg(ir->value.f[i])));
2298 break;
2299 case GLSL_TYPE_INT:
2300 emit(MOV(*dst, src_reg(ir->value.i[i])));
2301 break;
2302 case GLSL_TYPE_UINT:
2303 emit(MOV(*dst, src_reg(ir->value.u[i])));
2304 break;
2305 case GLSL_TYPE_BOOL:
2306 emit(MOV(*dst,
2307 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2308 : 0)));
2309 break;
2310 default:
2311 unreachable("Non-float/uint/int/bool constant");
2312 }
2313
2314 remaining_writemask &= ~dst->writemask;
2315 }
2316 dst->reg_offset++;
2317 }
2318
2319 void
2320 vec4_visitor::visit(ir_constant *ir)
2321 {
2322 dst_reg dst = dst_reg(this, ir->type);
2323 this->result = src_reg(dst);
2324
2325 emit_constant_values(&dst, ir);
2326 }
2327
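/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read,
 * _increment, and _predecrement) to untyped surface reads and untyped
 * atomics on the counter buffer's binding table entry.
 */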
2328 void
2329 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2330 {
2331 ir_dereference *deref = static_cast<ir_dereference *>(
2332 ir->actual_parameters.get_head());
2333 ir_variable *location = deref->variable_referenced();
2334 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2335 location->data.binding);
2336
2337 /* Calculate the surface offset */
2338 src_reg offset(this, glsl_type::uint_type);
2339 ir_dereference_array *deref_array = deref->as_dereference_array();
2340 if (deref_array) {
2341 deref_array->array_index->accept(this);
2342
2343 src_reg tmp(this, glsl_type::uint_type);
2344 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2345 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2346 } else {
2347 offset = location->data.atomic.offset;
2348 }
2349
2350 /* Emit the appropriate machine instruction */
2351 const char *callee = ir->callee->function_name();
2352 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2353
2354 if (!strcmp("__intrinsic_atomic_read", callee)) {
2355 emit_untyped_surface_read(surf_index, dst, offset);
2356
2357 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2358 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2359 src_reg(), src_reg());
2360
2361 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2362 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2363 src_reg(), src_reg());
2364 }
2365 }
2366
2367 void
2368 vec4_visitor::visit(ir_call *ir)
2369 {
2370 const char *callee = ir->callee->function_name();
2371
2372 if (!strcmp("__intrinsic_atomic_read", callee) ||
2373 !strcmp("__intrinsic_atomic_increment", callee) ||
2374 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2375 visit_atomic_counter_intrinsic(ir);
2376 } else {
2377 unreachable("Unsupported intrinsic.");
2378 }
2379 }
2380
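/**
 * Emit a TXF_MCS message to fetch the multisample control surface (MCS)
 * data for a texel.  The result is passed along as an extra parameter of
 * the subsequent ir_txf_ms fetch on compressed multisample surfaces.
 */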
2381 src_reg
2382 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2383 {
2384 vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS);
2385 inst->base_mrf = 2;
2386 inst->mlen = 1;
2387 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2388 inst->dst.writemask = WRITEMASK_XYZW;
2389
2390 inst->src[1] = sampler;
2391
2392 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2393 int param_base = inst->base_mrf;
2394 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2395 int zero_mask = 0xf & ~coord_mask;
2396
2397 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2398 coordinate));
2399
2400 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2401 src_reg(0)));
2402
2403 emit(inst);
2404 return src_reg(inst->dst);
2405 }
2406
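/* Returns whether the sampler index has to be relayed through the message
 * header: either it is not a compile-time constant or it does not fit in
 * the descriptor's 4-bit sampler field (>= 16).  Only Haswell and Gen8+
 * can use such samplers.
 */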
2407 static bool
2408 is_high_sampler(struct brw_context *brw, src_reg sampler)
2409 {
2410 if (brw->gen < 8 && !brw->is_haswell)
2411 return false;
2412
2413 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2414 }
2415
2416 void
2417 vec4_visitor::visit(ir_texture *ir)
2418 {
2419 uint32_t sampler =
2420 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2421
2422 ir_rvalue *nonconst_sampler_index =
2423 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2424
2425 /* Handle non-constant sampler array indexing */
2426 src_reg sampler_reg;
2427 if (nonconst_sampler_index) {
2428 /* The highest sampler which may be used by this operation is
2429 * the last element of the array. Mark it here, because the generator
2430 * doesn't have enough information to determine the bound.
2431 */
2432 uint32_t array_size = ir->sampler->as_dereference_array()
2433 ->array->type->array_size();
2434
2435 uint32_t max_used = sampler + array_size - 1;
2436 if (ir->op == ir_tg4 && brw->gen < 8) {
2437 max_used += prog_data->base.binding_table.gather_texture_start;
2438 } else {
2439 max_used += prog_data->base.binding_table.texture_start;
2440 }
2441
2442 brw_mark_surface_used(&prog_data->base, max_used);
2443
2444 /* Emit code to evaluate the actual indexing expression */
2445 nonconst_sampler_index->accept(this);
2446 dst_reg temp(this, glsl_type::uint_type);
2447 emit(ADD(temp, this->result, src_reg(sampler)))
2448 ->force_writemask_all = true;
2449 sampler_reg = src_reg(temp);
2450 } else {
2451 /* Single sampler, or constant array index; the indexing expression
2452 * is just an immediate.
2453 */
2454 sampler_reg = src_reg(sampler);
2455 }
2456
2457 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2458 * emitting anything other than setting up the constant result.
2459 */
2460 if (ir->op == ir_tg4) {
2461 ir_constant *chan = ir->lod_info.component->as_constant();
2462 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2463 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2464 dst_reg result(this, ir->type);
2465 this->result = src_reg(result);
2466 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2467 return;
2468 }
2469 }
2470
2471 /* Should be lowered by do_lower_texture_projection */
2472 assert(!ir->projector);
2473
2474 /* Should be lowered */
2475 assert(!ir->offset || !ir->offset->type->is_array());
2476
2477 /* Generate code to compute all the subexpression trees. This has to be
2478 * done before loading any values into MRFs for the sampler message since
2479 * generating these values may involve SEND messages that need the MRFs.
2480 */
2481 src_reg coordinate;
2482 if (ir->coordinate) {
2483 ir->coordinate->accept(this);
2484 coordinate = this->result;
2485 }
2486
2487 src_reg shadow_comparitor;
2488 if (ir->shadow_comparitor) {
2489 ir->shadow_comparitor->accept(this);
2490 shadow_comparitor = this->result;
2491 }
2492
2493 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2494 src_reg offset_value;
2495 if (has_nonconstant_offset) {
2496 ir->offset->accept(this);
2497 offset_value = src_reg(this->result);
2498 }
2499
2500 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2501 src_reg lod, dPdx, dPdy, sample_index, mcs;
2502 switch (ir->op) {
2503 case ir_tex:
2504 lod = src_reg(0.0f);
2505 lod_type = glsl_type::float_type;
2506 break;
2507 case ir_txf:
2508 case ir_txl:
2509 case ir_txs:
2510 ir->lod_info.lod->accept(this);
2511 lod = this->result;
2512 lod_type = ir->lod_info.lod->type;
2513 break;
2514 case ir_query_levels:
2515 lod = src_reg(0);
2516 lod_type = glsl_type::int_type;
2517 break;
2518 case ir_txf_ms:
2519 ir->lod_info.sample_index->accept(this);
2520 sample_index = this->result;
2521 sample_index_type = ir->lod_info.sample_index->type;
2522
2523 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2524 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2525 else
2526 mcs = src_reg(0u);
2527 break;
2528 case ir_txd:
2529 ir->lod_info.grad.dPdx->accept(this);
2530 dPdx = this->result;
2531
2532 ir->lod_info.grad.dPdy->accept(this);
2533 dPdy = this->result;
2534
2535 lod_type = ir->lod_info.grad.dPdx->type;
2536 break;
2537 case ir_txb:
2538 case ir_lod:
2539 case ir_tg4:
2540 break;
2541 }
2542
2543 enum opcode opcode;
2544 switch (ir->op) {
2545 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2546 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2547 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2548 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2549 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2550 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2551 case ir_tg4: opcode = has_nonconstant_offset
2552 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2553 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2554 case ir_txb:
2555 unreachable("TXB is not valid for vertex shaders.");
2556 case ir_lod:
2557 unreachable("LOD is not valid for vertex shaders.");
2558 default:
2559 unreachable("Unrecognized tex op");
2560 }
2561
2562 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode);
2563
2564 if (ir->offset != NULL && !has_nonconstant_offset) {
2565 inst->offset =
2566 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2567 ir->offset->type->vector_elements);
2568 }
2569
2570 /* Stuff the channel select bits in the top of the texture offset */
2571 if (ir->op == ir_tg4)
2572 inst->offset |= gather_channel(ir, sampler) << 16;
2573
2574 /* The message header is necessary for:
2575 * - Gen4 (always)
2576 * - Gen9+ for selecting SIMD4x2
2577 * - Texel offsets
2578 * - Gather channel selection
2579 * - Sampler indices too large to fit in a 4-bit value.
2580 */
2581 inst->header_present =
2582 brw->gen < 5 || brw->gen >= 9 ||
2583 inst->offset != 0 || ir->op == ir_tg4 ||
2584 is_high_sampler(brw, sampler_reg);
2585 inst->base_mrf = 2;
2586 inst->mlen = inst->header_present + 1; /* always at least one */
2587 inst->dst = dst_reg(this, ir->type);
2588 inst->dst.writemask = WRITEMASK_XYZW;
2589 inst->shadow_compare = ir->shadow_comparitor != NULL;
2590
2591 inst->src[1] = sampler_reg;
2592
2593 /* MRF for the first parameter */
2594 int param_base = inst->base_mrf + inst->header_present;
2595
2596 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2597 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2598 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2599 } else {
2600 /* Load the coordinate */
2601 /* FINISHME: gl_clamp_mask and saturate */
2602 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2603 int zero_mask = 0xf & ~coord_mask;
2604
2605 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2606 coordinate));
2607
2608 if (zero_mask != 0) {
2609 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2610 src_reg(0)));
2611 }
2612 /* Load the shadow comparitor */
2613 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2614 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2615 WRITEMASK_X),
2616 shadow_comparitor));
2617 inst->mlen++;
2618 }
2619
2620 /* Load the LOD info */
2621 if (ir->op == ir_tex || ir->op == ir_txl) {
2622 int mrf, writemask;
2623 if (brw->gen >= 5) {
2624 mrf = param_base + 1;
2625 if (ir->shadow_comparitor) {
2626 writemask = WRITEMASK_Y;
2627 /* mlen already incremented */
2628 } else {
2629 writemask = WRITEMASK_X;
2630 inst->mlen++;
2631 }
2632 } else /* brw->gen == 4 */ {
2633 mrf = param_base;
2634 writemask = WRITEMASK_W;
2635 }
2636 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2637 } else if (ir->op == ir_txf) {
2638 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2639 } else if (ir->op == ir_txf_ms) {
2640 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2641 sample_index));
2642 if (brw->gen >= 7) {
2643 /* MCS data is in the first channel of `mcs`, but we need to get it into
2644 * the .y channel of the second vec4 of params, so replicate .x across
2645 * the whole vec4 and then mask off everything except .y
2646 */
2647 mcs.swizzle = BRW_SWIZZLE_XXXX;
2648 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2649 mcs));
2650 }
2651 inst->mlen++;
2652 } else if (ir->op == ir_txd) {
2653 const glsl_type *type = lod_type;
2654
2655 if (brw->gen >= 5) {
2656 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2657 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2658 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2659 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2660 inst->mlen++;
2661
2662 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2663 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2664 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2667 inst->mlen++;
2668
2669 if (ir->shadow_comparitor) {
2670 emit(MOV(dst_reg(MRF, param_base + 2,
2671 ir->shadow_comparitor->type, WRITEMASK_Z),
2672 shadow_comparitor));
2673 }
2674 }
2675 } else /* brw->gen == 4 */ {
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2677 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2678 inst->mlen += 2;
2679 }
2680 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2681 if (ir->shadow_comparitor) {
2682 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2683 shadow_comparitor));
2684 }
2685
2686 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2687 offset_value));
2688 inst->mlen++;
2689 }
2690 }
2691
2692 emit(inst);
2693
2694 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2695 * faces * layers, but the spec requires just layers.
2696 */
2697 if (ir->op == ir_txs) {
2698 glsl_type const *type = ir->sampler->type;
2699 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2700 type->sampler_array) {
2701 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2702 writemask(inst->dst, WRITEMASK_Z),
2703 src_reg(inst->dst), src_reg(6));
2704 }
2705 }
2706
2707 if (brw->gen == 6 && ir->op == ir_tg4) {
2708 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2709 }
2710
2711 swizzle_result(ir, src_reg(inst->dst), sampler);
2712 }
2713
2714 /**
2715 * Apply workarounds for Gen6 gather with UINT/SINT
2716 */
2717 void
2718 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2719 {
2720 if (!wa)
2721 return;
2722
2723 int width = (wa & WA_8BIT) ? 8 : 16;
2724 dst_reg dst_f = dst;
2725 dst_f.type = BRW_REGISTER_TYPE_F;
2726
2727 /* Convert from UNORM to UINT */
2728 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2729 emit(MOV(dst, src_reg(dst_f)));
2730
2731 if (wa & WA_SIGN) {
2732 /* Reinterpret the UINT value as a signed INT value by
2733 * shifting the sign bit into place, then shifting back
2734 * preserving sign.
2735 */
2736 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2737 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2738 }
2739 }
2740
2741 /**
2742 * Set up the gather channel based on the swizzle, for gather4.
2743 */
2744 uint32_t
2745 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2746 {
2747 ir_constant *chan = ir->lod_info.component->as_constant();
2748 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2749 switch (swiz) {
2750 case SWIZZLE_X: return 0;
2751 case SWIZZLE_Y:
2752 /* gather4 sampler is broken for green channel on RG32F --
2753 * we must ask for blue instead.
2754 */
2755 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2756 return 2;
2757 return 1;
2758 case SWIZZLE_Z: return 2;
2759 case SWIZZLE_W: return 3;
2760 default:
2761 unreachable("Not reached"); /* zero, one swizzles handled already */
2762 }
2763 }
2764
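/**
 * Apply the GL texture swizzle (key->tex.swizzles) to the raw sampler
 * result: SWIZZLE_ZERO/SWIZZLE_ONE channels are written with immediate
 * moves, and the remaining channels with a single swizzled MOV.
 */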
2765 void
2766 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2767 {
2768 int s = key->tex.swizzles[sampler];
2769
2770 this->result = src_reg(this, ir->type);
2771 dst_reg swizzled_result(this->result);
2772
2773 if (ir->op == ir_query_levels) {
2774 /* # levels is in .w */
2775 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2776 emit(MOV(swizzled_result, orig_val));
2777 return;
2778 }
2779
2780 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2781 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2782 emit(MOV(swizzled_result, orig_val));
2783 return;
2784 }
2785
2786
2787 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2788 int swizzle[4] = {0};
2789
2790 for (int i = 0; i < 4; i++) {
2791 switch (GET_SWZ(s, i)) {
2792 case SWIZZLE_ZERO:
2793 zero_mask |= (1 << i);
2794 break;
2795 case SWIZZLE_ONE:
2796 one_mask |= (1 << i);
2797 break;
2798 default:
2799 copy_mask |= (1 << i);
2800 swizzle[i] = GET_SWZ(s, i);
2801 break;
2802 }
2803 }
2804
2805 if (copy_mask) {
2806 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2807 swizzled_result.writemask = copy_mask;
2808 emit(MOV(swizzled_result, orig_val));
2809 }
2810
2811 if (zero_mask) {
2812 swizzled_result.writemask = zero_mask;
2813 emit(MOV(swizzled_result, src_reg(0.0f)));
2814 }
2815
2816 if (one_mask) {
2817 swizzled_result.writemask = one_mask;
2818 emit(MOV(swizzled_result, src_reg(1.0f)));
2819 }
2820 }
2821
2822 void
2823 vec4_visitor::visit(ir_return *)
2824 {
2825 unreachable("not reached");
2826 }
2827
2828 void
2829 vec4_visitor::visit(ir_discard *)
2830 {
2831 unreachable("not reached");
2832 }
2833
2834 void
2835 vec4_visitor::visit(ir_if *ir)
2836 {
2837 /* Don't point the annotation at the if statement, because then it plus
2838 * the then and else blocks get printed.
2839 */
2840 this->base_ir = ir->condition;
2841
2842 if (brw->gen == 6) {
2843 emit_if_gen6(ir);
2844 } else {
2845 enum brw_predicate predicate;
2846 emit_bool_to_cond_code(ir->condition, &predicate);
2847 emit(IF(predicate));
2848 }
2849
2850 visit_instructions(&ir->then_instructions);
2851
2852 if (!ir->else_instructions.is_empty()) {
2853 this->base_ir = ir->condition;
2854 emit(BRW_OPCODE_ELSE);
2855
2856 visit_instructions(&ir->else_instructions);
2857 }
2858
2859 this->base_ir = ir->condition;
2860 emit(BRW_OPCODE_ENDIF);
2861 }
2862
2863 void
2864 vec4_visitor::visit(ir_emit_vertex *)
2865 {
2866 unreachable("not reached");
2867 }
2868
2869 void
2870 vec4_visitor::visit(ir_end_primitive *)
2871 {
2872 unreachable("not reached");
2873 }
2874
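/**
 * Emit an untyped atomic operation (e.g. BRW_AOP_INC) on the surface at
 * surf_index.  The offset and up to two operands are staged in
 * consecutive MRFs starting at MRF 0.
 */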
2875 void
2876 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2877 dst_reg dst, src_reg offset,
2878 src_reg src0, src_reg src1)
2879 {
2880 unsigned mlen = 0;
2881
2882 /* Set the atomic operation offset. */
2883 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2884 mlen++;
2885
2886 /* Set the atomic operation arguments. */
2887 if (src0.file != BAD_FILE) {
2888 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2889 mlen++;
2890 }
2891
2892 if (src1.file != BAD_FILE) {
2893 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2894 mlen++;
2895 }
2896
2897 /* Emit the instruction. Note that this maps to the normal SIMD8
2898 * untyped atomic message on Ivy Bridge, but that's OK because
2899 * unused channels will be masked out.
2900 */
2901 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2902 src_reg(atomic_op), src_reg(surf_index));
2903 inst->base_mrf = 0;
2904 inst->mlen = mlen;
2905 }
2906
2907 void
2908 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2909 src_reg offset)
2910 {
2911 /* Set the surface read offset. */
2912 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2913
2914 /* Emit the instruction. Note that this maps to the normal SIMD8
2915 * untyped surface read message, but that's OK because unused
2916 * channels will be masked out.
2917 */
2918 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2919 dst, src_reg(surf_index));
2920 inst->base_mrf = 0;
2921 inst->mlen = 1;
2922 }
2923
2924 void
2925 vec4_visitor::emit_ndc_computation()
2926 {
2927 /* Get the position */
2928 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2929
2930 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2931 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2932 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2933
2934 current_annotation = "NDC";
2935 dst_reg ndc_w = ndc;
2936 ndc_w.writemask = WRITEMASK_W;
2937 src_reg pos_w = pos;
2938 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2939 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2940
2941 dst_reg ndc_xyz = ndc;
2942 ndc_xyz.writemask = WRITEMASK_XYZ;
2943
2944 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2945 }
2946
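/**
 * Write the point size, clip flag, and (on Gen6+) layer/viewport fields
 * of the VUE header, depending on the hardware generation and on which
 * varyings the shader actually writes.
 */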
2947 void
2948 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2949 {
2950 if (brw->gen < 6 &&
2951 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2952 key->userclip_active || brw->has_negative_rhw_bug)) {
2953 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2954 dst_reg header1_w = header1;
2955 header1_w.writemask = WRITEMASK_W;
2956
2957 emit(MOV(header1, 0u));
2958
2959 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2960 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2961
2962 current_annotation = "Point size";
2963 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2964 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2965 }
2966
2967 if (key->userclip_active) {
2968 current_annotation = "Clipping flags";
2969 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2970 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2971
2972 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2973 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2974 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2975
2976 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2977 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2978 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2979 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2980 }
2981
2982 /* i965 clipping workaround:
2983 * 1) Test for -ve rhw
2984 * 2) If set,
2985 * set ndc = (0,0,0,0)
2986 * set ucp[6] = 1
2987 *
2988 * Later, clipping will detect ucp[6] and ensure the primitive is
2989 * clipped against all fixed planes.
2990 */
2991 if (brw->has_negative_rhw_bug) {
2992 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2993 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2994 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2995 vec4_instruction *inst;
2996 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2997 inst->predicate = BRW_PREDICATE_NORMAL;
2998 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2999 inst->predicate = BRW_PREDICATE_NORMAL;
3000 }
3001
3002 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3003 } else if (brw->gen < 6) {
3004 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3005 } else {
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3007 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3008 dst_reg reg_w = reg;
3009 reg_w.writemask = WRITEMASK_W;
3010 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3011 }
3012 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3013 dst_reg reg_y = reg;
3014 reg_y.writemask = WRITEMASK_Y;
3015 reg_y.type = BRW_REGISTER_TYPE_D;
3016 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3017 }
3018 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3019 dst_reg reg_z = reg;
3020 reg_z.writemask = WRITEMASK_Z;
3021 reg_z.type = BRW_REGISTER_TYPE_D;
3022 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3023 }
3024 }
3025 }
3026
3027 void
3028 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3029 {
3030 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3031 *
3032 * "If a linked set of shaders forming the vertex stage contains no
3033 * static write to gl_ClipVertex or gl_ClipDistance, but the
3034 * application has requested clipping against user clip planes through
3035 * the API, then the coordinate written to gl_Position is used for
3036 * comparison against the user clip planes."
3037 *
3038 * This function is only called if the shader didn't write to
3039 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3040 * if the user wrote to it; otherwise we use gl_Position.
3041 */
3042 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3043 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3044 clip_vertex = VARYING_SLOT_POS;
3045 }
3046
3047 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3048 ++i) {
3049 reg.writemask = 1 << i;
3050 emit(DP4(reg,
3051 src_reg(output_reg[clip_vertex]),
3052 src_reg(this->userplane[i + offset])));
3053 }
3054 }
3055
3056 vec4_instruction *
3057 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3058 {
3059 assert (varying < VARYING_SLOT_MAX);
3060 reg.type = output_reg[varying].type;
3061 current_annotation = output_reg_annotation[varying];
3062 /* Copy the register, saturating if necessary */
3063 return emit(MOV(reg, src_reg(output_reg[varying])));
3064 }
3065
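/**
 * Stage one VUE slot's worth of data into the given MRF, handling the
 * built-in slots (PSIZ, NDC, POS, EDGE, PAD, COL/BFC) specially and
 * falling back to a plain copy for generic varyings.
 */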
3066 void
3067 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3068 {
3069 reg.type = BRW_REGISTER_TYPE_F;
3070
3071 switch (varying) {
3072 case VARYING_SLOT_PSIZ:
3073 {
3074 /* PSIZ is always in slot 0, and is coupled with other flags. */
3075 current_annotation = "indices, point width, clip flags";
3076 emit_psiz_and_flags(reg);
3077 break;
3078 }
3079 case BRW_VARYING_SLOT_NDC:
3080 current_annotation = "NDC";
3081 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3082 break;
3083 case VARYING_SLOT_POS:
3084 current_annotation = "gl_Position";
3085 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3086 break;
3087 case VARYING_SLOT_EDGE:
3088 /* This is present when doing unfilled polygons. We're supposed to copy
3089 * the edge flag from the user-provided vertex array
3090 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3091 * of that attribute (starts as 1.0f). This is then used in clipping to
3092 * determine which edges should be drawn as wireframe.
3093 */
3094 current_annotation = "edge flag";
3095 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3096 glsl_type::float_type, WRITEMASK_XYZW))));
3097 break;
3098 case BRW_VARYING_SLOT_PAD:
3099 /* No need to write to this slot */
3100 break;
3101 case VARYING_SLOT_COL0:
3102 case VARYING_SLOT_COL1:
3103 case VARYING_SLOT_BFC0:
3104 case VARYING_SLOT_BFC1: {
3105 /* These built-in varyings are only supported in compatibility mode,
3106 * and we only support GS in core profile. So, this must be a vertex
3107 * shader.
3108 */
3109 assert(stage == MESA_SHADER_VERTEX);
3110 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3111 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3112 inst->saturate = true;
3113 break;
3114 }
3115
3116 default:
3117 emit_generic_urb_slot(reg, varying);
3118 break;
3119 }
3120 }
3121
3122 static int
3123 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3124 {
3125 if (brw->gen >= 6) {
3126 /* URB data written (does not include the message header reg) must
3127 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3128 * section 5.4.3.2.2: URB_INTERLEAVED.
3129 *
3130 * URB entries are allocated on a multiple of 1024 bits, so an
3131 * extra 128 bits written here to make the end align to 256 is
3132 * no problem.
3133 */
3134 if ((mlen % 2) != 1)
3135 mlen++;
3136 }
3137
3138 return mlen;
3139 }
3140
3141
3142 /**
3143 * Generates the VUE payload plus the necessary URB write instructions to
3144 * output it.
3145 *
3146 * The VUE layout is documented in Volume 2a.
3147 */
3148 void
3149 vec4_visitor::emit_vertex()
3150 {
3151 /* MRF 0 is reserved for the debugger, so start with message header
3152 * in MRF 1.
3153 */
3154 int base_mrf = 1;
3155 int mrf = base_mrf;
3156 /* In the process of generating our URB write message contents, we
3157 * may need to unspill a register or load from an array. Those
3158 * reads would use MRFs 14-15.
3159 */
3160 int max_usable_mrf = 13;
3161
3162 /* The following assertion verifies that max_usable_mrf causes an
3163 * even-numbered amount of URB write data, which will meet gen6's
3164 * requirements for length alignment.
3165 */
3166 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3167
3168 /* First mrf is the g0-based message header containing URB handles and
3169 * such.
3170 */
3171 emit_urb_write_header(mrf++);
3172
3173 if (brw->gen < 6) {
3174 emit_ndc_computation();
3175 }
3176
3177 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3178 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3179 current_annotation = "user clip distances";
3180
3181 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3182 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3183
3184 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3185 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3186 }
3187
3188 /* We may need to split this up into several URB writes, so do them in a
3189 * loop.
3190 */
3191 int slot = 0;
3192 bool complete = false;
3193 do {
3194 /* URB offset is in URB row increments, and each of our MRFs is half of
3195 * one of those, since we're doing interleaved writes.
3196 */
3197 int offset = slot / 2;
3198
3199 mrf = base_mrf + 1;
3200 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3201 emit_urb_slot(dst_reg(MRF, mrf++),
3202 prog_data->vue_map.slot_to_varying[slot]);
3203
3204 /* If this was max_usable_mrf, we can't fit anything more into this
3205 * URB WRITE.
3206 */
3207 if (mrf > max_usable_mrf) {
3208 slot++;
3209 break;
3210 }
3211 }
3212
3213 complete = slot >= prog_data->vue_map.num_slots;
3214 current_annotation = "URB write";
3215 vec4_instruction *inst = emit_urb_write_opcode(complete);
3216 inst->base_mrf = base_mrf;
3217 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3218 inst->offset += offset;
3219 } while (!complete);
3220 }
3221
3222
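/**
 * Compute the offset operand for a scratch read/write of the vec4 at
 * reg_offset: an immediate for direct access, or a freshly computed
 * register when reladdr is set, scaled for the interleaved (and, pre-Gen6,
 * byte-addressed) layout.
 */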
3223 src_reg
3224 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3225 src_reg *reladdr, int reg_offset)
3226 {
3227 /* Because we store the values to scratch interleaved like our
3228 * vertex data, we need to scale the vec4 index by 2.
3229 */
3230 int message_header_scale = 2;
3231
3232 /* Pre-gen6, the message header uses byte offsets instead of vec4
3233 * (16-byte) offset units.
3234 */
3235 if (brw->gen < 6)
3236 message_header_scale *= 16;
3237
3238 if (reladdr) {
3239 src_reg index = src_reg(this, glsl_type::int_type);
3240
3241 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3242 src_reg(reg_offset)));
3243 emit_before(block, inst, MUL(dst_reg(index), index,
3244 src_reg(message_header_scale)));
3245
3246 return index;
3247 } else {
3248 return src_reg(reg_offset * message_header_scale);
3249 }
3250 }
3251
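/**
 * Compute the offset operand for a pull constant load of the vec4 at
 * reg_offset, handling relative addressing, pre-Gen6 byte offsets, and
 * the Gen8+ send-from-GRF case where the offset must live in a GRF.
 */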
3252 src_reg
3253 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3254 src_reg *reladdr, int reg_offset)
3255 {
3256 if (reladdr) {
3257 src_reg index = src_reg(this, glsl_type::int_type);
3258
3259 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3260 src_reg(reg_offset)));
3261
3262 /* Pre-gen6, the message header uses byte offsets instead of vec4
3263 * (16-byte) offset units.
3264 */
3265 if (brw->gen < 6) {
3266 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3267 }
3268
3269 return index;
3270 } else if (brw->gen >= 8) {
3271 /* Store the offset in a GRF so we can send-from-GRF. */
3272 src_reg offset = src_reg(this, glsl_type::int_type);
3273 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3274 return offset;
3275 } else {
3276 int message_header_scale = brw->gen < 6 ? 16 : 1;
3277 return src_reg(reg_offset * message_header_scale);
3278 }
3279 }
3280
3281 /**
3282 * Emits an instruction before @inst to load the value named by @orig_src
3283 * from scratch space at @base_offset to @temp.
3284 *
3285 * @base_offset is measured in 32-byte units (the size of a register).
3286 */
3287 void
3288 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3289 dst_reg temp, src_reg orig_src,
3290 int base_offset)
3291 {
3292 int reg_offset = base_offset + orig_src.reg_offset;
3293 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3294 reg_offset);
3295
3296 emit_before(block, inst, SCRATCH_READ(temp, index));
3297 }
3298
3299 /**
3300 * Emits an instruction after @inst to store the value to be written
3301 * to @orig_dst to scratch space at @base_offset, from @temp.
3302 *
3303 * @base_offset is measured in 32-byte units (the size of a register).
3304 */
3305 void
3306 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3307 int base_offset)
3308 {
3309 int reg_offset = base_offset + inst->dst.reg_offset;
3310 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3311 reg_offset);
3312
3313 /* Create a temporary register to store *inst's result in.
3314 *
3315 * We have to be careful in MOVing from our temporary result register in
3316 * the scratch write. If we swizzle from channels of the temporary that
3317 * weren't initialized, it will confuse live interval analysis, which will
3318 * make spilling fail to make progress.
3319 */
3320 src_reg temp = src_reg(this, glsl_type::vec4_type);
3321 temp.type = inst->dst.type;
3322 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3323 int swizzles[4];
3324 for (int i = 0; i < 4; i++)
3325 if (inst->dst.writemask & (1 << i))
3326 swizzles[i] = i;
3327 else
3328 swizzles[i] = first_writemask_chan;
3329 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3330 swizzles[2], swizzles[3]);
3331
3332 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3333 inst->dst.writemask));
3334 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3335 write->predicate = inst->predicate;
3336 write->ir = inst->ir;
3337 write->annotation = inst->annotation;
3338 inst->insert_after(block, write);
3339
3340 inst->dst.file = temp.file;
3341 inst->dst.reg = temp.reg;
3342 inst->dst.reg_offset = temp.reg_offset;
3343 inst->dst.reladdr = NULL;
3344 }
3345
3346 /**
3347 * We can't generally support array access in GRF space, because a
3348 * single instruction's destination can only span 2 contiguous
3349 * registers. So, we send all GRF arrays that get variable index
3350 * access to scratch space.
3351 */
3352 void
3353 vec4_visitor::move_grf_array_access_to_scratch()
3354 {
3355 int scratch_loc[this->alloc.count];
3356 memset(scratch_loc, -1, sizeof(scratch_loc));
3357
3358 /* First, calculate the set of virtual GRFs that need to be punted
3359 * to scratch due to having any array access on them, and where in
3360 * scratch.
3361 */
3362 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3363 if (inst->dst.file == GRF && inst->dst.reladdr &&
3364 scratch_loc[inst->dst.reg] == -1) {
3365 scratch_loc[inst->dst.reg] = c->last_scratch;
3366 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3367 }
3368
3369 for (int i = 0 ; i < 3; i++) {
3370 src_reg *src = &inst->src[i];
3371
3372 if (src->file == GRF && src->reladdr &&
3373 scratch_loc[src->reg] == -1) {
3374 scratch_loc[src->reg] = c->last_scratch;
3375 c->last_scratch += this->alloc.sizes[src->reg];
3376 }
3377 }
3378 }
3379
3380 /* Now, for anything that will be accessed through scratch, rewrite
3381 * it to load/store. Note that this is a _safe list walk, because
3382 * we may generate a new scratch_write instruction after the one
3383 * we're processing.
3384 */
3385 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3386 /* Set up the annotation tracking for new generated instructions. */
3387 base_ir = inst->ir;
3388 current_annotation = inst->annotation;
3389
3390 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3391 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3392 }
3393
3394 for (int i = 0 ; i < 3; i++) {
3395 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3396 continue;
3397
3398 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3399
3400 emit_scratch_read(block, inst, temp, inst->src[i],
3401 scratch_loc[inst->src[i].reg]);
3402
3403 inst->src[i].file = temp.file;
3404 inst->src[i].reg = temp.reg;
3405 inst->src[i].reg_offset = temp.reg_offset;
3406 inst->src[i].reladdr = NULL;
3407 }
3408 }
3409 }
3410
3411 /**
3412 * Emits an instruction before @inst to load the value named by @orig_src
3413 * from the pull constant buffer (surface) at @base_offset to @temp.
3414 */
3415 void
3416 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3417 dst_reg temp, src_reg orig_src,
3418 int base_offset)
3419 {
3420 int reg_offset = base_offset + orig_src.reg_offset;
3421 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3422 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3423 reg_offset);
3424 vec4_instruction *load;
3425
3426 if (brw->gen >= 7) {
3427 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3428 grf_offset.type = offset.type;
3429 emit_before(block, inst, MOV(grf_offset, offset));
3430
3431 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3432 temp, index, src_reg(grf_offset));
3433 } else {
3434 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3435 temp, index, offset);
3436 load->base_mrf = 14;
3437 load->mlen = 1;
3438 }
3439 emit_before(block, inst, load);
3440 }
3441
3442 /**
3443 * Implements array access of uniforms by inserting a
3444 * PULL_CONSTANT_LOAD instruction.
3445 *
3446 * Unlike temporary GRF array access (where we don't support it due to
3447 * the difficulty of doing relative addressing on instruction
3448 * destinations), we could potentially do array access of uniforms
3449 * that were loaded in GRF space as push constants. In real-world
3450 * usage we've seen, though, the arrays being used are always larger
3451 * than we could load as push constants, so just always move all
3452 * uniform array access out to a pull constant buffer.
3453 */
3454 void
3455 vec4_visitor::move_uniform_array_access_to_pull_constants()
3456 {
3457 int pull_constant_loc[this->uniforms];
3458 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3459 bool nested_reladdr;
3460
3461 /* Walk through and find array access of uniforms. Put a copy of that
3462 * uniform in the pull constant buffer.
3463 *
3464 * Note that we don't move constant-indexed accesses to arrays. No
3465 * testing has been done of the performance impact of this choice.
3466 */
3467 do {
3468 nested_reladdr = false;
3469
3470 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3471 for (int i = 0 ; i < 3; i++) {
3472 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3473 continue;
3474
3475 int uniform = inst->src[i].reg;
3476
3477 if (inst->src[i].reladdr->reladdr)
3478 nested_reladdr = true; /* will need another pass */
3479
3480 /* If this array isn't already present in the pull constant buffer,
3481 * add it.
3482 */
3483 if (pull_constant_loc[uniform] == -1) {
3484 const gl_constant_value **values =
3485 &stage_prog_data->param[uniform * 4];
3486
3487 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3488
3489 assert(uniform < uniform_array_size);
3490 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3491 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3492 = values[j];
3493 }
3494 }
3495
3496 /* Set up the annotation tracking for new generated instructions. */
3497 base_ir = inst->ir;
3498 current_annotation = inst->annotation;
3499
3500 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3501
3502 emit_pull_constant_load(block, inst, temp, inst->src[i],
3503 pull_constant_loc[uniform]);
3504
3505 inst->src[i].file = temp.file;
3506 inst->src[i].reg = temp.reg;
3507 inst->src[i].reg_offset = temp.reg_offset;
3508 inst->src[i].reladdr = NULL;
3509 }
3510 }
3511 } while (nested_reladdr);
3512
3513 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3514 * no need to track them as larger-than-vec4 objects. This will be
3515 * relied on in cutting out unused uniform vectors from push
3516 * constants.
3517 */
3518 split_uniform_registers();
3519 }
3520
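/* If the source is UD-typed and has the negate modifier set, materialize
 * the negated value into a temporary so that later instructions see a
 * plain source with no modifier.
 */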
3521 void
3522 vec4_visitor::resolve_ud_negate(src_reg *reg)
3523 {
3524 if (reg->type != BRW_REGISTER_TYPE_UD ||
3525 !reg->negate)
3526 return;
3527
3528 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3529 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3530 *reg = temp;
3531 }
3532
3533 /**
3534 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3535 *
3536 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3537 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3538 */
3539 void
3540 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3541 {
3542 assert(brw->gen <= 5);
3543
3544 if (!rvalue->type->is_boolean())
3545 return;
3546
3547 src_reg and_result = src_reg(this, rvalue->type);
3548 src_reg neg_result = src_reg(this, rvalue->type);
3549 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3550 emit(MOV(dst_reg(neg_result), negate(and_result)));
3551 *reg = neg_result;
3552 }
3553
3554 vec4_visitor::vec4_visitor(struct brw_context *brw,
3555 struct brw_vec4_compile *c,
3556 struct gl_program *prog,
3557 const struct brw_vue_prog_key *key,
3558 struct brw_vue_prog_data *prog_data,
3559 struct gl_shader_program *shader_prog,
3560 gl_shader_stage stage,
3561 void *mem_ctx,
3562 bool debug_flag,
3563 bool no_spills,
3564 shader_time_shader_type st_base,
3565 shader_time_shader_type st_written,
3566 shader_time_shader_type st_reset)
3567 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3568 c(c),
3569 key(key),
3570 prog_data(prog_data),
3571 sanity_param_count(0),
3572 fail_msg(NULL),
3573 first_non_payload_grf(0),
3574 need_all_constants_in_pull_buffer(false),
3575 debug_flag(debug_flag),
3576 no_spills(no_spills),
3577 st_base(st_base),
3578 st_written(st_written),
3579 st_reset(st_reset)
3580 {
3581 this->mem_ctx = mem_ctx;
3582 this->failed = false;
3583
3584 this->base_ir = NULL;
3585 this->current_annotation = NULL;
3586 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3587
3588 this->variable_ht = hash_table_ctor(0,
3589 hash_table_pointer_hash,
3590 hash_table_pointer_compare);
3591
3592 this->virtual_grf_start = NULL;
3593 this->virtual_grf_end = NULL;
3594 this->live_intervals = NULL;
3595
3596 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3597
3598 this->uniforms = 0;
3599
3600 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3601 * at least one. See setup_uniforms() in brw_vec4.cpp.
3602 */
3603 this->uniform_array_size = 1;
3604 if (prog_data) {
3605 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3606 }
3607
3608 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3609 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3610 }
3611
3612 vec4_visitor::~vec4_visitor()
3613 {
3614 hash_table_dtor(this->variable_ht);
3615 }
3616
3617
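/**
 * Mark the compile as failed and record the (first) failure message in
 * fail_msg; the message is also printed to stderr when the stage's debug
 * flag is set.
 */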
3618 void
3619 vec4_visitor::fail(const char *format, ...)
3620 {
3621 va_list va;
3622 char *msg;
3623
3624 if (failed)
3625 return;
3626
3627 failed = true;
3628
3629 va_start(va, format);
3630 msg = ralloc_vasprintf(mem_ctx, format, va);
3631 va_end(va);
3632 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3633
3634 this->fail_msg = msg;
3635
3636 if (debug_flag) {
3637 fprintf(stderr, "%s", msg);
3638 }
3639 }
3640
3641 } /* namespace brw */