1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
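/**
 * Insert new_inst into the instruction stream immediately before inst,
 * inheriting inst's IR pointer and annotation for debug output.
 */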
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
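/* Like SCRATCH_READ, but the fixed MRF payload is one register longer
 * because it also carries the data being written.
 */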
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
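/* elements must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively. */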
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
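/* Pre-gen6 math is a send to the shared math unit, so the operands are
 * passed through MRFs: one message register per source present.
 */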
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
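/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV turns them into the
 * integer shift counts <0, 8, 16, 24>.
 */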
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
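/**
 * Returns the size of the given GLSL type in whole vec4 registers, the
 * allocation granularity of the vec4 backend.
 */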
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
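/* Append the user clip planes as vec4 push constants, recording in
 * userplane[] which uniform slot each plane occupies.
 */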
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms] = 0;
760 /* Add each of the unique swizzled channels of the element.
761 * This will end up matching the size of the glsl_type of this field.
762 */
763 int last_swiz = -1;
764 for (unsigned int j = 0; j < 4; j++) {
765 int swiz = GET_SWZ(slots[i].swizzle, j);
766 last_swiz = swiz;
767
768 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
769 assert(this->uniforms < uniform_array_size);
770 if (swiz <= last_swiz)
771 this->uniform_vector_size[this->uniforms]++;
772 }
773 this->uniforms++;
774 }
775 }
776
777 dst_reg *
778 vec4_visitor::variable_storage(ir_variable *var)
779 {
780 return (dst_reg *)hash_table_find(this->variable_ht, var);
781 }
782
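/**
 * Evaluate a boolean rvalue and leave its result in the flag register,
 * storing in *predicate the predication mode the consumer should use
 * (BRW_PREDICATE_NORMAL, or the ALL4H/ANY4H variants for the vector
 * comparison operations).
 */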
783 void
784 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
785 enum brw_predicate *predicate)
786 {
787 ir_expression *expr = ir->as_expression();
788
789 *predicate = BRW_PREDICATE_NORMAL;
790
791 if (expr && expr->operation != ir_binop_ubo_load) {
792 src_reg op[3];
793 vec4_instruction *inst;
794
795 assert(expr->get_num_operands() <= 3);
796 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
797 expr->operands[i]->accept(this);
798 op[i] = this->result;
799
800 resolve_ud_negate(&op[i]);
801 }
802
803 switch (expr->operation) {
804 case ir_unop_logic_not:
805 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
806 inst->conditional_mod = BRW_CONDITIONAL_Z;
807 break;
808
809 case ir_binop_logic_xor:
810 if (brw->gen <= 5) {
811 src_reg temp = src_reg(this, ir->type);
812 emit(XOR(dst_reg(temp), op[0], op[1]));
813 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
814 } else {
815 inst = emit(XOR(dst_null_d(), op[0], op[1]));
816 }
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 break;
819
820 case ir_binop_logic_or:
821 if (brw->gen <= 5) {
822 src_reg temp = src_reg(this, ir->type);
823 emit(OR(dst_reg(temp), op[0], op[1]));
824 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
825 } else {
826 inst = emit(OR(dst_null_d(), op[0], op[1]));
827 }
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 if (brw->gen <= 5) {
833 src_reg temp = src_reg(this, ir->type);
834 emit(AND(dst_reg(temp), op[0], op[1]));
835 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
836 } else {
837 inst = emit(AND(dst_null_d(), op[0], op[1]));
838 }
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 break;
841
842 case ir_unop_f2b:
843 if (brw->gen >= 6) {
844 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
845 } else {
846 inst = emit(MOV(dst_null_f(), op[0]));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 break;
850
851 case ir_unop_i2b:
852 if (brw->gen >= 6) {
853 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
854 } else {
855 inst = emit(MOV(dst_null_d(), op[0]));
856 inst->conditional_mod = BRW_CONDITIONAL_NZ;
857 }
858 break;
859
860 case ir_binop_all_equal:
861 if (brw->gen <= 5) {
862 resolve_bool_comparison(expr->operands[0], &op[0]);
863 resolve_bool_comparison(expr->operands[1], &op[1]);
864 }
865 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
867 break;
868
869 case ir_binop_any_nequal:
870 if (brw->gen <= 5) {
871 resolve_bool_comparison(expr->operands[0], &op[0]);
872 resolve_bool_comparison(expr->operands[1], &op[1]);
873 }
874 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
876 break;
877
878 case ir_unop_any:
879 if (brw->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 }
882 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
883 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
884 break;
885
886 case ir_binop_greater:
887 case ir_binop_gequal:
888 case ir_binop_less:
889 case ir_binop_lequal:
890 case ir_binop_equal:
891 case ir_binop_nequal:
892 if (brw->gen <= 5) {
893 resolve_bool_comparison(expr->operands[0], &op[0]);
894 resolve_bool_comparison(expr->operands[1], &op[1]);
895 }
896 emit(CMP(dst_null_d(), op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 break;
899
900 case ir_triop_csel: {
901 /* Expand the boolean condition into the flag register. */
902 inst = emit(MOV(dst_null_d(), op[0]));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904
905 /* Select which boolean to return. */
906 dst_reg temp(this, expr->operands[1]->type);
907 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
908 inst->predicate = BRW_PREDICATE_NORMAL;
909
910 /* Expand the result to a condition code. */
911 inst = emit(MOV(dst_null_d(), src_reg(temp)));
912 inst->conditional_mod = BRW_CONDITIONAL_NZ;
913 break;
914 }
915
916 default:
917 unreachable("not reached");
918 }
919 return;
920 }
921
922 ir->accept(this);
923
924 resolve_ud_negate(&this->result);
925
926 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
927 inst->conditional_mod = BRW_CONDITIONAL_NZ;
928 }
929
930 /**
931 * Emit a gen6 IF statement with the comparison folded into the IF
932 * instruction.
933 */
934 void
935 vec4_visitor::emit_if_gen6(ir_if *ir)
936 {
937 ir_expression *expr = ir->condition->as_expression();
938
939 if (expr && expr->operation != ir_binop_ubo_load) {
940 src_reg op[3];
941 dst_reg temp;
942
943 assert(expr->get_num_operands() <= 3);
944 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
945 expr->operands[i]->accept(this);
946 op[i] = this->result;
947 }
948
949 switch (expr->operation) {
950 case ir_unop_logic_not:
951 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
952 return;
953
954 case ir_binop_logic_xor:
955 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
956 return;
957
958 case ir_binop_logic_or:
959 temp = dst_reg(this, glsl_type::bool_type);
960 emit(OR(temp, op[0], op[1]));
961 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
962 return;
963
964 case ir_binop_logic_and:
965 temp = dst_reg(this, glsl_type::bool_type);
966 emit(AND(temp, op[0], op[1]));
967 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_f2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_unop_i2b:
975 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_greater:
979 case ir_binop_gequal:
980 case ir_binop_less:
981 case ir_binop_lequal:
982 case ir_binop_equal:
983 case ir_binop_nequal:
984 emit(IF(op[0], op[1],
985 brw_conditional_for_comparison(expr->operation)));
986 return;
987
988 case ir_binop_all_equal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
990 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
991 return;
992
993 case ir_binop_any_nequal:
994 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_unop_any:
999 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1001 return;
1002
1003 case ir_triop_csel: {
1004 /* Expand the boolean condition into the flag register. */
1005 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1006 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1007
1008 /* Select which boolean to return. */
1009 dst_reg temp(this, expr->operands[1]->type);
1010 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1011 inst->predicate = BRW_PREDICATE_NORMAL;
1012
1013 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1014 return;
1015 }
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020 return;
1021 }
1022
1023 ir->condition->accept(this);
1024
1025 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_variable *ir)
1030 {
1031 dst_reg *reg = NULL;
1032
1033 if (variable_storage(ir))
1034 return;
1035
1036 switch (ir->data.mode) {
1037 case ir_var_shader_in:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1040 break;
1041
1042 case ir_var_shader_out:
1043 assert(ir->data.location != -1);
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045
1046 for (int i = 0; i < type_size(ir->type); i++) {
1047 output_reg[ir->data.location + i] = *reg;
1048 output_reg[ir->data.location + i].reg_offset = i;
1049 output_reg[ir->data.location + i].type =
1050 brw_type_for_base_type(ir->type->get_scalar_type());
1051 output_reg_annotation[ir->data.location + i] = ir->name;
1052 }
1053 break;
1054
1055 case ir_var_auto:
1056 case ir_var_temporary:
1057 reg = new(mem_ctx) dst_reg(this, ir->type);
1058 break;
1059
1060 case ir_var_uniform:
1061 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1062
1063 /* Thanks to the lower_ubo_reference pass, we will see only
1064 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1065 * variables, so no need for them to be in variable_ht.
1066 *
1067 * Some uniforms, such as samplers and atomic counters, have no actual
1068 * storage, so we should ignore them.
1069 */
1070 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1071 return;
1072
1073 /* Track how big the whole uniform variable is, in case we need to put a
1074 * copy of its data into pull constants for array access.
1075 */
1076 assert(this->uniforms < uniform_array_size);
1077 this->uniform_size[this->uniforms] = type_size(ir->type);
1078
1079 if (!strncmp(ir->name, "gl_", 3)) {
1080 setup_builtin_uniform_values(ir);
1081 } else {
1082 setup_uniform_values(ir);
1083 }
1084 break;
1085
1086 case ir_var_system_value:
1087 reg = make_reg_for_system_value(ir);
1088 break;
1089
1090 default:
1091 unreachable("not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 /* We don't want debugging output to print the whole body of the
1102 * loop as the annotation.
1103 */
1104 this->base_ir = NULL;
1105
1106 emit(BRW_OPCODE_DO);
1107
1108 visit_instructions(&ir->body_instructions);
1109
1110 emit(BRW_OPCODE_WHILE);
1111 }
1112
1113 void
1114 vec4_visitor::visit(ir_loop_jump *ir)
1115 {
1116 switch (ir->mode) {
1117 case ir_loop_jump::jump_break:
1118 emit(BRW_OPCODE_BREAK);
1119 break;
1120 case ir_loop_jump::jump_continue:
1121 emit(BRW_OPCODE_CONTINUE);
1122 break;
1123 }
1124 }
1125
1126
1127 void
1128 vec4_visitor::visit(ir_function_signature *)
1129 {
1130 unreachable("not reached");
1131 }
1132
1133 void
1134 vec4_visitor::visit(ir_function *ir)
1135 {
1136 /* Ignore function bodies other than main() -- we shouldn't see calls to
1137 * them since they should all be inlined.
1138 */
1139 if (strcmp(ir->name, "main") == 0) {
1140 const ir_function_signature *sig;
1141 exec_list empty;
1142
1143 sig = ir->matching_signature(NULL, &empty, false);
1144
1145 assert(sig);
1146
1147 visit_instructions(&sig->body);
1148 }
1149 }
1150
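/**
 * Try to match an add of a multiply (in either operand order, with an
 * optional negate or abs on the multiply) and emit it as a single MAD.
 * Returns false if the pattern doesn't match or MAD can't be used.
 */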
1151 bool
1152 vec4_visitor::try_emit_mad(ir_expression *ir)
1153 {
1154 /* 3-src instructions were introduced in gen6. */
1155 if (brw->gen < 6)
1156 return false;
1157
1158 /* MAD can only handle floating-point data. */
1159 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1160 return false;
1161
1162 ir_rvalue *nonmul;
1163 ir_expression *mul;
1164 bool mul_negate, mul_abs;
1165
1166 for (int i = 0; i < 2; i++) {
1167 mul_negate = false;
1168 mul_abs = false;
1169
1170 mul = ir->operands[i]->as_expression();
1171 nonmul = ir->operands[1 - i];
1172
1173 if (mul && mul->operation == ir_unop_abs) {
1174 mul = mul->operands[0]->as_expression();
1175 mul_abs = true;
1176 } else if (mul && mul->operation == ir_unop_neg) {
1177 mul = mul->operands[0]->as_expression();
1178 mul_negate = true;
1179 }
1180
1181 if (mul && mul->operation == ir_binop_mul)
1182 break;
1183 }
1184
1185 if (!mul || mul->operation != ir_binop_mul)
1186 return false;
1187
1188 nonmul->accept(this);
1189 src_reg src0 = fix_3src_operand(this->result);
1190
1191 mul->operands[0]->accept(this);
1192 src_reg src1 = fix_3src_operand(this->result);
1193 src1.negate ^= mul_negate;
1194 src1.abs = mul_abs;
1195 if (mul_abs)
1196 src1.negate = false;
1197
1198 mul->operands[1]->accept(this);
1199 src_reg src2 = fix_3src_operand(this->result);
1200 src2.abs = mul_abs;
1201 if (mul_abs)
1202 src2.negate = false;
1203
1204 this->result = src_reg(this, ir->type);
1205 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1206
1207 return true;
1208 }
1209
1210 bool
1211 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1212 {
1213 /* This optimization relies on CMP setting the destination to 0 when
1214 * false. Early hardware only sets the least significant bit, and
1215 * leaves the other bits undefined. So we can't use it.
1216 */
1217 if (brw->gen < 6)
1218 return false;
1219
1220 ir_expression *const cmp = ir->operands[0]->as_expression();
1221
1222 if (cmp == NULL)
1223 return false;
1224
1225 switch (cmp->operation) {
1226 case ir_binop_less:
1227 case ir_binop_greater:
1228 case ir_binop_lequal:
1229 case ir_binop_gequal:
1230 case ir_binop_equal:
1231 case ir_binop_nequal:
1232 break;
1233
1234 default:
1235 return false;
1236 }
1237
1238 cmp->operands[0]->accept(this);
1239 const src_reg cmp_src0 = this->result;
1240
1241 cmp->operands[1]->accept(this);
1242 const src_reg cmp_src1 = this->result;
1243
1244 this->result = src_reg(this, ir->type);
1245
1246 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1247 brw_conditional_for_comparison(cmp->operation)));
1248
1249 /* If the comparison is false, this->result will just happen to be zero.
1250 */
1251 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1252 this->result, src_reg(1.0f));
1253 inst->predicate = BRW_PREDICATE_NORMAL;
1254 inst->predicate_inverse = true;
1255
1256 return true;
1257 }
1258
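/**
 * Emit a min (BRW_CONDITIONAL_L) or max (BRW_CONDITIONAL_GE) of src0 and
 * src1 into dst.
 */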
1259 void
1260 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1261 src_reg src0, src_reg src1)
1262 {
1263 vec4_instruction *inst;
1264
1265 if (brw->gen >= 6) {
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->conditional_mod = conditionalmod;
1268 } else {
1269 emit(CMP(dst, src0, src1, conditionalmod));
1270
1271 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1272 inst->predicate = BRW_PREDICATE_NORMAL;
1273 }
1274 }
1275
1276 void
1277 vec4_visitor::emit_lrp(const dst_reg &dst,
1278 const src_reg &x, const src_reg &y, const src_reg &a)
1279 {
1280 if (brw->gen >= 6) {
1281 /* Note that the instruction's argument order is reversed from GLSL
1282 * and the IR.
1283 */
1284 emit(LRP(dst,
1285 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1286 } else {
1287 /* Earlier generations don't support three source operations, so we
1288 * need to emit x*(1-a) + y*a.
1289 */
1290 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1291 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1292 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1293 y_times_a.writemask = dst.writemask;
1294 one_minus_a.writemask = dst.writemask;
1295 x_times_one_minus_a.writemask = dst.writemask;
1296
1297 emit(MUL(y_times_a, y, a));
1298 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1299 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1300 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1301 }
1302 }
1303
1304 void
1305 vec4_visitor::visit(ir_expression *ir)
1306 {
1307 unsigned int operand;
1308 src_reg op[ARRAY_SIZE(ir->operands)];
1309 vec4_instruction *inst;
1310
1311 if (ir->operation == ir_binop_add) {
1312 if (try_emit_mad(ir))
1313 return;
1314 }
1315
1316 if (ir->operation == ir_unop_b2f) {
1317 if (try_emit_b2f_of_compare(ir))
1318 return;
1319 }
1320
1321 /* Storage for our result. Ideally for an assignment we'd be using
1322 * the actual storage for the result here, instead.
1323 */
1324 dst_reg result_dst(this, ir->type);
1325 src_reg result_src(result_dst);
1326
1327 if (ir->operation == ir_triop_csel) {
1328 ir->operands[1]->accept(this);
1329 op[1] = this->result;
1330 ir->operands[2]->accept(this);
1331 op[2] = this->result;
1332
1333 enum brw_predicate predicate;
1334 emit_bool_to_cond_code(ir->operands[0], &predicate);
1335 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1336 inst->predicate = predicate;
1337 this->result = result_src;
1338 return;
1339 }
1340
1341 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1342 this->result.file = BAD_FILE;
1343 ir->operands[operand]->accept(this);
1344 if (this->result.file == BAD_FILE) {
1345 fprintf(stderr, "Failed to get tree for expression operand:\n");
1346 ir->operands[operand]->fprint(stderr);
1347 exit(1);
1348 }
1349 op[operand] = this->result;
1350
1351 /* Matrix expression operands should have been broken down to vector
1352 * operations already.
1353 */
1354 assert(!ir->operands[operand]->type->is_matrix());
1355 }
1356
1357 /* If nothing special happens, this is the result. */
1358 this->result = result_src;
1359
1360 switch (ir->operation) {
1361 case ir_unop_logic_not:
1362 emit(NOT(result_dst, op[0]));
1363 break;
1364 case ir_unop_neg:
1365 op[0].negate = !op[0].negate;
1366 emit(MOV(result_dst, op[0]));
1367 break;
1368 case ir_unop_abs:
1369 op[0].abs = true;
1370 op[0].negate = false;
1371 emit(MOV(result_dst, op[0]));
1372 break;
1373
1374 case ir_unop_sign:
1375 if (ir->type->is_float()) {
1376 /* AND(val, 0x80000000) gives the sign bit.
1377 *
1378 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1379 * zero.
1380 */
1381 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1382
1383 op[0].type = BRW_REGISTER_TYPE_UD;
1384 result_dst.type = BRW_REGISTER_TYPE_UD;
1385 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1386
1387 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1388 inst->predicate = BRW_PREDICATE_NORMAL;
1389
1390 this->result.type = BRW_REGISTER_TYPE_F;
1391 } else {
1392 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1393 * -> non-negative val generates 0x00000000.
1394 * Predicated OR sets 1 if val is positive.
1395 */
1396 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1397
1398 emit(ASR(result_dst, op[0], src_reg(31)));
1399
1400 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 }
1403 break;
1404
1405 case ir_unop_rcp:
1406 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1407 break;
1408
1409 case ir_unop_exp2:
1410 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1411 break;
1412 case ir_unop_log2:
1413 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1414 break;
1415 case ir_unop_exp:
1416 case ir_unop_log:
1417 unreachable("not reached: should be handled by ir_explog_to_explog2");
1418 case ir_unop_sin:
1419 case ir_unop_sin_reduced:
1420 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1421 break;
1422 case ir_unop_cos:
1423 case ir_unop_cos_reduced:
1424 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1425 break;
1426
1427 case ir_unop_dFdx:
1428 case ir_unop_dFdx_coarse:
1429 case ir_unop_dFdx_fine:
1430 case ir_unop_dFdy:
1431 case ir_unop_dFdy_coarse:
1432 case ir_unop_dFdy_fine:
1433 unreachable("derivatives not valid in vertex shader");
1434
1435 case ir_unop_bitfield_reverse:
1436 emit(BFREV(result_dst, op[0]));
1437 break;
1438 case ir_unop_bit_count:
1439 emit(CBIT(result_dst, op[0]));
1440 break;
1441 case ir_unop_find_msb: {
1442 src_reg temp = src_reg(this, glsl_type::uint_type);
1443
1444 inst = emit(FBH(dst_reg(temp), op[0]));
1445 inst->dst.writemask = WRITEMASK_XYZW;
1446
1447 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1448 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1449 * subtract the result from 31 to convert the MSB count into an LSB count.
1450 */
1451
1452 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1453 temp.swizzle = BRW_SWIZZLE_NOOP;
1454 emit(MOV(result_dst, temp));
1455
1456 src_reg src_tmp = src_reg(result_dst);
1457 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1458
1459 src_tmp.negate = true;
1460 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1461 inst->predicate = BRW_PREDICATE_NORMAL;
1462 break;
1463 }
1464 case ir_unop_find_lsb:
1465 emit(FBL(result_dst, op[0]));
1466 break;
1467 case ir_unop_saturate:
1468 inst = emit(MOV(result_dst, op[0]));
1469 inst->saturate = true;
1470 break;
1471
1472 case ir_unop_noise:
1473 unreachable("not reached: should be handled by lower_noise");
1474
1475 case ir_binop_add:
1476 emit(ADD(result_dst, op[0], op[1]));
1477 break;
1478 case ir_binop_sub:
1479 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1480
1481 case ir_binop_mul:
1482 if (brw->gen < 8 && ir->type->is_integer()) {
1483 /* For integer multiplication, the MUL uses the low 16 bits of one of
1484 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1485 * accumulates in the contribution of the upper 16 bits of that
1486 * operand. If we can determine that one of the args is in the low
1487 * 16 bits, though, we can just emit a single MUL.
1488 */
1489 if (ir->operands[0]->is_uint16_constant()) {
1490 if (brw->gen < 7)
1491 emit(MUL(result_dst, op[0], op[1]));
1492 else
1493 emit(MUL(result_dst, op[1], op[0]));
1494 } else if (ir->operands[1]->is_uint16_constant()) {
1495 if (brw->gen < 7)
1496 emit(MUL(result_dst, op[1], op[0]));
1497 else
1498 emit(MUL(result_dst, op[0], op[1]));
1499 } else {
1500 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1501
1502 emit(MUL(acc, op[0], op[1]));
1503 emit(MACH(dst_null_d(), op[0], op[1]));
1504 emit(MOV(result_dst, src_reg(acc)));
1505 }
1506 } else {
1507 emit(MUL(result_dst, op[0], op[1]));
1508 }
1509 break;
1510 case ir_binop_imul_high: {
1511 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1512
1513 emit(MUL(acc, op[0], op[1]));
1514 emit(MACH(result_dst, op[0], op[1]));
1515 break;
1516 }
1517 case ir_binop_div:
1518 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1519 assert(ir->type->is_integer());
1520 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1521 break;
1522 case ir_binop_carry: {
1523 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1524
1525 emit(ADDC(dst_null_ud(), op[0], op[1]));
1526 emit(MOV(result_dst, src_reg(acc)));
1527 break;
1528 }
1529 case ir_binop_borrow: {
1530 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1531
1532 emit(SUBB(dst_null_ud(), op[0], op[1]));
1533 emit(MOV(result_dst, src_reg(acc)));
1534 break;
1535 }
1536 case ir_binop_mod:
1537 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1538 assert(ir->type->is_integer());
1539 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1540 break;
1541
1542 case ir_binop_less:
1543 case ir_binop_greater:
1544 case ir_binop_lequal:
1545 case ir_binop_gequal:
1546 case ir_binop_equal:
1547 case ir_binop_nequal: {
1548 if (brw->gen <= 5) {
1549 resolve_bool_comparison(ir->operands[0], &op[0]);
1550 resolve_bool_comparison(ir->operands[1], &op[1]);
1551 }
1552 emit(CMP(result_dst, op[0], op[1],
1553 brw_conditional_for_comparison(ir->operation)));
1554 break;
1555 }
1556
1557 case ir_binop_all_equal:
1558 if (brw->gen <= 5) {
1559 resolve_bool_comparison(ir->operands[0], &op[0]);
1560 resolve_bool_comparison(ir->operands[1], &op[1]);
1561 }
1562
1563 /* "==" operator producing a scalar boolean. */
1564 if (ir->operands[0]->type->is_vector() ||
1565 ir->operands[1]->type->is_vector()) {
1566 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1567 emit(MOV(result_dst, src_reg(0)));
1568 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1569 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1570 } else {
1571 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1572 }
1573 break;
1574 case ir_binop_any_nequal:
1575 if (brw->gen <= 5) {
1576 resolve_bool_comparison(ir->operands[0], &op[0]);
1577 resolve_bool_comparison(ir->operands[1], &op[1]);
1578 }
1579
1580 /* "!=" operator producing a scalar boolean. */
1581 if (ir->operands[0]->type->is_vector() ||
1582 ir->operands[1]->type->is_vector()) {
1583 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1584
1585 emit(MOV(result_dst, src_reg(0)));
1586 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1587 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1588 } else {
1589 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1590 }
1591 break;
1592
1593 case ir_unop_any:
1594 if (brw->gen <= 5) {
1595 resolve_bool_comparison(ir->operands[0], &op[0]);
1596 }
1597 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1598 emit(MOV(result_dst, src_reg(0)));
1599
1600 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1601 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1602 break;
1603
1604 case ir_binop_logic_xor:
1605 emit(XOR(result_dst, op[0], op[1]));
1606 break;
1607
1608 case ir_binop_logic_or:
1609 emit(OR(result_dst, op[0], op[1]));
1610 break;
1611
1612 case ir_binop_logic_and:
1613 emit(AND(result_dst, op[0], op[1]));
1614 break;
1615
1616 case ir_binop_dot:
1617 assert(ir->operands[0]->type->is_vector());
1618 assert(ir->operands[0]->type == ir->operands[1]->type);
1619 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1620 break;
1621
1622 case ir_unop_sqrt:
1623 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1624 break;
1625 case ir_unop_rsq:
1626 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1627 break;
1628
1629 case ir_unop_bitcast_i2f:
1630 case ir_unop_bitcast_u2f:
1631 this->result = op[0];
1632 this->result.type = BRW_REGISTER_TYPE_F;
1633 break;
1634
1635 case ir_unop_bitcast_f2i:
1636 this->result = op[0];
1637 this->result.type = BRW_REGISTER_TYPE_D;
1638 break;
1639
1640 case ir_unop_bitcast_f2u:
1641 this->result = op[0];
1642 this->result.type = BRW_REGISTER_TYPE_UD;
1643 break;
1644
1645 case ir_unop_i2f:
1646 case ir_unop_i2u:
1647 case ir_unop_u2i:
1648 case ir_unop_u2f:
1649 case ir_unop_f2i:
1650 case ir_unop_f2u:
1651 emit(MOV(result_dst, op[0]));
1652 break;
1653 case ir_unop_b2i:
1654 emit(AND(result_dst, op[0], src_reg(1)));
1655 break;
1656 case ir_unop_b2f:
1657 if (brw->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 }
1660 op[0].type = BRW_REGISTER_TYPE_D;
1661 result_dst.type = BRW_REGISTER_TYPE_D;
1662 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1663 result_dst.type = BRW_REGISTER_TYPE_F;
1664 break;
1665 case ir_unop_f2b:
1666 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1667 break;
1668 case ir_unop_i2b:
1669 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1670 break;
1671
1672 case ir_unop_trunc:
1673 emit(RNDZ(result_dst, op[0]));
1674 break;
1675 case ir_unop_ceil: {
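/* Implement ceil(x) as -RNDD(-x), i.e. -floor(-x). */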
1676 src_reg tmp = src_reg(this, ir->type);
1677 op[0].negate = !op[0].negate;
1678 emit(RNDD(dst_reg(tmp), op[0]));
1679 tmp.negate = true;
1680 emit(MOV(result_dst, tmp));
1681 }
1682 break;
1683 case ir_unop_floor:
1684 inst = emit(RNDD(result_dst, op[0]));
1685 break;
1686 case ir_unop_fract:
1687 inst = emit(FRC(result_dst, op[0]));
1688 break;
1689 case ir_unop_round_even:
1690 emit(RNDE(result_dst, op[0]));
1691 break;
1692
1693 case ir_binop_min:
1694 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1695 break;
1696 case ir_binop_max:
1697 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1698 break;
1699
1700 case ir_binop_pow:
1701 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1702 break;
1703
1704 case ir_unop_bit_not:
1705 inst = emit(NOT(result_dst, op[0]));
1706 break;
1707 case ir_binop_bit_and:
1708 inst = emit(AND(result_dst, op[0], op[1]));
1709 break;
1710 case ir_binop_bit_xor:
1711 inst = emit(XOR(result_dst, op[0], op[1]));
1712 break;
1713 case ir_binop_bit_or:
1714 inst = emit(OR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_lshift:
1718 inst = emit(SHL(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_rshift:
1722 if (ir->type->base_type == GLSL_TYPE_INT)
1723 inst = emit(ASR(result_dst, op[0], op[1]));
1724 else
1725 inst = emit(SHR(result_dst, op[0], op[1]));
1726 break;
1727
1728 case ir_binop_bfm:
1729 emit(BFI1(result_dst, op[0], op[1]));
1730 break;
1731
1732 case ir_binop_ubo_load: {
1733 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1734 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1735 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1736 src_reg offset;
1737
1738 /* Now, load the vector from that offset. */
1739 assert(ir->type->is_vector() || ir->type->is_scalar());
1740
1741 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1742 packed_consts.type = result.type;
1743 src_reg surf_index;
1744
1745 if (const_uniform_block) {
1746 /* The block index is a constant, so just emit the binding table entry
1747 * as an immediate.
1748 */
1749 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1750 const_uniform_block->value.u[0]);
1751 } else {
1752 /* The block index is not a constant. Evaluate the index expression
1753 * per-channel and add the base UBO index; the generator will select
1754 * a value from any live channel.
1755 */
1756 surf_index = src_reg(this, glsl_type::uint_type);
1757 emit(ADD(dst_reg(surf_index), op[0],
1758 src_reg(prog_data->base.binding_table.ubo_start)));
1759
1760 /* Assume this may touch any UBO. It would be nice to provide
1761 * a tighter bound, but the array information is already lowered away.
1762 */
1763 brw_mark_surface_used(&prog_data->base,
1764 prog_data->base.binding_table.ubo_start +
1765 shader_prog->NumUniformBlocks - 1);
1766 }
1767
1768 if (const_offset_ir) {
1769 if (brw->gen >= 8) {
1770 /* Store the offset in a GRF so we can send-from-GRF. */
1771 offset = src_reg(this, glsl_type::int_type);
1772 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1773 } else {
1774 /* Immediates are fine on older generations since they'll be moved
1775 * to a (potentially fake) MRF at the generator level.
1776 */
1777 offset = src_reg(const_offset / 16);
1778 }
1779 } else {
1780 offset = src_reg(this, glsl_type::uint_type);
1781 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1782 }
1783
1784 if (brw->gen >= 7) {
1785 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1786
1787 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1788 * Reserve space for the register.
1789 */
1790 if (brw->gen >= 9) {
1791 grf_offset.reg_offset++;
1792 alloc.sizes[grf_offset.reg] = 2;
1793 }
1794
1795 grf_offset.type = offset.type;
1796
1797 emit(MOV(grf_offset, offset));
1798
1799 vec4_instruction *pull =
1800 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1801 dst_reg(packed_consts),
1802 surf_index,
1803 src_reg(grf_offset)));
1804 pull->mlen = 1;
1805 } else {
1806 vec4_instruction *pull =
1807 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1808 dst_reg(packed_consts),
1809 surf_index,
1810 offset));
1811 pull->base_mrf = 14;
1812 pull->mlen = 1;
1813 }
1814
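/* The pull load fetched a whole 16-byte-aligned vec4, so swizzle every
 * enabled component to start at the dword the constant actually occupies
 * within that vec4 (const_offset % 16 / 4).
 */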
1815 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1816 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1817 const_offset % 16 / 4,
1818 const_offset % 16 / 4,
1819 const_offset % 16 / 4);
1820
1821 /* UBO bools are any nonzero int. We need to convert them to use the
1822 * value of true stored in ctx->Const.UniformBooleanTrue.
1823 */
1824 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1825 emit(CMP(result_dst, packed_consts, src_reg(0u),
1826 BRW_CONDITIONAL_NZ));
1827 } else {
1828 emit(MOV(result_dst, packed_consts));
1829 }
1830 break;
1831 }
1832
1833 case ir_binop_vector_extract:
1834 unreachable("should have been lowered by vec_index_to_cond_assign");
1835
1836 case ir_triop_fma:
1837 op[0] = fix_3src_operand(op[0]);
1838 op[1] = fix_3src_operand(op[1]);
1839 op[2] = fix_3src_operand(op[2]);
1840 /* Note that the instruction's argument order is reversed from GLSL
1841 * and the IR.
1842 */
1843 emit(MAD(result_dst, op[2], op[1], op[0]));
1844 break;
1845
1846 case ir_triop_lrp:
1847 emit_lrp(result_dst, op[0], op[1], op[2]);
1848 break;
1849
1850 case ir_triop_csel:
1851 unreachable("already handled above");
1852 break;
1853
1854 case ir_triop_bfi:
1855 op[0] = fix_3src_operand(op[0]);
1856 op[1] = fix_3src_operand(op[1]);
1857 op[2] = fix_3src_operand(op[2]);
1858 emit(BFI2(result_dst, op[0], op[1], op[2]));
1859 break;
1860
1861 case ir_triop_bitfield_extract:
1862 op[0] = fix_3src_operand(op[0]);
1863 op[1] = fix_3src_operand(op[1]);
1864 op[2] = fix_3src_operand(op[2]);
1865 /* Note that the instruction's argument order is reversed from GLSL
1866 * and the IR.
1867 */
1868 emit(BFE(result_dst, op[2], op[1], op[0]));
1869 break;
1870
1871 case ir_triop_vector_insert:
1872 unreachable("should have been lowered by lower_vector_insert");
1873
1874 case ir_quadop_bitfield_insert:
1875 unreachable("not reached: should be handled by "
1876 "bitfield_insert_to_bfm_bfi\n");
1877
1878 case ir_quadop_vector:
1879 unreachable("not reached: should be handled by lower_quadop_vector");
1880
1881 case ir_unop_pack_half_2x16:
1882 emit_pack_half_2x16(result_dst, op[0]);
1883 break;
1884 case ir_unop_unpack_half_2x16:
1885 emit_unpack_half_2x16(result_dst, op[0]);
1886 break;
1887 case ir_unop_unpack_unorm_4x8:
1888 emit_unpack_unorm_4x8(result_dst, op[0]);
1889 break;
1890 case ir_unop_unpack_snorm_4x8:
1891 emit_unpack_snorm_4x8(result_dst, op[0]);
1892 break;
1893 case ir_unop_pack_unorm_4x8:
1894 emit_pack_unorm_4x8(result_dst, op[0]);
1895 break;
1896 case ir_unop_pack_snorm_4x8:
1897 emit_pack_snorm_4x8(result_dst, op[0]);
1898 break;
1899 case ir_unop_pack_snorm_2x16:
1900 case ir_unop_pack_unorm_2x16:
1901 case ir_unop_unpack_snorm_2x16:
1902 case ir_unop_unpack_unorm_2x16:
1903 unreachable("not reached: should be handled by lower_packing_builtins");
1904 case ir_unop_unpack_half_2x16_split_x:
1905 case ir_unop_unpack_half_2x16_split_y:
1906 case ir_binop_pack_half_2x16_split:
1907 case ir_unop_interpolate_at_centroid:
1908 case ir_binop_interpolate_at_sample:
1909 case ir_binop_interpolate_at_offset:
1910 unreachable("not reached: should not occur in vertex shader");
1911 case ir_binop_ldexp:
1912 unreachable("not reached: should be handled by ldexp_to_arith()");
1913 case ir_unop_d2f:
1914 case ir_unop_f2d:
1915 case ir_unop_d2i:
1916 case ir_unop_i2d:
1917 case ir_unop_d2u:
1918 case ir_unop_u2d:
1919 case ir_unop_d2b:
1920 case ir_unop_pack_double_2x32:
1921 case ir_unop_unpack_double_2x32:
1922 case ir_unop_frexp_sig:
1923 case ir_unop_frexp_exp:
1924 unreachable("fp64 todo");
1925 }
1926 }
1927
1928
1929 void
1930 vec4_visitor::visit(ir_swizzle *ir)
1931 {
1932 src_reg src;
1933 int i = 0;
1934 int swizzle[4];
1935
1936 /* Note that this handles only swizzles in expressions, not those on the
1937 * left-hand side of an assignment, which use write masking instead. See
1938 * ir_assignment for that.
1939 */
1940
1941 ir->val->accept(this);
1942 src = this->result;
1943 assert(src.file != BAD_FILE);
1944
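/* Compose the IR swizzle with whatever swizzle the source value already
 * carries, one channel at a time.
 */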
1945 for (i = 0; i < ir->type->vector_elements; i++) {
1946 switch (i) {
1947 case 0:
1948 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1949 break;
1950 case 1:
1951 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1952 break;
1953 case 2:
1954 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1955 break;
1956 case 3:
1957 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1958 break;
1959 }
1960 }
1961 for (; i < 4; i++) {
1962 /* Replicate the last channel out. */
1963 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1964 }
1965
1966 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1967
1968 this->result = src;
1969 }
1970
1971 void
1972 vec4_visitor::visit(ir_dereference_variable *ir)
1973 {
1974 const struct glsl_type *type = ir->type;
1975 dst_reg *reg = variable_storage(ir->var);
1976
1977 if (!reg) {
1978 fail("Failed to find variable storage for %s\n", ir->var->name);
1979 this->result = src_reg(brw_null_reg());
1980 return;
1981 }
1982
1983 this->result = src_reg(*reg);
1984
1985 /* System values get their swizzle from the dst_reg writemask */
1986 if (ir->var->data.mode == ir_var_system_value)
1987 return;
1988
1989 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1990 this->result.swizzle = swizzle_for_size(type->vector_elements);
1991 }
1992
1993
1994 int
1995 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1996 {
1997 /* Under normal circumstances array elements are stored consecutively, so
1998 * the stride is equal to the size of the array element.
1999 */
2000 return type_size(ir->type);
2001 }
2002
2003
2004 void
2005 vec4_visitor::visit(ir_dereference_array *ir)
2006 {
2007 ir_constant *constant_index;
2008 src_reg src;
2009 int array_stride = compute_array_stride(ir);
2010
2011 constant_index = ir->array_index->constant_expression_value();
2012
2013 ir->array->accept(this);
2014 src = this->result;
2015
2016 if (constant_index) {
2017 src.reg_offset += constant_index->value.i[0] * array_stride;
2018 } else {
2019 /* Variable index array dereference. It eats the "vec4" of the
2020 * base of the array and an index that offsets the Mesa register
2021 * index.
2022 */
2023 ir->array_index->accept(this);
2024
2025 src_reg index_reg;
2026
2027 if (array_stride == 1) {
2028 index_reg = this->result;
2029 } else {
2030 index_reg = src_reg(this, glsl_type::int_type);
2031
2032 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2033 }
2034
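/* If the array base itself was variably indexed, fold its existing reladdr
 * into the index so the source carries a single combined relative offset.
 */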
2035 if (src.reladdr) {
2036 src_reg temp = src_reg(this, glsl_type::int_type);
2037
2038 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2039
2040 index_reg = temp;
2041 }
2042
2043 src.reladdr = ralloc(mem_ctx, src_reg);
2044 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2045 }
2046
2047 /* If the type is smaller than a vec4, replicate the last channel out. */
2048 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2049 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2050 else
2051 src.swizzle = BRW_SWIZZLE_NOOP;
2052 src.type = brw_type_for_base_type(ir->type);
2053
2054 this->result = src;
2055 }
2056
2057 void
2058 vec4_visitor::visit(ir_dereference_record *ir)
2059 {
2060 unsigned int i;
2061 const glsl_type *struct_type = ir->record->type;
2062 int offset = 0;
2063
2064 ir->record->accept(this);
2065
2066 for (i = 0; i < struct_type->length; i++) {
2067 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2068 break;
2069 offset += type_size(struct_type->fields.structure[i].type);
2070 }
2071
2072 /* If the type is smaller than a vec4, replicate the last channel out. */
2073 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2074 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2075 else
2076 this->result.swizzle = BRW_SWIZZLE_NOOP;
2077 this->result.type = brw_type_for_base_type(ir->type);
2078
2079 this->result.reg_offset += offset;
2080 }
2081
2082 /**
2083 * We want to be careful in assignment setup to hit the actual storage
2084 * instead of potentially using a temporary like we might with the
2085 * ir_dereference handler.
2086 */
2087 static dst_reg
2088 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2089 {
2090 /* The LHS must be a dereference. If the LHS is a variable indexed array
2091 * access of a vector, it must be separated into a series of conditional moves
2092 * before reaching this point (see ir_vec_index_to_cond_assign).
2093 */
2094 assert(ir->as_dereference());
2095 ir_dereference_array *deref_array = ir->as_dereference_array();
2096 if (deref_array) {
2097 assert(!deref_array->array->type->is_vector());
2098 }
2099
2100 /* Use the rvalue deref handler for the most part. We'll ignore any
2101 * swizzles in it, though, and express LHS swizzles via the writemask.
2102 */
2103 ir->accept(v);
2104 return dst_reg(v->result);
2105 }
2106
2107 void
2108 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2109 const struct glsl_type *type,
2110 enum brw_predicate predicate)
2111 {
2112 if (type->base_type == GLSL_TYPE_STRUCT) {
2113 for (unsigned int i = 0; i < type->length; i++) {
2114 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2115 }
2116 return;
2117 }
2118
2119 if (type->is_array()) {
2120 for (unsigned int i = 0; i < type->length; i++) {
2121 emit_block_move(dst, src, type->fields.array, predicate);
2122 }
2123 return;
2124 }
2125
2126 if (type->is_matrix()) {
2127 const struct glsl_type *vec_type;
2128
2129 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2130 type->vector_elements, 1);
2131
2132 for (int i = 0; i < type->matrix_columns; i++) {
2133 emit_block_move(dst, src, vec_type, predicate);
2134 }
2135 return;
2136 }
2137
2138 assert(type->is_scalar() || type->is_vector());
2139
2140 dst->type = brw_type_for_base_type(type);
2141 src->type = dst->type;
2142
2143 dst->writemask = (1 << type->vector_elements) - 1;
2144
2145 src->swizzle = swizzle_for_size(type->vector_elements);
2146
2147 vec4_instruction *inst = emit(MOV(*dst, *src));
2148 inst->predicate = predicate;
2149
2150 dst->reg_offset++;
2151 src->reg_offset++;
2152 }
2153
2154
2155 /* If the RHS processing resulted in an instruction generating a
2156 * temporary value, and it would be easy to rewrite the instruction to
2157 * generate its result right into the LHS instead, do so. This ends
2158 * up reliably removing instructions where it can be tricky to do so
2159 * later without real UD chain information.
2160 */
2161 bool
2162 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2163 dst_reg dst,
2164 src_reg src,
2165 vec4_instruction *pre_rhs_inst,
2166 vec4_instruction *last_rhs_inst)
2167 {
2168 /* This could be supported, but it would take more smarts. */
2169 if (ir->condition)
2170 return false;
2171
2172 if (pre_rhs_inst == last_rhs_inst)
2173 return false; /* No instructions generated to work with. */
2174
2175 /* Make sure the last instruction generated our source reg. */
2176 if (src.file != GRF ||
2177 src.file != last_rhs_inst->dst.file ||
2178 src.reg != last_rhs_inst->dst.reg ||
2179 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2180 src.reladdr ||
2181 src.abs ||
2182 src.negate ||
2183 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2184 return false;
2185
2186 /* Check that the last instruction fully initialized the channels
2187 * we want to use, in the order we want to use them. We could
2188 * potentially reswizzle the operands of many instructions so that
2189 * we could handle out-of-order channels, but we don't do that yet.
2190 */
2191
2192 for (unsigned i = 0; i < 4; i++) {
2193 if (dst.writemask & (1 << i)) {
2194 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2195 return false;
2196
2197 if (BRW_GET_SWZ(src.swizzle, i) != i)
2198 return false;
2199 }
2200 }
2201
2202 /* Success! Rewrite the instruction. */
2203 last_rhs_inst->dst.file = dst.file;
2204 last_rhs_inst->dst.reg = dst.reg;
2205 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2206 last_rhs_inst->dst.reladdr = dst.reladdr;
2207 last_rhs_inst->dst.writemask &= dst.writemask;
2208
2209 return true;
2210 }
2211
2212 void
2213 vec4_visitor::visit(ir_assignment *ir)
2214 {
2215 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2216 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2217
2218 if (!ir->lhs->type->is_scalar() &&
2219 !ir->lhs->type->is_vector()) {
2220 ir->rhs->accept(this);
2221 src_reg src = this->result;
2222
2223 if (ir->condition) {
2224 emit_bool_to_cond_code(ir->condition, &predicate);
2225 }
2226
2227 /* emit_block_move doesn't account for swizzles in the source register.
2228 * This should be ok, since the source register is a structure or an
2229 * array, and those can't be swizzled. But double-check to be sure.
2230 */
2231 assert(src.swizzle ==
2232 (ir->rhs->type->is_matrix()
2233 ? swizzle_for_size(ir->rhs->type->vector_elements)
2234 : BRW_SWIZZLE_NOOP));
2235
2236 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2237 return;
2238 }
2239
2240 /* Now we're down to just a scalar/vector with writemasks. */
2241 int i;
2242
2243 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2244 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2245
2246 ir->rhs->accept(this);
2247
2248 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2249
2250 src_reg src = this->result;
2251
2252 int swizzles[4];
2253 int first_enabled_chan = 0;
2254 int src_chan = 0;
2255
2256 assert(ir->lhs->type->is_vector() ||
2257 ir->lhs->type->is_scalar());
2258 dst.writemask = ir->write_mask;
2259
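/* Note the swizzle component of the first channel actually written; the
 * unwritten channels are filled with it below so they only ever reference
 * channels the RHS really produced.
 */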
2260 for (int i = 0; i < 4; i++) {
2261 if (dst.writemask & (1 << i)) {
2262 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2263 break;
2264 }
2265 }
2266
2267 /* Swizzle a small RHS vector into the channels being written.
2268 *
2269 * GLSL IR treats write_mask as dictating how many channels are
2270 * present on the RHS, while in our instructions we need to make
2271 * those channels appear in the slots of the vec4 they're written to.
2272 */
2273 for (int i = 0; i < 4; i++) {
2274 if (dst.writemask & (1 << i))
2275 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2276 else
2277 swizzles[i] = first_enabled_chan;
2278 }
2279 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2280 swizzles[2], swizzles[3]);
2281
2282 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2283 return;
2284 }
2285
2286 if (ir->condition) {
2287 emit_bool_to_cond_code(ir->condition, &predicate);
2288 }
2289
2290 for (i = 0; i < type_size(ir->lhs->type); i++) {
2291 vec4_instruction *inst = emit(MOV(dst, src));
2292 inst->predicate = predicate;
2293
2294 dst.reg_offset++;
2295 src.reg_offset++;
2296 }
2297 }
2298
2299 void
2300 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2301 {
2302 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2303 foreach_in_list(ir_constant, field_value, &ir->components) {
2304 emit_constant_values(dst, field_value);
2305 }
2306 return;
2307 }
2308
2309 if (ir->type->is_array()) {
2310 for (unsigned int i = 0; i < ir->type->length; i++) {
2311 emit_constant_values(dst, ir->array_elements[i]);
2312 }
2313 return;
2314 }
2315
2316 if (ir->type->is_matrix()) {
2317 for (int i = 0; i < ir->type->matrix_columns; i++) {
2318 float *vec = &ir->value.f[i * ir->type->vector_elements];
2319
2320 for (int j = 0; j < ir->type->vector_elements; j++) {
2321 dst->writemask = 1 << j;
2322 dst->type = BRW_REGISTER_TYPE_F;
2323
2324 emit(MOV(*dst, src_reg(vec[j])));
2325 }
2326 dst->reg_offset++;
2327 }
2328 return;
2329 }
2330
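/* Scalar/vector constants: emit one MOV per distinct component value, using
 * the writemask to cover all components that share that value.
 */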
2331 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2332
2333 for (int i = 0; i < ir->type->vector_elements; i++) {
2334 if (!(remaining_writemask & (1 << i)))
2335 continue;
2336
2337 dst->writemask = 1 << i;
2338 dst->type = brw_type_for_base_type(ir->type);
2339
2340 /* Find other components that match the one we're about to
2341 * write. Emits fewer instructions for things like vec4(0.5,
2342 * 1.5, 1.5, 1.5).
2343 */
2344 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2345 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2346 if (ir->value.b[i] == ir->value.b[j])
2347 dst->writemask |= (1 << j);
2348 } else {
2349 /* u, i, and f storage all line up, so no need for a
2350 * switch case for comparing each type.
2351 */
2352 if (ir->value.u[i] == ir->value.u[j])
2353 dst->writemask |= (1 << j);
2354 }
2355 }
2356
2357 switch (ir->type->base_type) {
2358 case GLSL_TYPE_FLOAT:
2359 emit(MOV(*dst, src_reg(ir->value.f[i])));
2360 break;
2361 case GLSL_TYPE_INT:
2362 emit(MOV(*dst, src_reg(ir->value.i[i])));
2363 break;
2364 case GLSL_TYPE_UINT:
2365 emit(MOV(*dst, src_reg(ir->value.u[i])));
2366 break;
2367 case GLSL_TYPE_BOOL:
2368 emit(MOV(*dst,
2369 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2370 : 0)));
2371 break;
2372 default:
2373 unreachable("Non-float/uint/int/bool constant");
2374 }
2375
2376 remaining_writemask &= ~dst->writemask;
2377 }
2378 dst->reg_offset++;
2379 }
2380
2381 void
2382 vec4_visitor::visit(ir_constant *ir)
2383 {
2384 dst_reg dst = dst_reg(this, ir->type);
2385 this->result = src_reg(dst);
2386
2387 emit_constant_values(&dst, ir);
2388 }
2389
2390 void
2391 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2392 {
2393 ir_dereference *deref = static_cast<ir_dereference *>(
2394 ir->actual_parameters.get_head());
2395 ir_variable *location = deref->variable_referenced();
2396 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2397 location->data.binding);
2398
2399 /* Calculate the surface offset */
2400 src_reg offset(this, glsl_type::uint_type);
2401 ir_dereference_array *deref_array = deref->as_dereference_array();
2402 if (deref_array) {
2403 deref_array->array_index->accept(this);
2404
2405 src_reg tmp(this, glsl_type::uint_type);
2406 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2407 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2408 } else {
2409 offset = location->data.atomic.offset;
2410 }
2411
2412 /* Emit the appropriate machine instruction */
2413 const char *callee = ir->callee->function_name();
2414 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2415
2416 if (!strcmp("__intrinsic_atomic_read", callee)) {
2417 emit_untyped_surface_read(surf_index, dst, offset);
2418
2419 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2420 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2421 src_reg(), src_reg());
2422
2423 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2424 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2425 src_reg(), src_reg());
2426 }
2427 }
2428
2429 void
2430 vec4_visitor::visit(ir_call *ir)
2431 {
2432 const char *callee = ir->callee->function_name();
2433
2434 if (!strcmp("__intrinsic_atomic_read", callee) ||
2435 !strcmp("__intrinsic_atomic_increment", callee) ||
2436 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 visit_atomic_counter_intrinsic(ir);
2438 } else {
2439 unreachable("Unsupported intrinsic.");
2440 }
2441 }
2442
2443 src_reg
2444 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2445 {
2446 vec4_instruction *inst =
2447 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2448 dst_reg(this, glsl_type::uvec4_type));
2449 inst->base_mrf = 2;
2450 inst->mlen = 1;
2451 inst->src[1] = sampler;
2452
2453 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2454 int param_base = inst->base_mrf;
2455 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2456 int zero_mask = 0xf & ~coord_mask;
2457
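/* Write the coordinate into the low channels of the payload register and
 * zero out the remaining channels.
 */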
2458 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2459 coordinate));
2460
2461 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2462 src_reg(0)));
2463
2464 emit(inst);
2465 return src_reg(inst->dst);
2466 }
2467
2468 static bool
2469 is_high_sampler(struct brw_context *brw, src_reg sampler)
2470 {
2471 if (brw->gen < 8 && !brw->is_haswell)
2472 return false;
2473
2474 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2475 }
2476
2477 void
2478 vec4_visitor::visit(ir_texture *ir)
2479 {
2480 uint32_t sampler =
2481 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2482
2483 ir_rvalue *nonconst_sampler_index =
2484 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2485
2486 /* Handle non-constant sampler array indexing */
2487 src_reg sampler_reg;
2488 if (nonconst_sampler_index) {
2489 /* The highest sampler which may be used by this operation is
2490 * the last element of the array. Mark it here, because the generator
2491 * doesn't have enough information to determine the bound.
2492 */
2493 uint32_t array_size = ir->sampler->as_dereference_array()
2494 ->array->type->array_size();
2495
2496 uint32_t max_used = sampler + array_size - 1;
2497 if (ir->op == ir_tg4 && brw->gen < 8) {
2498 max_used += prog_data->base.binding_table.gather_texture_start;
2499 } else {
2500 max_used += prog_data->base.binding_table.texture_start;
2501 }
2502
2503 brw_mark_surface_used(&prog_data->base, max_used);
2504
2505 /* Emit code to evaluate the actual indexing expression */
2506 nonconst_sampler_index->accept(this);
2507 dst_reg temp(this, glsl_type::uint_type);
2508 emit(ADD(temp, this->result, src_reg(sampler)))
2509 ->force_writemask_all = true;
2510 sampler_reg = src_reg(temp);
2511 } else {
2512 /* Single sampler, or constant array index; the indexing expression
2513 * is just an immediate.
2514 */
2515 sampler_reg = src_reg(sampler);
2516 }
2517
2518 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2519 * emitting anything other than setting up the constant result.
2520 */
2521 if (ir->op == ir_tg4) {
2522 ir_constant *chan = ir->lod_info.component->as_constant();
2523 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2524 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2525 dst_reg result(this, ir->type);
2526 this->result = src_reg(result);
2527 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2528 return;
2529 }
2530 }
2531
2532 /* Should be lowered by do_lower_texture_projection */
2533 assert(!ir->projector);
2534
2535 /* Should be lowered */
2536 assert(!ir->offset || !ir->offset->type->is_array());
2537
2538 /* Generate code to compute all the subexpression trees. This has to be
2539 * done before loading any values into MRFs for the sampler message since
2540 * generating these values may involve SEND messages that need the MRFs.
2541 */
2542 src_reg coordinate;
2543 if (ir->coordinate) {
2544 ir->coordinate->accept(this);
2545 coordinate = this->result;
2546 }
2547
2548 src_reg shadow_comparitor;
2549 if (ir->shadow_comparitor) {
2550 ir->shadow_comparitor->accept(this);
2551 shadow_comparitor = this->result;
2552 }
2553
2554 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2555 src_reg offset_value;
2556 if (has_nonconstant_offset) {
2557 ir->offset->accept(this);
2558 offset_value = src_reg(this->result);
2559 }
2560
2561 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2562 src_reg lod, dPdx, dPdy, sample_index, mcs;
2563 switch (ir->op) {
2564 case ir_tex:
2565 lod = src_reg(0.0f);
2566 lod_type = glsl_type::float_type;
2567 break;
2568 case ir_txf:
2569 case ir_txl:
2570 case ir_txs:
2571 ir->lod_info.lod->accept(this);
2572 lod = this->result;
2573 lod_type = ir->lod_info.lod->type;
2574 break;
2575 case ir_query_levels:
2576 lod = src_reg(0);
2577 lod_type = glsl_type::int_type;
2578 break;
2579 case ir_txf_ms:
2580 ir->lod_info.sample_index->accept(this);
2581 sample_index = this->result;
2582 sample_index_type = ir->lod_info.sample_index->type;
2583
2584 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2585 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2586 else
2587 mcs = src_reg(0u);
2588 break;
2589 case ir_txd:
2590 ir->lod_info.grad.dPdx->accept(this);
2591 dPdx = this->result;
2592
2593 ir->lod_info.grad.dPdy->accept(this);
2594 dPdy = this->result;
2595
2596 lod_type = ir->lod_info.grad.dPdx->type;
2597 break;
2598 case ir_txb:
2599 case ir_lod:
2600 case ir_tg4:
2601 break;
2602 }
2603
2604 enum opcode opcode;
2605 switch (ir->op) {
2606 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2607 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2608 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2609 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2610 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2611 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2612 case ir_tg4: opcode = has_nonconstant_offset
2613 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2614 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2615 case ir_txb:
2616 unreachable("TXB is not valid for vertex shaders.");
2617 case ir_lod:
2618 unreachable("LOD is not valid for vertex shaders.");
2619 default:
2620 unreachable("Unrecognized tex op");
2621 }
2622
2623 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2624 opcode, dst_reg(this, ir->type));
2625
2626 if (ir->offset != NULL && !has_nonconstant_offset) {
2627 inst->offset =
2628 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2629 ir->offset->type->vector_elements);
2630 }
2631
2632 /* Stuff the channel select bits in the top of the texture offset */
2633 if (ir->op == ir_tg4)
2634 inst->offset |= gather_channel(ir, sampler) << 16;
2635
2636 /* The message header is necessary for:
2637 * - Gen4 (always)
2638 * - Gen9+ for selecting SIMD4x2
2639 * - Texel offsets
2640 * - Gather channel selection
2641 * - Sampler indices too large to fit in a 4-bit value.
2642 */
2643 inst->header_present =
2644 brw->gen < 5 || brw->gen >= 9 ||
2645 inst->offset != 0 || ir->op == ir_tg4 ||
2646 is_high_sampler(brw, sampler_reg);
2647 inst->base_mrf = 2;
2648 inst->mlen = inst->header_present + 1; /* always at least one */
2649 inst->dst.writemask = WRITEMASK_XYZW;
2650 inst->shadow_compare = ir->shadow_comparitor != NULL;
2651
2652 inst->src[1] = sampler_reg;
2653
2654 /* MRF for the first parameter */
2655 int param_base = inst->base_mrf + inst->header_present;
2656
2657 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2658 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2659 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2660 } else {
2661 /* Load the coordinate */
2662 /* FINISHME: gl_clamp_mask and saturate */
2663 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2664 int zero_mask = 0xf & ~coord_mask;
2665
2666 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2667 coordinate));
2668
2669 if (zero_mask != 0) {
2670 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2671 src_reg(0)));
2672 }
2673 /* Load the shadow comparitor */
2674 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2675 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2676 WRITEMASK_X),
2677 shadow_comparitor));
2678 inst->mlen++;
2679 }
2680
2681 /* Load the LOD info */
2682 if (ir->op == ir_tex || ir->op == ir_txl) {
2683 int mrf, writemask;
2684 if (brw->gen >= 5) {
2685 mrf = param_base + 1;
2686 if (ir->shadow_comparitor) {
2687 writemask = WRITEMASK_Y;
2688 /* mlen already incremented */
2689 } else {
2690 writemask = WRITEMASK_X;
2691 inst->mlen++;
2692 }
2693 } else /* brw->gen == 4 */ {
2694 mrf = param_base;
2695 writemask = WRITEMASK_W;
2696 }
2697 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2698 } else if (ir->op == ir_txf) {
2699 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2700 } else if (ir->op == ir_txf_ms) {
2701 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2702 sample_index));
2703 if (brw->gen >= 7) {
2704 /* MCS data is in the first channel of `mcs`, but we need to get it into
2705 * the .y channel of the second vec4 of params, so replicate .x across
2706 * the whole vec4 and then mask off everything except .y
2707 */
2708 mcs.swizzle = BRW_SWIZZLE_XXXX;
2709 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2710 mcs));
2711 }
2712 inst->mlen++;
2713 } else if (ir->op == ir_txd) {
2714 const glsl_type *type = lod_type;
2715
2716 if (brw->gen >= 5) {
2717 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2718 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2719 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2720 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2721 inst->mlen++;
2722
2723 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2724 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2725 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2726 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2727 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2728 inst->mlen++;
2729
2730 if (ir->shadow_comparitor) {
2731 emit(MOV(dst_reg(MRF, param_base + 2,
2732 ir->shadow_comparitor->type, WRITEMASK_Z),
2733 shadow_comparitor));
2734 }
2735 }
2736 } else /* brw->gen == 4 */ {
2737 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2738 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2739 inst->mlen += 2;
2740 }
2741 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2742 if (ir->shadow_comparitor) {
2743 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2744 shadow_comparitor));
2745 }
2746
2747 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2748 offset_value));
2749 inst->mlen++;
2750 }
2751 }
2752
2753 emit(inst);
2754
2755 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2756 * faces * layers, but the spec requires just layers.
2757 */
2758 if (ir->op == ir_txs) {
2759 glsl_type const *type = ir->sampler->type;
2760 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2761 type->sampler_array) {
2762 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2763 writemask(inst->dst, WRITEMASK_Z),
2764 src_reg(inst->dst), src_reg(6));
2765 }
2766 }
2767
2768 if (brw->gen == 6 && ir->op == ir_tg4) {
2769 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2770 }
2771
2772 swizzle_result(ir, src_reg(inst->dst), sampler);
2773 }
2774
2775 /**
2776 * Apply workarounds for Gen6 gather with UINT/SINT
2777 */
2778 void
2779 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2780 {
2781 if (!wa)
2782 return;
2783
2784 int width = (wa & WA_8BIT) ? 8 : 16;
2785 dst_reg dst_f = dst;
2786 dst_f.type = BRW_REGISTER_TYPE_F;
2787
2788 /* Convert from UNORM to UINT */
2789 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2790 emit(MOV(dst, src_reg(dst_f)));
2791
2792 if (wa & WA_SIGN) {
2793 /* Reinterpret the UINT value as a signed INT value by
2794 * shifting the sign bit into place, then shifting back
2795 * preserving sign.
2796 */
2797 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2798 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2799 }
2800 }
2801
2802 /**
2803 * Set up the gather channel based on the swizzle, for gather4.
2804 */
2805 uint32_t
2806 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2807 {
2808 ir_constant *chan = ir->lod_info.component->as_constant();
2809 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2810 switch (swiz) {
2811 case SWIZZLE_X: return 0;
2812 case SWIZZLE_Y:
2813 /* gather4 sampler is broken for green channel on RG32F --
2814 * we must ask for blue instead.
2815 */
2816 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2817 return 2;
2818 return 1;
2819 case SWIZZLE_Z: return 2;
2820 case SWIZZLE_W: return 3;
2821 default:
2822 unreachable("Not reached"); /* zero, one swizzles handled already */
2823 }
2824 }
2825
2826 void
2827 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2828 {
2829 int s = key->tex.swizzles[sampler];
2830
2831 this->result = src_reg(this, ir->type);
2832 dst_reg swizzled_result(this->result);
2833
2834 if (ir->op == ir_query_levels) {
2835 /* # levels is in .w */
2836 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2837 emit(MOV(swizzled_result, orig_val));
2838 return;
2839 }
2840
2841 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2842 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2843 emit(MOV(swizzled_result, orig_val));
2844 return;
2845 }
2846
2847
2848 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2849 int swizzle[4] = {0};
2850
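/* Partition the GL texture swizzle into channels copied from the sampler
 * result and channels forced to constant zero or one.
 */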
2851 for (int i = 0; i < 4; i++) {
2852 switch (GET_SWZ(s, i)) {
2853 case SWIZZLE_ZERO:
2854 zero_mask |= (1 << i);
2855 break;
2856 case SWIZZLE_ONE:
2857 one_mask |= (1 << i);
2858 break;
2859 default:
2860 copy_mask |= (1 << i);
2861 swizzle[i] = GET_SWZ(s, i);
2862 break;
2863 }
2864 }
2865
2866 if (copy_mask) {
2867 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2868 swizzled_result.writemask = copy_mask;
2869 emit(MOV(swizzled_result, orig_val));
2870 }
2871
2872 if (zero_mask) {
2873 swizzled_result.writemask = zero_mask;
2874 emit(MOV(swizzled_result, src_reg(0.0f)));
2875 }
2876
2877 if (one_mask) {
2878 swizzled_result.writemask = one_mask;
2879 emit(MOV(swizzled_result, src_reg(1.0f)));
2880 }
2881 }
2882
2883 void
2884 vec4_visitor::visit(ir_return *)
2885 {
2886 unreachable("not reached");
2887 }
2888
2889 void
2890 vec4_visitor::visit(ir_discard *)
2891 {
2892 unreachable("not reached");
2893 }
2894
2895 void
2896 vec4_visitor::visit(ir_if *ir)
2897 {
2898 /* Don't point the annotation at the if statement, because then it plus
2899 * the then and else blocks get printed.
2900 */
2901 this->base_ir = ir->condition;
2902
2903 if (brw->gen == 6) {
2904 emit_if_gen6(ir);
2905 } else {
2906 enum brw_predicate predicate;
2907 emit_bool_to_cond_code(ir->condition, &predicate);
2908 emit(IF(predicate));
2909 }
2910
2911 visit_instructions(&ir->then_instructions);
2912
2913 if (!ir->else_instructions.is_empty()) {
2914 this->base_ir = ir->condition;
2915 emit(BRW_OPCODE_ELSE);
2916
2917 visit_instructions(&ir->else_instructions);
2918 }
2919
2920 this->base_ir = ir->condition;
2921 emit(BRW_OPCODE_ENDIF);
2922 }
2923
2924 void
2925 vec4_visitor::visit(ir_emit_vertex *)
2926 {
2927 unreachable("not reached");
2928 }
2929
2930 void
2931 vec4_visitor::visit(ir_end_primitive *)
2932 {
2933 unreachable("not reached");
2934 }
2935
2936 void
2937 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2938 dst_reg dst, src_reg offset,
2939 src_reg src0, src_reg src1)
2940 {
2941 unsigned mlen = 0;
2942
2943 /* Set the atomic operation offset. */
2944 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2945 mlen++;
2946
2947 /* Set the atomic operation arguments. */
2948 if (src0.file != BAD_FILE) {
2949 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2950 mlen++;
2951 }
2952
2953 if (src1.file != BAD_FILE) {
2954 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2955 mlen++;
2956 }
2957
2958 /* Emit the instruction. Note that this maps to the normal SIMD8
2959 * untyped atomic message on Ivy Bridge, but that's OK because
2960 * unused channels will be masked out.
2961 */
2962 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2963 src_reg(atomic_op), src_reg(surf_index));
2964 inst->base_mrf = 0;
2965 inst->mlen = mlen;
2966 }
2967
2968 void
2969 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2970 src_reg offset)
2971 {
2972 /* Set the surface read offset. */
2973 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2974
2975 /* Emit the instruction. Note that this maps to the normal SIMD8
2976 * untyped surface read message, but that's OK because unused
2977 * channels will be masked out.
2978 */
2979 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2980 dst, src_reg(surf_index));
2981 inst->base_mrf = 0;
2982 inst->mlen = 1;
2983 }
2984
2985 void
2986 vec4_visitor::emit_ndc_computation()
2987 {
2988 /* Get the position */
2989 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2990
2991 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2992 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2993 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2994
2995 current_annotation = "NDC";
2996 dst_reg ndc_w = ndc;
2997 ndc_w.writemask = WRITEMASK_W;
2998 src_reg pos_w = pos;
2999 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3000 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3001
3002 dst_reg ndc_xyz = ndc;
3003 ndc_xyz.writemask = WRITEMASK_XYZ;
3004
3005 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3006 }
3007
3008 void
3009 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3010 {
3011 if (brw->gen < 6 &&
3012 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3013 key->userclip_active || brw->has_negative_rhw_bug)) {
3014 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3015 dst_reg header1_w = header1;
3016 header1_w.writemask = WRITEMASK_W;
3017
3018 emit(MOV(header1, 0u));
3019
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3021 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3022
3023 current_annotation = "Point size";
3024 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3025 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3026 }
3027
3028 if (key->userclip_active) {
3029 current_annotation = "Clipping flags";
3030 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3031 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3032
3033 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3034 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3035 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3036
3037 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3038 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3039 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3040 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3041 }
3042
3043 /* i965 clipping workaround:
3044 * 1) Test for -ve rhw
3045 * 2) If set,
3046 * set ndc = (0,0,0,0)
3047 * set ucp[6] = 1
3048 *
3049 * Later, clipping will detect ucp[6] and ensure the primitive is
3050 * clipped against all fixed planes.
3051 */
3052 if (brw->has_negative_rhw_bug) {
3053 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3054 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3055 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3056 vec4_instruction *inst;
3057 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3058 inst->predicate = BRW_PREDICATE_NORMAL;
3059 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3060 inst->predicate = BRW_PREDICATE_NORMAL;
3061 }
3062
3063 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3064 } else if (brw->gen < 6) {
3065 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3066 } else {
3067 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3068 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3069 dst_reg reg_w = reg;
3070 reg_w.writemask = WRITEMASK_W;
3071 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3072 }
3073 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3074 dst_reg reg_y = reg;
3075 reg_y.writemask = WRITEMASK_Y;
3076 reg_y.type = BRW_REGISTER_TYPE_D;
3077 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3078 }
3079 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3080 dst_reg reg_z = reg;
3081 reg_z.writemask = WRITEMASK_Z;
3082 reg_z.type = BRW_REGISTER_TYPE_D;
3083 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3084 }
3085 }
3086 }
3087
3088 void
3089 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3090 {
3091 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3092 *
3093 * "If a linked set of shaders forming the vertex stage contains no
3094 * static write to gl_ClipVertex or gl_ClipDistance, but the
3095 * application has requested clipping against user clip planes through
3096 * the API, then the coordinate written to gl_Position is used for
3097 * comparison against the user clip planes."
3098 *
3099 * This function is only called if the shader didn't write to
3100 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3101 * if the user wrote to it; otherwise we use gl_Position.
3102 */
3103 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3104 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3105 clip_vertex = VARYING_SLOT_POS;
3106 }
3107
3108 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3109 ++i) {
3110 reg.writemask = 1 << i;
3111 emit(DP4(reg,
3112 src_reg(output_reg[clip_vertex]),
3113 src_reg(this->userplane[i + offset])));
3114 }
3115 }
3116
3117 vec4_instruction *
3118 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3119 {
3120 assert(varying < VARYING_SLOT_MAX);
3121 reg.type = output_reg[varying].type;
3122 current_annotation = output_reg_annotation[varying];
3123 /* Copy the register, saturating if necessary */
3124 return emit(MOV(reg, src_reg(output_reg[varying])));
3125 }
3126
3127 void
3128 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3129 {
3130 reg.type = BRW_REGISTER_TYPE_F;
3131
3132 switch (varying) {
3133 case VARYING_SLOT_PSIZ:
3134 {
3135 /* PSIZ is always in slot 0, and is coupled with other flags. */
3136 current_annotation = "indices, point width, clip flags";
3137 emit_psiz_and_flags(reg);
3138 break;
3139 }
3140 case BRW_VARYING_SLOT_NDC:
3141 current_annotation = "NDC";
3142 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3143 break;
3144 case VARYING_SLOT_POS:
3145 current_annotation = "gl_Position";
3146 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3147 break;
3148 case VARYING_SLOT_EDGE:
3149 /* This is present when doing unfilled polygons. We're supposed to copy
3150 * the edge flag from the user-provided vertex array
3151 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3152 * of that attribute (starts as 1.0f). This is then used in clipping to
3153 * determine which edges should be drawn as wireframe.
3154 */
3155 current_annotation = "edge flag";
3156 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3157 glsl_type::float_type, WRITEMASK_XYZW))));
3158 break;
3159 case BRW_VARYING_SLOT_PAD:
3160 /* No need to write to this slot */
3161 break;
3162 case VARYING_SLOT_COL0:
3163 case VARYING_SLOT_COL1:
3164 case VARYING_SLOT_BFC0:
3165 case VARYING_SLOT_BFC1: {
3166 /* These built-in varyings are only supported in compatibility mode,
3167 * and we only support GS in core profile. So, this must be a vertex
3168 * shader.
3169 */
3170 assert(stage == MESA_SHADER_VERTEX);
3171 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3172 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3173 inst->saturate = true;
3174 break;
3175 }
3176
3177 default:
3178 emit_generic_urb_slot(reg, varying);
3179 break;
3180 }
3181 }
3182
3183 static int
3184 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3185 {
3186 if (brw->gen >= 6) {
3187 /* URB data written (does not include the message header reg) must
3188 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3189 * section 5.4.3.2.2: URB_INTERLEAVED.
3190 *
3191 * URB entries are allocated on a multiple of 1024 bits, so an
3192 * extra 128 bits written here to make the end align to 256 is
3193 * no problem.
3194 */
3195 if ((mlen % 2) != 1)
3196 mlen++;
3197 }
3198
3199 return mlen;
3200 }
3201
3202
3203 /**
3204 * Generates the VUE payload plus the necessary URB write instructions to
3205 * output it.
3206 *
3207 * The VUE layout is documented in Volume 2a.
3208 */
3209 void
3210 vec4_visitor::emit_vertex()
3211 {
3212 /* MRF 0 is reserved for the debugger, so start with message header
3213 * in MRF 1.
3214 */
3215 int base_mrf = 1;
3216 int mrf = base_mrf;
3217 /* In the process of generating our URB write message contents, we
3218 * may need to unspill a register or load from an array. Those
3219 * reads would use MRFs 14-15.
3220 */
3221 int max_usable_mrf = 13;
3222
3223 /* The following assertion verifies that max_usable_mrf causes an
3224 * even-numbered amount of URB write data, which will meet gen6's
3225 * requirements for length alignment.
3226 */
3227 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3228
3229 /* First mrf is the g0-based message header containing URB handles and
3230 * such.
3231 */
3232 emit_urb_write_header(mrf++);
3233
3234 if (brw->gen < 6) {
3235 emit_ndc_computation();
3236 }
3237
3238 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3239 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3240 current_annotation = "user clip distances";
3241
3242 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3243 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3244
3245 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3246 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3247 }
3248
3249 /* We may need to split this up into several URB writes, so do them in a
3250 * loop.
3251 */
3252 int slot = 0;
3253 bool complete = false;
3254 do {
3255 /* URB offset is in URB row increments, and each of our MRFs is half of
3256 * one of those, since we're doing interleaved writes.
3257 */
3258 int offset = slot / 2;
3259
3260 mrf = base_mrf + 1;
3261 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3262 emit_urb_slot(dst_reg(MRF, mrf++),
3263 prog_data->vue_map.slot_to_varying[slot]);
3264
3265 /* If this was max_usable_mrf, we can't fit anything more into this
3266 * URB WRITE.
3267 */
3268 if (mrf > max_usable_mrf) {
3269 slot++;
3270 break;
3271 }
3272 }
3273
3274 complete = slot >= prog_data->vue_map.num_slots;
3275 current_annotation = "URB write";
3276 vec4_instruction *inst = emit_urb_write_opcode(complete);
3277 inst->base_mrf = base_mrf;
3278 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3279 inst->offset += offset;
3280 } while(!complete);
3281 }
3282
3283
3284 src_reg
3285 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3286 src_reg *reladdr, int reg_offset)
3287 {
3288 /* Because we store the values to scratch interleaved like our
3289 * vertex data, we need to scale the vec4 index by 2.
3290 */
3291 int message_header_scale = 2;
3292
3293 /* Pre-gen6, the message header uses byte offsets instead of vec4
3294 * (16-byte) offset units.
3295 */
3296 if (brw->gen < 6)
3297 message_header_scale *= 16;
3298
3299 if (reladdr) {
3300 src_reg index = src_reg(this, glsl_type::int_type);
3301
3302 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3303 src_reg(reg_offset)));
3304 emit_before(block, inst, MUL(dst_reg(index), index,
3305 src_reg(message_header_scale)));
3306
3307 return index;
3308 } else {
3309 return src_reg(reg_offset * message_header_scale);
3310 }
3311 }
3312
3313 src_reg
3314 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3315 src_reg *reladdr, int reg_offset)
3316 {
3317 if (reladdr) {
3318 src_reg index = src_reg(this, glsl_type::int_type);
3319
3320 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3321 src_reg(reg_offset)));
3322
3323 /* Pre-gen6, the message header uses byte offsets instead of vec4
3324 * (16-byte) offset units.
3325 */
3326 if (brw->gen < 6) {
3327 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3328 }
3329
3330 return index;
3331 } else if (brw->gen >= 8) {
3332 /* Store the offset in a GRF so we can send-from-GRF. */
3333 src_reg offset = src_reg(this, glsl_type::int_type);
3334 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3335 return offset;
3336 } else {
3337 int message_header_scale = brw->gen < 6 ? 16 : 1;
3338 return src_reg(reg_offset * message_header_scale);
3339 }
3340 }
3341
3342 /**
3343 * Emits an instruction before @inst to load the value named by @orig_src
3344 * from scratch space at @base_offset to @temp.
3345 *
3346 * @base_offset is measured in 32-byte units (the size of a register).
3347 */
3348 void
3349 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3350 dst_reg temp, src_reg orig_src,
3351 int base_offset)
3352 {
3353 int reg_offset = base_offset + orig_src.reg_offset;
3354 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3355 reg_offset);
3356
3357 emit_before(block, inst, SCRATCH_READ(temp, index));
3358 }
3359
3360 /**
3361 * Emits an instruction after @inst to store the value to be written
3362 * to @orig_dst to scratch space at @base_offset, from @temp.
3363 *
3364 * @base_offset is measured in 32-byte units (the size of a register).
3365 */
3366 void
3367 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3368 int base_offset)
3369 {
3370 int reg_offset = base_offset + inst->dst.reg_offset;
3371 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3372 reg_offset);
3373
3374 /* Create a temporary register to store *inst's result in.
3375 *
3376 * We have to be careful in MOVing from our temporary result register in
3377 * the scratch write. If we swizzle from channels of the temporary that
3378 * weren't initialized, it will confuse live interval analysis, which will
3379 * make spilling fail to make progress.
3380 */
3381 src_reg temp = src_reg(this, glsl_type::vec4_type);
3382 temp.type = inst->dst.type;
3383 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3384 int swizzles[4];
3385 for (int i = 0; i < 4; i++)
3386 if (inst->dst.writemask & (1 << i))
3387 swizzles[i] = i;
3388 else
3389 swizzles[i] = first_writemask_chan;
3390 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3391 swizzles[2], swizzles[3]);
3392
3393 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3394 inst->dst.writemask));
3395 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3396 write->predicate = inst->predicate;
3397 write->ir = inst->ir;
3398 write->annotation = inst->annotation;
3399 inst->insert_after(block, write);
3400
3401 inst->dst.file = temp.file;
3402 inst->dst.reg = temp.reg;
3403 inst->dst.reg_offset = temp.reg_offset;
3404 inst->dst.reladdr = NULL;
3405 }
3406
3407 /**
3408 * We can't generally support array access in GRF space, because a
3409 * single instruction's destination can only span 2 contiguous
3410 * registers. So, we send all GRF arrays that get variable index
3411 * access to scratch space.
3412 */
3413 void
3414 vec4_visitor::move_grf_array_access_to_scratch()
3415 {
3416 int scratch_loc[this->alloc.count];
3417 memset(scratch_loc, -1, sizeof(scratch_loc));
3418
3419 /* First, calculate the set of virtual GRFs that need to be punted
3420 * to scratch due to having any array access on them, and where in
3421 * scratch.
3422 */
3423 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3424 if (inst->dst.file == GRF && inst->dst.reladdr &&
3425 scratch_loc[inst->dst.reg] == -1) {
3426 scratch_loc[inst->dst.reg] = c->last_scratch;
3427 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3428 }
3429
3430 for (int i = 0 ; i < 3; i++) {
3431 src_reg *src = &inst->src[i];
3432
3433 if (src->file == GRF && src->reladdr &&
3434 scratch_loc[src->reg] == -1) {
3435 scratch_loc[src->reg] = c->last_scratch;
3436 c->last_scratch += this->alloc.sizes[src->reg];
3437 }
3438 }
3439 }
3440
3441 /* Now, for anything that will be accessed through scratch, rewrite
3442 * it to load/store. Note that this is a _safe list walk, because
3443 * we may generate a new scratch_write instruction after the one
3444 * we're processing.
3445 */
3446 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3447 /* Set up the annotation tracking for new generated instructions. */
3448 base_ir = inst->ir;
3449 current_annotation = inst->annotation;
3450
3451 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3452 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3453 }
3454
3455 for (int i = 0 ; i < 3; i++) {
3456 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3457 continue;
3458
3459 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3460
3461 emit_scratch_read(block, inst, temp, inst->src[i],
3462 scratch_loc[inst->src[i].reg]);
3463
3464 inst->src[i].file = temp.file;
3465 inst->src[i].reg = temp.reg;
3466 inst->src[i].reg_offset = temp.reg_offset;
3467 inst->src[i].reladdr = NULL;
3468 }
3469 }
3470 }
3471
3472 /**
3473 * Emits an instruction before @inst to load the value named by @orig_src
3474 * from the pull constant buffer (surface) at @base_offset to @temp.
3475 */
3476 void
3477 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3478 dst_reg temp, src_reg orig_src,
3479 int base_offset)
3480 {
3481 int reg_offset = base_offset + orig_src.reg_offset;
3482 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3483 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3484 reg_offset);
3485 vec4_instruction *load;
3486
3487 if (brw->gen >= 7) {
3488 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3489
3490 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3491 * Reserve space for the register.
3492 */
3493 if (brw->gen >= 9) {
3494 grf_offset.reg_offset++;
3495 alloc.sizes[grf_offset.reg] = 2;
3496 }
3497
3498 grf_offset.type = offset.type;
3499 emit_before(block, inst, MOV(grf_offset, offset));
3500
3501 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3502 temp, index, src_reg(grf_offset));
3503 load->mlen = 1;
3504 } else {
3505 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3506 temp, index, offset);
3507 load->base_mrf = 14;
3508 load->mlen = 1;
3509 }
3510 emit_before(block, inst, load);
3511 }
3512
3513 /**
3514 * Implements array access of uniforms by inserting a
3515 * PULL_CONSTANT_LOAD instruction.
3516 *
3517 * Unlike temporary GRF array access (where we don't support it due to
3518 * the difficulty of doing relative addressing on instruction
3519 * destinations), we could potentially do array access of uniforms
3520 * that were loaded in GRF space as push constants. In real-world
3521 * usage we've seen, though, the arrays being used are always larger
3522 * than we could load as push constants, so just always move all
3523 * uniform array access out to a pull constant buffer.
3524 */
3525 void
3526 vec4_visitor::move_uniform_array_access_to_pull_constants()
3527 {
3528 int pull_constant_loc[this->uniforms];
3529 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3530 bool nested_reladdr;
3531
3532 /* Walk through and find array access of uniforms. Put a copy of that
3533 * uniform in the pull constant buffer.
3534 *
3535 * Note that we don't move constant-indexed accesses to arrays. No
3536 * testing has been done of the performance impact of this choice.
3537 */
3538 do {
3539 nested_reladdr = false;
3540
3541 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3542 for (int i = 0 ; i < 3; i++) {
3543 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3544 continue;
3545
3546 int uniform = inst->src[i].reg;
3547
3548 if (inst->src[i].reladdr->reladdr)
3549 nested_reladdr = true; /* will need another pass */
3550
3551 /* If this array isn't already present in the pull constant buffer,
3552 * add it.
3553 */
3554 if (pull_constant_loc[uniform] == -1) {
3555 const gl_constant_value **values =
3556 &stage_prog_data->param[uniform * 4];
3557
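/* pull_param is filled in scalar (dword) units while pull_constant_loc
 * tracks vec4 slots, hence the division by 4.
 */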
3558 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3559
3560 assert(uniform < uniform_array_size);
3561 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3562 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3563 = values[j];
3564 }
3565 }
3566
3567 /* Set up the annotation tracking for new generated instructions. */
3568 base_ir = inst->ir;
3569 current_annotation = inst->annotation;
3570
3571 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3572
3573 emit_pull_constant_load(block, inst, temp, inst->src[i],
3574 pull_constant_loc[uniform]);
3575
3576 inst->src[i].file = temp.file;
3577 inst->src[i].reg = temp.reg;
3578 inst->src[i].reg_offset = temp.reg_offset;
3579 inst->src[i].reladdr = NULL;
3580 }
3581 }
3582 } while (nested_reladdr);
3583
3584 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3585 * no need to track them as larger-than-vec4 objects. This will be
3586 * relied on in cutting out unused uniform vectors from push
3587 * constants.
3588 */
3589 split_uniform_registers();
3590 }
3591
3592 void
3593 vec4_visitor::resolve_ud_negate(src_reg *reg)
3594 {
3595 if (reg->type != BRW_REGISTER_TYPE_UD ||
3596 !reg->negate)
3597 return;
3598
3599 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3600 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3601 *reg = temp;
3602 }
3603
3604 /**
3605 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3606 *
3607 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3608 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3609 */
3610 void
3611 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3612 {
3613 assert(brw->gen <= 5);
3614
3615 if (!rvalue->type->is_boolean())
3616 return;
3617
3618 src_reg and_result = src_reg(this, rvalue->type);
3619 src_reg neg_result = src_reg(this, rvalue->type);
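/* AND with 1 keeps only the defined LSB; negating the resulting 0 or 1 then
 * yields the required 0 or ~0.
 */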
3620 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3621 emit(MOV(dst_reg(neg_result), negate(and_result)));
3622 *reg = neg_result;
3623 }
3624
3625 vec4_visitor::vec4_visitor(struct brw_context *brw,
3626 struct brw_vec4_compile *c,
3627 struct gl_program *prog,
3628 const struct brw_vue_prog_key *key,
3629 struct brw_vue_prog_data *prog_data,
3630 struct gl_shader_program *shader_prog,
3631 gl_shader_stage stage,
3632 void *mem_ctx,
3633 bool no_spills,
3634 shader_time_shader_type st_base,
3635 shader_time_shader_type st_written,
3636 shader_time_shader_type st_reset)
3637 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3638 c(c),
3639 key(key),
3640 prog_data(prog_data),
3641 sanity_param_count(0),
3642 fail_msg(NULL),
3643 first_non_payload_grf(0),
3644 need_all_constants_in_pull_buffer(false),
3645 no_spills(no_spills),
3646 st_base(st_base),
3647 st_written(st_written),
3648 st_reset(st_reset)
3649 {
3650 this->mem_ctx = mem_ctx;
3651 this->failed = false;
3652
3653 this->base_ir = NULL;
3654 this->current_annotation = NULL;
3655 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3656
3657 this->variable_ht = hash_table_ctor(0,
3658 hash_table_pointer_hash,
3659 hash_table_pointer_compare);
3660
3661 this->virtual_grf_start = NULL;
3662 this->virtual_grf_end = NULL;
3663 this->live_intervals = NULL;
3664
3665 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3666
3667 this->uniforms = 0;
3668
3669 /* Initialize uniform_array_size to at least 1 because the pre-gen6 VS
3670 * requires at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3671 */
3672 this->uniform_array_size = 1;
3673 if (prog_data) {
3674 this->uniform_array_size =
3675 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3676 }
3677
3678 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3679 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3680 }
3681
3682 vec4_visitor::~vec4_visitor()
3683 {
3684 hash_table_dtor(this->variable_ht);
3685 }
3686
3687
3688 void
3689 vec4_visitor::fail(const char *format, ...)
3690 {
3691 va_list va;
3692 char *msg;
3693
3694 if (failed)
3695 return;
3696
3697 failed = true;
3698
3699 va_start(va, format);
3700 msg = ralloc_vasprintf(mem_ctx, format, va);
3701 va_end(va);
3702 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3703
3704 this->fail_msg = msg;
3705
3706 if (debug_enabled) {
3707 fprintf(stderr, "%s", msg);
3708 }
3709 }
3710
3711 } /* namespace brw */