i965/vec4: Optimize unpackSnorm4x8().
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* Original Gen4 hardware does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
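/* Pre-Gen6, MATH is a message to the shared math unit, so record the
* MRF base and message length (one register per source present) for the
* generator to build the send from.
*/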
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
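/* These four bytes form a packed "vector float" (VF) immediate: in that
* 8-bit restricted-float encoding 0x00, 0x60, 0x70 and 0x78 are 0.0, 8.0,
* 16.0 and 24.0, so the type-converting MOV materializes the shift counts
* <0, 8, 16, 24> as unsigned integers.
*/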
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
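/* Unlike the unorm variant, read the bytes back as signed (type B) so the
* converting MOV produces values in [-128, 127] rather than [0, 255].
*/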
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
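/* unpackSnorm4x8() is defined as clamp(b / 127.0, -1.0, +1.0): the byte
* 0x80 scales to -128/127, slightly below -1.0, so clamp the result to
* [-1, 1] with a max followed by a min.
*/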
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::visit_instructions(const exec_list *list)
520 {
521 foreach_in_list(ir_instruction, ir, list) {
522 base_ir = ir;
523 ir->accept(this);
524 }
525 }
526
527
528 static int
529 type_size(const struct glsl_type *type)
530 {
531 unsigned int i;
532 int size;
533
534 switch (type->base_type) {
535 case GLSL_TYPE_UINT:
536 case GLSL_TYPE_INT:
537 case GLSL_TYPE_FLOAT:
538 case GLSL_TYPE_BOOL:
539 if (type->is_matrix()) {
540 return type->matrix_columns;
541 } else {
542 /* Regardless of the size of the vector, it gets a vec4. This is bad
543 * packing for things like floats, but otherwise arrays become a
544 * mess. Hopefully a later pass over the code can pack scalars
545 * down if appropriate.
546 */
547 return 1;
548 }
549 case GLSL_TYPE_ARRAY:
550 assert(type->length > 0);
551 return type_size(type->fields.array) * type->length;
552 case GLSL_TYPE_STRUCT:
553 size = 0;
554 for (i = 0; i < type->length; i++) {
555 size += type_size(type->fields.structure[i].type);
556 }
557 return size;
558 case GLSL_TYPE_SAMPLER:
559 /* Samplers take up no register space, since they're baked in at
560 * link time.
561 */
562 return 0;
563 case GLSL_TYPE_ATOMIC_UINT:
564 return 0;
565 case GLSL_TYPE_IMAGE:
566 case GLSL_TYPE_VOID:
567 case GLSL_TYPE_ERROR:
568 case GLSL_TYPE_INTERFACE:
569 unreachable("not reached");
570 }
571
572 return 0;
573 }
574
575 int
576 vec4_visitor::virtual_grf_alloc(int size)
577 {
578 if (virtual_grf_array_size <= virtual_grf_count) {
579 if (virtual_grf_array_size == 0)
580 virtual_grf_array_size = 16;
581 else
582 virtual_grf_array_size *= 2;
583 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
584 virtual_grf_array_size);
585 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
586 virtual_grf_array_size);
587 }
588 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
589 virtual_grf_reg_count += size;
590 virtual_grf_sizes[virtual_grf_count] = size;
591 return virtual_grf_count++;
592 }
593
594 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
595 {
596 init();
597
598 this->file = GRF;
599 this->reg = v->virtual_grf_alloc(type_size(type));
600
601 if (type->is_array() || type->is_record()) {
602 this->swizzle = BRW_SWIZZLE_NOOP;
603 } else {
604 this->swizzle = swizzle_for_size(type->vector_elements);
605 }
606
607 this->type = brw_type_for_base_type(type);
608 }
609
610 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
611 {
612 assert(size > 0);
613
614 init();
615
616 this->file = GRF;
617 this->reg = v->virtual_grf_alloc(type_size(type) * size);
618
619 this->swizzle = BRW_SWIZZLE_NOOP;
620
621 this->type = brw_type_for_base_type(type);
622 }
623
624 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->virtual_grf_alloc(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->writemask = WRITEMASK_XYZW;
633 } else {
634 this->writemask = (1 << type->vector_elements) - 1;
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 /* Our support for uniforms is piggy-backed on the struct
641 * gl_program, because that's where the values actually
642 * get stored, rather than in some global gl_shader_program uniform
643 * store.
644 */
645 void
646 vec4_visitor::setup_uniform_values(ir_variable *ir)
647 {
648 int namelen = strlen(ir->name);
649
650 /* The data for our (non-builtin) uniforms is stored in a series of
651 * gl_uniform_driver_storage structs for each subcomponent that
652 * glGetUniformLocation() could name. We know it's been set up in the same
653 * order we'd walk the type, so walk the list of storage and find anything
654 * with our name, or the prefix of a component that starts with our name.
655 */
656 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
657 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
658
659 if (strncmp(ir->name, storage->name, namelen) != 0 ||
660 (storage->name[namelen] != 0 &&
661 storage->name[namelen] != '.' &&
662 storage->name[namelen] != '[')) {
663 continue;
664 }
665
666 gl_constant_value *components = storage->storage;
667 unsigned vector_count = (MAX2(storage->array_elements, 1) *
668 storage->type->matrix_columns);
669
670 for (unsigned s = 0; s < vector_count; s++) {
671 assert(uniforms < uniform_array_size);
672 uniform_vector_size[uniforms] = storage->type->vector_elements;
673
674 int i;
675 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
676 stage_prog_data->param[uniforms * 4 + i] = components;
677 components++;
678 }
679 for (; i < 4; i++) {
680 static gl_constant_value zero = { 0.0 };
681 stage_prog_data->param[uniforms * 4 + i] = &zero;
682 }
683
684 uniforms++;
685 }
686 }
687 }
688
689 void
690 vec4_visitor::setup_uniform_clipplane_values()
691 {
692 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
693
694 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
695 assert(this->uniforms < uniform_array_size);
696 this->uniform_vector_size[this->uniforms] = 4;
697 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
698 this->userplane[i].type = BRW_REGISTER_TYPE_F;
699 for (int j = 0; j < 4; ++j) {
700 stage_prog_data->param[this->uniforms * 4 + j] =
701 (gl_constant_value *) &clip_planes[i][j];
702 }
703 ++this->uniforms;
704 }
705 }
706
707 /* Our support for builtin uniforms is even scarier than non-builtin.
708 * It sits on top of the PROG_STATE_VAR parameters that are
709 * automatically updated from GL context state.
710 */
711 void
712 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
713 {
714 const ir_state_slot *const slots = ir->get_state_slots();
715 assert(slots != NULL);
716
717 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
718 /* This state reference has already been set up by ir_to_mesa,
719 * but we'll get the same index back here. We can reference
720 * ParameterValues directly, since unlike brw_fs.cpp, we never
721 * add new state references during compile.
722 */
723 int index = _mesa_add_state_reference(this->prog->Parameters,
724 (gl_state_index *)slots[i].tokens);
725 gl_constant_value *values =
726 &this->prog->Parameters->ParameterValues[index][0];
727
728 assert(this->uniforms < uniform_array_size);
729 this->uniform_vector_size[this->uniforms] = 0;
730 /* Add each of the unique swizzled channels of the element.
731 * This will end up matching the size of the glsl_type of this field.
732 */
733 int last_swiz = -1;
734 for (unsigned int j = 0; j < 4; j++) {
735 int swiz = GET_SWZ(slots[i].swizzle, j);
736 last_swiz = swiz;
737
738 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
739 assert(this->uniforms < uniform_array_size);
740 if (swiz <= last_swiz)
741 this->uniform_vector_size[this->uniforms]++;
742 }
743 this->uniforms++;
744 }
745 }
746
747 dst_reg *
748 vec4_visitor::variable_storage(ir_variable *var)
749 {
750 return (dst_reg *)hash_table_find(this->variable_ht, var);
751 }
752
753 void
754 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
755 enum brw_predicate *predicate)
756 {
757 ir_expression *expr = ir->as_expression();
758
759 *predicate = BRW_PREDICATE_NORMAL;
760
761 if (expr && expr->operation != ir_binop_ubo_load) {
762 src_reg op[3];
763 vec4_instruction *inst;
764
765 assert(expr->get_num_operands() <= 3);
766 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
767 expr->operands[i]->accept(this);
768 op[i] = this->result;
769
770 resolve_ud_negate(&op[i]);
771 }
772
773 switch (expr->operation) {
774 case ir_unop_logic_not:
775 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
776 inst->conditional_mod = BRW_CONDITIONAL_Z;
777 break;
778
779 case ir_binop_logic_xor:
780 inst = emit(XOR(dst_null_d(), op[0], op[1]));
781 inst->conditional_mod = BRW_CONDITIONAL_NZ;
782 break;
783
784 case ir_binop_logic_or:
785 inst = emit(OR(dst_null_d(), op[0], op[1]));
786 inst->conditional_mod = BRW_CONDITIONAL_NZ;
787 break;
788
789 case ir_binop_logic_and:
790 inst = emit(AND(dst_null_d(), op[0], op[1]));
791 inst->conditional_mod = BRW_CONDITIONAL_NZ;
792 break;
793
794 case ir_unop_f2b:
795 if (brw->gen >= 6) {
796 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
797 } else {
798 inst = emit(MOV(dst_null_f(), op[0]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 }
801 break;
802
803 case ir_unop_i2b:
804 if (brw->gen >= 6) {
805 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
806 } else {
807 inst = emit(MOV(dst_null_d(), op[0]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 }
810 break;
811
812 case ir_binop_all_equal:
813 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
814 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
815 break;
816
817 case ir_binop_any_nequal:
818 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
819 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
820 break;
821
822 case ir_unop_any:
823 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
825 break;
826
827 case ir_binop_greater:
828 case ir_binop_gequal:
829 case ir_binop_less:
830 case ir_binop_lequal:
831 case ir_binop_equal:
832 case ir_binop_nequal:
833 emit(CMP(dst_null_d(), op[0], op[1],
834 brw_conditional_for_comparison(expr->operation)));
835 break;
836
837 case ir_triop_csel: {
838 /* Expand the boolean condition into the flag register. */
839 inst = emit(MOV(dst_null_d(), op[0]));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841
842 /* Select which boolean to return. */
843 dst_reg temp(this, expr->operands[1]->type);
844 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
845 inst->predicate = BRW_PREDICATE_NORMAL;
846
847 /* Expand the result to a condition code. */
848 inst = emit(MOV(dst_null_d(), src_reg(temp)));
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851 }
852
853 default:
854 unreachable("not reached");
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr && expr->operation != ir_binop_ubo_load) {
883 src_reg op[3];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 3);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 case ir_triop_csel: {
947 /* Expand the boolean condition into the flag register. */
948 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
949 inst->conditional_mod = BRW_CONDITIONAL_NZ;
950
951 /* Select which boolean to return. */
952 dst_reg temp(this, expr->operands[1]->type);
953 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
954 inst->predicate = BRW_PREDICATE_NORMAL;
955
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958 }
959
960 default:
961 unreachable("not reached");
962 }
963 return;
964 }
965
966 ir->condition->accept(this);
967
968 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
969 }
970
971 void
972 vec4_visitor::visit(ir_variable *ir)
973 {
974 dst_reg *reg = NULL;
975
976 if (variable_storage(ir))
977 return;
978
979 switch (ir->data.mode) {
980 case ir_var_shader_in:
981 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
982 break;
983
984 case ir_var_shader_out:
985 reg = new(mem_ctx) dst_reg(this, ir->type);
986
987 for (int i = 0; i < type_size(ir->type); i++) {
988 output_reg[ir->data.location + i] = *reg;
989 output_reg[ir->data.location + i].reg_offset = i;
990 output_reg[ir->data.location + i].type =
991 brw_type_for_base_type(ir->type->get_scalar_type());
992 output_reg_annotation[ir->data.location + i] = ir->name;
993 }
994 break;
995
996 case ir_var_auto:
997 case ir_var_temporary:
998 reg = new(mem_ctx) dst_reg(this, ir->type);
999 break;
1000
1001 case ir_var_uniform:
1002 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1003
1004 /* Thanks to the lower_ubo_reference pass, we will see only
1005 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1006 * variables, so no need for them to be in variable_ht.
1007 *
1008 * Some uniforms, such as samplers and atomic counters, have no actual
1009 * storage, so we should ignore them.
1010 */
1011 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1012 return;
1013
1014 /* Track how big the whole uniform variable is, in case we need to put a
1015 * copy of its data into pull constants for array access.
1016 */
1017 assert(this->uniforms < uniform_array_size);
1018 this->uniform_size[this->uniforms] = type_size(ir->type);
1019
1020 if (!strncmp(ir->name, "gl_", 3)) {
1021 setup_builtin_uniform_values(ir);
1022 } else {
1023 setup_uniform_values(ir);
1024 }
1025 break;
1026
1027 case ir_var_system_value:
1028 reg = make_reg_for_system_value(ir);
1029 break;
1030
1031 default:
1032 unreachable("not reached");
1033 }
1034
1035 reg->type = brw_type_for_base_type(ir->type);
1036 hash_table_insert(this->variable_ht, reg, ir);
1037 }
1038
1039 void
1040 vec4_visitor::visit(ir_loop *ir)
1041 {
1042 /* We don't want debugging output to print the whole body of the
1043 * loop as the annotation.
1044 */
1045 this->base_ir = NULL;
1046
1047 emit(BRW_OPCODE_DO);
1048
1049 visit_instructions(&ir->body_instructions);
1050
1051 emit(BRW_OPCODE_WHILE);
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_loop_jump *ir)
1056 {
1057 switch (ir->mode) {
1058 case ir_loop_jump::jump_break:
1059 emit(BRW_OPCODE_BREAK);
1060 break;
1061 case ir_loop_jump::jump_continue:
1062 emit(BRW_OPCODE_CONTINUE);
1063 break;
1064 }
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_function_signature *)
1070 {
1071 unreachable("not reached");
1072 }
1073
1074 void
1075 vec4_visitor::visit(ir_function *ir)
1076 {
1077 /* Ignore function bodies other than main() -- we shouldn't see calls to
1078 * them since they should all be inlined.
1079 */
1080 if (strcmp(ir->name, "main") == 0) {
1081 const ir_function_signature *sig;
1082 exec_list empty;
1083
1084 sig = ir->matching_signature(NULL, &empty, false);
1085
1086 assert(sig);
1087
1088 visit_instructions(&sig->body);
1089 }
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1];
1104 ir_expression *mul = ir->operands[0]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul) {
1107 nonmul = ir->operands[0];
1108 mul = ir->operands[1]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul)
1111 return false;
1112 }
1113
1114 nonmul->accept(this);
1115 src_reg src0 = fix_3src_operand(this->result);
1116
1117 mul->operands[0]->accept(this);
1118 src_reg src1 = fix_3src_operand(this->result);
1119
1120 mul->operands[1]->accept(this);
1121 src_reg src2 = fix_3src_operand(this->result);
1122
1123 this->result = src_reg(this, ir->type);
1124 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1125
1126 return true;
1127 }
1128
1129 bool
1130 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1131 {
1132 /* This optimization relies on CMP setting the destination to 0 when
1133 * false. Early hardware only sets the least significant bit, and
1134 * leaves the other bits undefined. So we can't use it.
1135 */
1136 if (brw->gen < 6)
1137 return false;
1138
1139 ir_expression *const cmp = ir->operands[0]->as_expression();
1140
1141 if (cmp == NULL)
1142 return false;
1143
1144 switch (cmp->operation) {
1145 case ir_binop_less:
1146 case ir_binop_greater:
1147 case ir_binop_lequal:
1148 case ir_binop_gequal:
1149 case ir_binop_equal:
1150 case ir_binop_nequal:
1151 break;
1152
1153 default:
1154 return false;
1155 }
1156
1157 cmp->operands[0]->accept(this);
1158 const src_reg cmp_src0 = this->result;
1159
1160 cmp->operands[1]->accept(this);
1161 const src_reg cmp_src1 = this->result;
1162
1163 this->result = src_reg(this, ir->type);
1164
1165 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1166 brw_conditional_for_comparison(cmp->operation)));
1167
1168 /* If the comparison is false, this->result will just happen to be zero.
1169 */
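/* With the inverted predicate, the SEL writes 1.0f on channels where the
* comparison passed and keeps the CMP result (all-zero bits, i.e. 0.0f)
* where it failed.
*/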
1170 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1171 this->result, src_reg(1.0f));
1172 inst->predicate = BRW_PREDICATE_NORMAL;
1173 inst->predicate_inverse = true;
1174
1175 return true;
1176 }
1177
1178 void
1179 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1180 src_reg src0, src_reg src1)
1181 {
1182 vec4_instruction *inst;
1183
1184 if (brw->gen >= 6) {
1185 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1186 inst->conditional_mod = conditionalmod;
1187 } else {
1188 emit(CMP(dst, src0, src1, conditionalmod));
1189
1190 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1191 inst->predicate = BRW_PREDICATE_NORMAL;
1192 }
1193 }
1194
1195 void
1196 vec4_visitor::emit_lrp(const dst_reg &dst,
1197 const src_reg &x, const src_reg &y, const src_reg &a)
1198 {
1199 if (brw->gen >= 6) {
1200 /* Note that the instruction's argument order is reversed from GLSL
1201 * and the IR.
1202 */
1203 emit(LRP(dst,
1204 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1205 } else {
1206 /* Earlier generations don't support three source operations, so we
1207 * need to emit x*(1-a) + y*a.
1208 */
1209 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1212 y_times_a.writemask = dst.writemask;
1213 one_minus_a.writemask = dst.writemask;
1214 x_times_one_minus_a.writemask = dst.writemask;
1215
1216 emit(MUL(y_times_a, y, a));
1217 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1218 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1219 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1220 }
1221 }
1222
1223 void
1224 vec4_visitor::visit(ir_expression *ir)
1225 {
1226 unsigned int operand;
1227 src_reg op[Elements(ir->operands)];
1228 vec4_instruction *inst;
1229
1230 if (ir->operation == ir_binop_add) {
1231 if (try_emit_mad(ir))
1232 return;
1233 }
1234
1235 if (ir->operation == ir_unop_b2f) {
1236 if (try_emit_b2f_of_compare(ir))
1237 return;
1238 }
1239
1240 /* Storage for our result. Ideally for an assignment we'd be using
1241 * the actual storage for the result here, instead.
1242 */
1243 dst_reg result_dst(this, ir->type);
1244 src_reg result_src(result_dst);
1245
1246 if (ir->operation == ir_triop_csel) {
1247 ir->operands[1]->accept(this);
1248 op[1] = this->result;
1249 ir->operands[2]->accept(this);
1250 op[2] = this->result;
1251
1252 enum brw_predicate predicate;
1253 emit_bool_to_cond_code(ir->operands[0], &predicate);
1254 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1255 inst->predicate = predicate;
1256 this->result = result_src;
1257 return;
1258 }
1259
1260 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1261 this->result.file = BAD_FILE;
1262 ir->operands[operand]->accept(this);
1263 if (this->result.file == BAD_FILE) {
1264 fprintf(stderr, "Failed to get tree for expression operand:\n");
1265 ir->operands[operand]->fprint(stderr);
1266 exit(1);
1267 }
1268 op[operand] = this->result;
1269
1270 /* Matrix expression operands should have been broken down to vector
1271 * operations already.
1272 */
1273 assert(!ir->operands[operand]->type->is_matrix());
1274 }
1275
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278
1279 switch (ir->operation) {
1280 case ir_unop_logic_not:
1281 if (ctx->Const.UniformBooleanTrue != 1) {
1282 emit(NOT(result_dst, op[0]));
1283 } else {
1284 emit(XOR(result_dst, op[0], src_reg(1u)));
1285 }
1286 break;
1287 case ir_unop_neg:
1288 op[0].negate = !op[0].negate;
1289 emit(MOV(result_dst, op[0]));
1290 break;
1291 case ir_unop_abs:
1292 op[0].abs = true;
1293 op[0].negate = false;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296
1297 case ir_unop_sign:
1298 if (ir->type->is_float()) {
1299 /* AND(val, 0x80000000) gives the sign bit.
1300 *
1301 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1302 * zero.
1303 */
1304 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1305
1306 op[0].type = BRW_REGISTER_TYPE_UD;
1307 result_dst.type = BRW_REGISTER_TYPE_UD;
1308 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1309
1310 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 this->result.type = BRW_REGISTER_TYPE_F;
1314 } else {
1315 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1316 * -> non-negative val generates 0x00000000.
1317 * Predicated OR sets 1 if val is positive.
1318 */
1319 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1320
1321 emit(ASR(result_dst, op[0], src_reg(31)));
1322
1323 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 }
1326 break;
1327
1328 case ir_unop_rcp:
1329 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1330 break;
1331
1332 case ir_unop_exp2:
1333 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1334 break;
1335 case ir_unop_log2:
1336 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1337 break;
1338 case ir_unop_exp:
1339 case ir_unop_log:
1340 unreachable("not reached: should be handled by ir_explog_to_explog2");
1341 case ir_unop_sin:
1342 case ir_unop_sin_reduced:
1343 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1344 break;
1345 case ir_unop_cos:
1346 case ir_unop_cos_reduced:
1347 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1348 break;
1349
1350 case ir_unop_dFdx:
1351 case ir_unop_dFdx_coarse:
1352 case ir_unop_dFdx_fine:
1353 case ir_unop_dFdy:
1354 case ir_unop_dFdy_coarse:
1355 case ir_unop_dFdy_fine:
1356 unreachable("derivatives not valid in vertex shader");
1357
1358 case ir_unop_bitfield_reverse:
1359 emit(BFREV(result_dst, op[0]));
1360 break;
1361 case ir_unop_bit_count:
1362 emit(CBIT(result_dst, op[0]));
1363 break;
1364 case ir_unop_find_msb: {
1365 src_reg temp = src_reg(this, glsl_type::uint_type);
1366
1367 inst = emit(FBH(dst_reg(temp), op[0]));
1368 inst->dst.writemask = WRITEMASK_XYZW;
1369
1370 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1371 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1372 * subtract the result from 31 to convert the MSB count into an LSB count.
1373 */
1374
1375 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1376 temp.swizzle = BRW_SWIZZLE_NOOP;
1377 emit(MOV(result_dst, temp));
1378
1379 src_reg src_tmp = src_reg(result_dst);
1380 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1381
1382 src_tmp.negate = true;
1383 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1384 inst->predicate = BRW_PREDICATE_NORMAL;
1385 break;
1386 }
1387 case ir_unop_find_lsb:
1388 emit(FBL(result_dst, op[0]));
1389 break;
1390 case ir_unop_saturate:
1391 inst = emit(MOV(result_dst, op[0]));
1392 inst->saturate = true;
1393 break;
1394
1395 case ir_unop_noise:
1396 unreachable("not reached: should be handled by lower_noise");
1397
1398 case ir_binop_add:
1399 emit(ADD(result_dst, op[0], op[1]));
1400 break;
1401 case ir_binop_sub:
1402 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1403
1404 case ir_binop_mul:
1405 if (brw->gen < 8 && ir->type->is_integer()) {
1406 /* For integer multiplication, the MUL uses the low 16 bits of one of
1407 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1408 * accumulates the contribution of the upper 16 bits of that
1409 * operand. If we can determine that one of the args is in the low
1410 * 16 bits, though, we can just emit a single MUL.
1411 */
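/* The checks below place a known 16-bit constant in the width-limited
* source slot (src0 before Gen7, src1 on Gen7 and later) so a single MUL
* suffices.
*/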
1412 if (ir->operands[0]->is_uint16_constant()) {
1413 if (brw->gen < 7)
1414 emit(MUL(result_dst, op[0], op[1]));
1415 else
1416 emit(MUL(result_dst, op[1], op[0]));
1417 } else if (ir->operands[1]->is_uint16_constant()) {
1418 if (brw->gen < 7)
1419 emit(MUL(result_dst, op[1], op[0]));
1420 else
1421 emit(MUL(result_dst, op[0], op[1]));
1422 } else {
1423 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1424
1425 emit(MUL(acc, op[0], op[1]));
1426 emit(MACH(dst_null_d(), op[0], op[1]));
1427 emit(MOV(result_dst, src_reg(acc)));
1428 }
1429 } else {
1430 emit(MUL(result_dst, op[0], op[1]));
1431 }
1432 break;
1433 case ir_binop_imul_high: {
1434 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1435
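/* MUL and MACH together form the full 64-bit product through the
* accumulator; MACH writes the high 32 bits, which is the imul_high
* result.
*/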
1436 emit(MUL(acc, op[0], op[1]));
1437 emit(MACH(result_dst, op[0], op[1]));
1438 break;
1439 }
1440 case ir_binop_div:
1441 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1442 assert(ir->type->is_integer());
1443 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1444 break;
1445 case ir_binop_carry: {
1446 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1447
1448 emit(ADDC(dst_null_ud(), op[0], op[1]));
1449 emit(MOV(result_dst, src_reg(acc)));
1450 break;
1451 }
1452 case ir_binop_borrow: {
1453 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1454
1455 emit(SUBB(dst_null_ud(), op[0], op[1]));
1456 emit(MOV(result_dst, src_reg(acc)));
1457 break;
1458 }
1459 case ir_binop_mod:
1460 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1461 assert(ir->type->is_integer());
1462 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1463 break;
1464
1465 case ir_binop_less:
1466 case ir_binop_greater:
1467 case ir_binop_lequal:
1468 case ir_binop_gequal:
1469 case ir_binop_equal:
1470 case ir_binop_nequal: {
1471 emit(CMP(result_dst, op[0], op[1],
1472 brw_conditional_for_comparison(ir->operation)));
1473 if (ctx->Const.UniformBooleanTrue == 1) {
1474 emit(AND(result_dst, result_src, src_reg(1u)));
1475 }
1476 break;
1477 }
1478
1479 case ir_binop_all_equal:
1480 /* "==" operator producing a scalar boolean. */
1481 if (ir->operands[0]->type->is_vector() ||
1482 ir->operands[1]->type->is_vector()) {
1483 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1484 emit(MOV(result_dst, src_reg(0)));
1485 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1486 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1487 } else {
1488 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1489 if (ctx->Const.UniformBooleanTrue == 1) {
1490 emit(AND(result_dst, result_src, src_reg(1u)));
1491 }
1492 }
1493 break;
1494 case ir_binop_any_nequal:
1495 /* "!=" operator producing a scalar boolean. */
1496 if (ir->operands[0]->type->is_vector() ||
1497 ir->operands[1]->type->is_vector()) {
1498 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1499
1500 emit(MOV(result_dst, src_reg(0)));
1501 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 } else {
1504 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1505 if (ctx->Const.UniformBooleanTrue == 1) {
1506 emit(AND(result_dst, result_src, src_reg(1u)));
1507 }
1508 }
1509 break;
1510
1511 case ir_unop_any:
1512 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1513 emit(MOV(result_dst, src_reg(0)));
1514
1515 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1516 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1517 break;
1518
1519 case ir_binop_logic_xor:
1520 emit(XOR(result_dst, op[0], op[1]));
1521 break;
1522
1523 case ir_binop_logic_or:
1524 emit(OR(result_dst, op[0], op[1]));
1525 break;
1526
1527 case ir_binop_logic_and:
1528 emit(AND(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_dot:
1532 assert(ir->operands[0]->type->is_vector());
1533 assert(ir->operands[0]->type == ir->operands[1]->type);
1534 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1535 break;
1536
1537 case ir_unop_sqrt:
1538 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1539 break;
1540 case ir_unop_rsq:
1541 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1542 break;
1543
1544 case ir_unop_bitcast_i2f:
1545 case ir_unop_bitcast_u2f:
1546 this->result = op[0];
1547 this->result.type = BRW_REGISTER_TYPE_F;
1548 break;
1549
1550 case ir_unop_bitcast_f2i:
1551 this->result = op[0];
1552 this->result.type = BRW_REGISTER_TYPE_D;
1553 break;
1554
1555 case ir_unop_bitcast_f2u:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_UD;
1558 break;
1559
1560 case ir_unop_i2f:
1561 case ir_unop_i2u:
1562 case ir_unop_u2i:
1563 case ir_unop_u2f:
1564 case ir_unop_f2i:
1565 case ir_unop_f2u:
1566 emit(MOV(result_dst, op[0]));
1567 break;
1568 case ir_unop_b2i:
1569 if (ctx->Const.UniformBooleanTrue != 1) {
1570 emit(AND(result_dst, op[0], src_reg(1u)));
1571 } else {
1572 emit(MOV(result_dst, op[0]));
1573 }
1574 break;
1575 case ir_unop_b2f:
1576 if (ctx->Const.UniformBooleanTrue != 1) {
1577 op[0].type = BRW_REGISTER_TYPE_UD;
1578 result_dst.type = BRW_REGISTER_TYPE_UD;
1579 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1580 result_dst.type = BRW_REGISTER_TYPE_F;
1581 } else {
1582 emit(MOV(result_dst, op[0]));
1583 }
1584 break;
1585 case ir_unop_f2b:
1586 case ir_unop_i2b:
1587 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1588 if (ctx->Const.UniformBooleanTrue == 1) {
1589 emit(AND(result_dst, result_src, src_reg(1u)));
1590 }
1591 break;
1592
1593 case ir_unop_trunc:
1594 emit(RNDZ(result_dst, op[0]));
1595 break;
1596 case ir_unop_ceil:
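/* ceil(x) is computed as -floor(-x): negate the source, round down, and
* negate the result.
*/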
1597 op[0].negate = !op[0].negate;
1598 inst = emit(RNDD(result_dst, op[0]));
1599 this->result.negate = true;
1600 break;
1601 case ir_unop_floor:
1602 inst = emit(RNDD(result_dst, op[0]));
1603 break;
1604 case ir_unop_fract:
1605 inst = emit(FRC(result_dst, op[0]));
1606 break;
1607 case ir_unop_round_even:
1608 emit(RNDE(result_dst, op[0]));
1609 break;
1610
1611 case ir_binop_min:
1612 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1613 break;
1614 case ir_binop_max:
1615 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1616 break;
1617
1618 case ir_binop_pow:
1619 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1620 break;
1621
1622 case ir_unop_bit_not:
1623 inst = emit(NOT(result_dst, op[0]));
1624 break;
1625 case ir_binop_bit_and:
1626 inst = emit(AND(result_dst, op[0], op[1]));
1627 break;
1628 case ir_binop_bit_xor:
1629 inst = emit(XOR(result_dst, op[0], op[1]));
1630 break;
1631 case ir_binop_bit_or:
1632 inst = emit(OR(result_dst, op[0], op[1]));
1633 break;
1634
1635 case ir_binop_lshift:
1636 inst = emit(SHL(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_rshift:
1640 if (ir->type->base_type == GLSL_TYPE_INT)
1641 inst = emit(ASR(result_dst, op[0], op[1]));
1642 else
1643 inst = emit(SHR(result_dst, op[0], op[1]));
1644 break;
1645
1646 case ir_binop_bfm:
1647 emit(BFI1(result_dst, op[0], op[1]));
1648 break;
1649
1650 case ir_binop_ubo_load: {
1651 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1652 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1653 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1654 src_reg offset;
1655
1656 /* Now, load the vector from that offset. */
1657 assert(ir->type->is_vector() || ir->type->is_scalar());
1658
1659 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1660 packed_consts.type = result.type;
1661 src_reg surf_index;
1662
1663 if (const_uniform_block) {
1664 /* The block index is a constant, so just emit the binding table entry
1665 * as an immediate.
1666 */
1667 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1668 const_uniform_block->value.u[0]);
1669 } else {
1670 /* The block index is not a constant. Evaluate the index expression
1671 * per-channel and add the base UBO index; the generator will select
1672 * a value from any live channel.
1673 */
1674 surf_index = src_reg(this, glsl_type::uint_type);
1675 emit(ADD(dst_reg(surf_index), op[0],
1676 src_reg(prog_data->base.binding_table.ubo_start)));
1677
1678 /* Assume this may touch any UBO. It would be nice to provide
1679 * a tighter bound, but the array information is already lowered away.
1680 */
1681 brw_mark_surface_used(&prog_data->base,
1682 prog_data->base.binding_table.ubo_start +
1683 shader_prog->NumUniformBlocks - 1);
1684 }
1685
1686 if (const_offset_ir) {
1687 if (brw->gen >= 8) {
1688 /* Store the offset in a GRF so we can send-from-GRF. */
1689 offset = src_reg(this, glsl_type::int_type);
1690 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1691 } else {
1692 /* Immediates are fine on older generations since they'll be moved
1693 * to a (potentially fake) MRF at the generator level.
1694 */
1695 offset = src_reg(const_offset / 16);
1696 }
1697 } else {
1698 offset = src_reg(this, glsl_type::uint_type);
1699 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1700 }
1701
1702 if (brw->gen >= 7) {
1703 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1704 grf_offset.type = offset.type;
1705
1706 emit(MOV(grf_offset, offset));
1707
1708 emit(new(mem_ctx) vec4_instruction(this,
1709 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1710 dst_reg(packed_consts),
1711 surf_index,
1712 src_reg(grf_offset)));
1713 } else {
1714 vec4_instruction *pull =
1715 emit(new(mem_ctx) vec4_instruction(this,
1716 VS_OPCODE_PULL_CONSTANT_LOAD,
1717 dst_reg(packed_consts),
1718 surf_index,
1719 offset));
1720 pull->base_mrf = 14;
1721 pull->mlen = 1;
1722 }
1723
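/* The pull constant load fetches a whole 16-byte-aligned vec4, so offset
* each swizzle channel by the dword position of const_offset within that
* vec4 to pick out the requested components.
*/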
1724 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1725 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1726 const_offset % 16 / 4,
1727 const_offset % 16 / 4,
1728 const_offset % 16 / 4);
1729
1730 /* UBO bools are any nonzero int. We need to convert them to use the
1731 * value of true stored in ctx->Const.UniformBooleanTrue.
1732 */
1733 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1734 emit(CMP(result_dst, packed_consts, src_reg(0u),
1735 BRW_CONDITIONAL_NZ));
1736 if (ctx->Const.UniformBooleanTrue == 1) {
1737 emit(AND(result_dst, result, src_reg(1u)));
1738 }
1739 } else {
1740 emit(MOV(result_dst, packed_consts));
1741 }
1742 break;
1743 }
1744
1745 case ir_binop_vector_extract:
1746 unreachable("should have been lowered by vec_index_to_cond_assign");
1747
1748 case ir_triop_fma:
1749 op[0] = fix_3src_operand(op[0]);
1750 op[1] = fix_3src_operand(op[1]);
1751 op[2] = fix_3src_operand(op[2]);
1752 /* Note that the instruction's argument order is reversed from GLSL
1753 * and the IR.
1754 */
1755 emit(MAD(result_dst, op[2], op[1], op[0]));
1756 break;
1757
1758 case ir_triop_lrp:
1759 emit_lrp(result_dst, op[0], op[1], op[2]);
1760 break;
1761
1762 case ir_triop_csel:
1763 unreachable("already handled above");
1764 break;
1765
1766 case ir_triop_bfi:
1767 op[0] = fix_3src_operand(op[0]);
1768 op[1] = fix_3src_operand(op[1]);
1769 op[2] = fix_3src_operand(op[2]);
1770 emit(BFI2(result_dst, op[0], op[1], op[2]));
1771 break;
1772
1773 case ir_triop_bitfield_extract:
1774 op[0] = fix_3src_operand(op[0]);
1775 op[1] = fix_3src_operand(op[1]);
1776 op[2] = fix_3src_operand(op[2]);
1777 /* Note that the instruction's argument order is reversed from GLSL
1778 * and the IR.
1779 */
1780 emit(BFE(result_dst, op[2], op[1], op[0]));
1781 break;
1782
1783 case ir_triop_vector_insert:
1784 unreachable("should have been lowered by lower_vector_insert");
1785
1786 case ir_quadop_bitfield_insert:
1787 unreachable("not reached: should be handled by "
1788 "bitfield_insert_to_bfm_bfi\n");
1789
1790 case ir_quadop_vector:
1791 unreachable("not reached: should be handled by lower_quadop_vector");
1792
1793 case ir_unop_pack_half_2x16:
1794 emit_pack_half_2x16(result_dst, op[0]);
1795 break;
1796 case ir_unop_unpack_half_2x16:
1797 emit_unpack_half_2x16(result_dst, op[0]);
1798 break;
1799 case ir_unop_unpack_unorm_4x8:
1800 emit_unpack_unorm_4x8(result_dst, op[0]);
1801 break;
1802 case ir_unop_unpack_snorm_4x8:
1803 emit_unpack_snorm_4x8(result_dst, op[0]);
1804 break;
1805 case ir_unop_pack_snorm_2x16:
1806 case ir_unop_pack_snorm_4x8:
1807 case ir_unop_pack_unorm_2x16:
1808 case ir_unop_pack_unorm_4x8:
1809 case ir_unop_unpack_snorm_2x16:
1810 case ir_unop_unpack_unorm_2x16:
1811 unreachable("not reached: should be handled by lower_packing_builtins");
1812 case ir_unop_unpack_half_2x16_split_x:
1813 case ir_unop_unpack_half_2x16_split_y:
1814 case ir_binop_pack_half_2x16_split:
1815 case ir_unop_interpolate_at_centroid:
1816 case ir_binop_interpolate_at_sample:
1817 case ir_binop_interpolate_at_offset:
1818 unreachable("not reached: should not occur in vertex shader");
1819 case ir_binop_ldexp:
1820 unreachable("not reached: should be handled by ldexp_to_arith()");
1821 }
1822 }
1823
1824
1825 void
1826 vec4_visitor::visit(ir_swizzle *ir)
1827 {
1828 src_reg src;
1829 int i = 0;
1830 int swizzle[4];
1831
1832 /* Note that this handles only swizzles in expressions, not those on the left
1833 * hand side of an assignment, which do write masking. See ir_assignment
1834 * for that.
1835 */
1836
1837 ir->val->accept(this);
1838 src = this->result;
1839 assert(src.file != BAD_FILE);
1840
1841 for (i = 0; i < ir->type->vector_elements; i++) {
1842 switch (i) {
1843 case 0:
1844 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1845 break;
1846 case 1:
1847 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1848 break;
1849 case 2:
1850 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1851 break;
1852 case 3:
1853 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1854 break;
1855 }
1856 }
1857 for (; i < 4; i++) {
1858 /* Replicate the last channel out. */
1859 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1860 }
1861
1862 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1863
1864 this->result = src;
1865 }
1866
1867 void
1868 vec4_visitor::visit(ir_dereference_variable *ir)
1869 {
1870 const struct glsl_type *type = ir->type;
1871 dst_reg *reg = variable_storage(ir->var);
1872
1873 if (!reg) {
1874 fail("Failed to find variable storage for %s\n", ir->var->name);
1875 this->result = src_reg(brw_null_reg());
1876 return;
1877 }
1878
1879 this->result = src_reg(*reg);
1880
1881 /* System values get their swizzle from the dst_reg writemask */
1882 if (ir->var->data.mode == ir_var_system_value)
1883 return;
1884
1885 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1886 this->result.swizzle = swizzle_for_size(type->vector_elements);
1887 }
1888
1889
1890 int
1891 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1892 {
1893 /* Under normal circumstances array elements are stored consecutively, so
1894 * the stride is equal to the size of the array element.
1895 */
1896 return type_size(ir->type);
1897 }
1898
1899
1900 void
1901 vec4_visitor::visit(ir_dereference_array *ir)
1902 {
1903 ir_constant *constant_index;
1904 src_reg src;
1905 int array_stride = compute_array_stride(ir);
1906
1907 constant_index = ir->array_index->constant_expression_value();
1908
1909 ir->array->accept(this);
1910 src = this->result;
1911
1912 if (constant_index) {
1913 src.reg_offset += constant_index->value.i[0] * array_stride;
1914 } else {
1915 /* Variable index array dereference. It eats the "vec4" of the
1916 * base of the array and an index that offsets the Mesa register
1917 * index.
1918 */
1919 ir->array_index->accept(this);
1920
1921 src_reg index_reg;
1922
1923 if (array_stride == 1) {
1924 index_reg = this->result;
1925 } else {
1926 index_reg = src_reg(this, glsl_type::int_type);
1927
1928 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1929 }
1930
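/* If the array base was itself reached through a variable index, fold the
 * two indices into a single relative-addressing register.
 */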
1931 if (src.reladdr) {
1932 src_reg temp = src_reg(this, glsl_type::int_type);
1933
1934 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1935
1936 index_reg = temp;
1937 }
1938
1939 src.reladdr = ralloc(mem_ctx, src_reg);
1940 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1941 }
1942
1943 /* If the type is smaller than a vec4, replicate the last channel out. */
1944 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1945 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1946 else
1947 src.swizzle = BRW_SWIZZLE_NOOP;
1948 src.type = brw_type_for_base_type(ir->type);
1949
1950 this->result = src;
1951 }
1952
1953 void
1954 vec4_visitor::visit(ir_dereference_record *ir)
1955 {
1956 unsigned int i;
1957 const glsl_type *struct_type = ir->record->type;
1958 int offset = 0;
1959
1960 ir->record->accept(this);
1961
1962 for (i = 0; i < struct_type->length; i++) {
1963 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1964 break;
1965 offset += type_size(struct_type->fields.structure[i].type);
1966 }
1967
1968 /* If the type is smaller than a vec4, replicate the last channel out. */
1969 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1970 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1971 else
1972 this->result.swizzle = BRW_SWIZZLE_NOOP;
1973 this->result.type = brw_type_for_base_type(ir->type);
1974
1975 this->result.reg_offset += offset;
1976 }
1977
1978 /**
1979 * We want to be careful in assignment setup to hit the actual storage
1980 * instead of potentially using a temporary like we might with the
1981 * ir_dereference handler.
1982 */
1983 static dst_reg
1984 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1985 {
1986 /* The LHS must be a dereference. If the LHS is a variable indexed array
1987 * access of a vector, it must be separated into a series of conditional moves
1988 * before reaching this point (see ir_vec_index_to_cond_assign).
1989 */
1990 assert(ir->as_dereference());
1991 ir_dereference_array *deref_array = ir->as_dereference_array();
1992 if (deref_array) {
1993 assert(!deref_array->array->type->is_vector());
1994 }
1995
1996 /* Use the rvalue deref handler for the most part. We'll ignore
1997 * swizzles in it and write swizzles using writemask, though.
1998 */
1999 ir->accept(v);
2000 return dst_reg(v->result);
2001 }
2002
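/**
 * Copy an aggregate (struct, array, or matrix) value one vec4-sized piece at
 * a time, recursing through the type and advancing reg_offset as it goes.
 */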
2003 void
2004 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2005 const struct glsl_type *type,
2006 enum brw_predicate predicate)
2007 {
2008 if (type->base_type == GLSL_TYPE_STRUCT) {
2009 for (unsigned int i = 0; i < type->length; i++) {
2010 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2011 }
2012 return;
2013 }
2014
2015 if (type->is_array()) {
2016 for (unsigned int i = 0; i < type->length; i++) {
2017 emit_block_move(dst, src, type->fields.array, predicate);
2018 }
2019 return;
2020 }
2021
2022 if (type->is_matrix()) {
2023 const struct glsl_type *vec_type;
2024
2025 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2026 type->vector_elements, 1);
2027
2028 for (int i = 0; i < type->matrix_columns; i++) {
2029 emit_block_move(dst, src, vec_type, predicate);
2030 }
2031 return;
2032 }
2033
2034 assert(type->is_scalar() || type->is_vector());
2035
2036 dst->type = brw_type_for_base_type(type);
2037 src->type = dst->type;
2038
2039 dst->writemask = (1 << type->vector_elements) - 1;
2040
2041 src->swizzle = swizzle_for_size(type->vector_elements);
2042
2043 vec4_instruction *inst = emit(MOV(*dst, *src));
2044 inst->predicate = predicate;
2045
2046 dst->reg_offset++;
2047 src->reg_offset++;
2048 }
2049
2050
2051 /* If the RHS processing resulted in an instruction generating a
2052 * temporary value, and it would be easy to rewrite the instruction to
2053 * generate its result right into the LHS instead, do so. This ends
2054 * up reliably removing instructions where it can be tricky to do so
2055 * later without real UD chain information.
2056 */
2057 bool
2058 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2059 dst_reg dst,
2060 src_reg src,
2061 vec4_instruction *pre_rhs_inst,
2062 vec4_instruction *last_rhs_inst)
2063 {
2064 /* This could be supported, but it would take more smarts. */
2065 if (ir->condition)
2066 return false;
2067
2068 if (pre_rhs_inst == last_rhs_inst)
2069 return false; /* No instructions generated to work with. */
2070
2071 /* Make sure the last instruction generated our source reg. */
2072 if (src.file != GRF ||
2073 src.file != last_rhs_inst->dst.file ||
2074 src.reg != last_rhs_inst->dst.reg ||
2075 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2076 src.reladdr ||
2077 src.abs ||
2078 src.negate ||
2079 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2080 return false;
2081
2082 /* Check that the last instruction fully initialized the channels
2083 * we want to use, in the order we want to use them. We could
2084 * potentially reswizzle the operands of many instructions so that
2085 * we could handle out of order channels, but don't yet.
2086 */
2087
2088 for (unsigned i = 0; i < 4; i++) {
2089 if (dst.writemask & (1 << i)) {
2090 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2091 return false;
2092
2093 if (BRW_GET_SWZ(src.swizzle, i) != i)
2094 return false;
2095 }
2096 }
2097
2098 /* Success! Rewrite the instruction. */
2099 last_rhs_inst->dst.file = dst.file;
2100 last_rhs_inst->dst.reg = dst.reg;
2101 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2102 last_rhs_inst->dst.reladdr = dst.reladdr;
2103 last_rhs_inst->dst.writemask &= dst.writemask;
2104
2105 return true;
2106 }
2107
2108 void
2109 vec4_visitor::visit(ir_assignment *ir)
2110 {
2111 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2112 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2113
2114 if (!ir->lhs->type->is_scalar() &&
2115 !ir->lhs->type->is_vector()) {
2116 ir->rhs->accept(this);
2117 src_reg src = this->result;
2118
2119 if (ir->condition) {
2120 emit_bool_to_cond_code(ir->condition, &predicate);
2121 }
2122
2123 /* emit_block_move doesn't account for swizzles in the source register.
2124 * This should be ok, since the source register is a structure or an
2125 * array, and those can't be swizzled. But double-check to be sure.
2126 */
2127 assert(src.swizzle ==
2128 (ir->rhs->type->is_matrix()
2129 ? swizzle_for_size(ir->rhs->type->vector_elements)
2130 : BRW_SWIZZLE_NOOP));
2131
2132 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2133 return;
2134 }
2135
2136 /* Now we're down to just a scalar/vector with writemasks. */
2137 int i;
2138
2139 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2140 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2141
2142 ir->rhs->accept(this);
2143
2144 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2145
2146 src_reg src = this->result;
2147
2148 int swizzles[4];
2149 int first_enabled_chan = 0;
2150 int src_chan = 0;
2151
2152 assert(ir->lhs->type->is_vector() ||
2153 ir->lhs->type->is_scalar());
2154 dst.writemask = ir->write_mask;
2155
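/* Find the source swizzle of the first channel being written; channels
 * outside the writemask reuse it in the swizzle built below.
 */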
2156 for (int i = 0; i < 4; i++) {
2157 if (dst.writemask & (1 << i)) {
2158 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2159 break;
2160 }
2161 }
2162
2163 /* Swizzle a small RHS vector into the channels being written.
2164 *
2165 * glsl ir treats write_mask as dictating how many channels are
2166 * present on the RHS while in our instructions we need to make
2167 * those channels appear in the slots of the vec4 they're written to.
2168 */
2169 for (int i = 0; i < 4; i++) {
2170 if (dst.writemask & (1 << i))
2171 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2172 else
2173 swizzles[i] = first_enabled_chan;
2174 }
2175 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2176 swizzles[2], swizzles[3]);
2177
2178 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2179 return;
2180 }
2181
2182 if (ir->condition) {
2183 emit_bool_to_cond_code(ir->condition, &predicate);
2184 }
2185
2186 for (i = 0; i < type_size(ir->lhs->type); i++) {
2187 vec4_instruction *inst = emit(MOV(dst, src));
2188 inst->predicate = predicate;
2189
2190 dst.reg_offset++;
2191 src.reg_offset++;
2192 }
2193 }
2194
2195 void
2196 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2197 {
2198 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2199 foreach_in_list(ir_constant, field_value, &ir->components) {
2200 emit_constant_values(dst, field_value);
2201 }
2202 return;
2203 }
2204
2205 if (ir->type->is_array()) {
2206 for (unsigned int i = 0; i < ir->type->length; i++) {
2207 emit_constant_values(dst, ir->array_elements[i]);
2208 }
2209 return;
2210 }
2211
2212 if (ir->type->is_matrix()) {
2213 for (int i = 0; i < ir->type->matrix_columns; i++) {
2214 float *vec = &ir->value.f[i * ir->type->vector_elements];
2215
2216 for (int j = 0; j < ir->type->vector_elements; j++) {
2217 dst->writemask = 1 << j;
2218 dst->type = BRW_REGISTER_TYPE_F;
2219
2220 emit(MOV(*dst, src_reg(vec[j])));
2221 }
2222 dst->reg_offset++;
2223 }
2224 return;
2225 }
2226
2227 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2228
2229 for (int i = 0; i < ir->type->vector_elements; i++) {
2230 if (!(remaining_writemask & (1 << i)))
2231 continue;
2232
2233 dst->writemask = 1 << i;
2234 dst->type = brw_type_for_base_type(ir->type);
2235
2236 /* Find other components that match the one we're about to
2237 * write. Emits fewer instructions for things like vec4(0.5,
2238 * 1.5, 1.5, 1.5).
2239 */
2240 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2241 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2242 if (ir->value.b[i] == ir->value.b[j])
2243 dst->writemask |= (1 << j);
2244 } else {
2245 /* u, i, and f storage all line up, so no need for a
2246 * switch case for comparing each type.
2247 */
2248 if (ir->value.u[i] == ir->value.u[j])
2249 dst->writemask |= (1 << j);
2250 }
2251 }
2252
2253 switch (ir->type->base_type) {
2254 case GLSL_TYPE_FLOAT:
2255 emit(MOV(*dst, src_reg(ir->value.f[i])));
2256 break;
2257 case GLSL_TYPE_INT:
2258 emit(MOV(*dst, src_reg(ir->value.i[i])));
2259 break;
2260 case GLSL_TYPE_UINT:
2261 emit(MOV(*dst, src_reg(ir->value.u[i])));
2262 break;
2263 case GLSL_TYPE_BOOL:
2264 emit(MOV(*dst,
2265 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2266 : 0u)));
2267 break;
2268 default:
2269 unreachable("Non-float/uint/int/bool constant");
2270 }
2271
2272 remaining_writemask &= ~dst->writemask;
2273 }
2274 dst->reg_offset++;
2275 }
2276
2277 void
2278 vec4_visitor::visit(ir_constant *ir)
2279 {
2280 dst_reg dst = dst_reg(this, ir->type);
2281 this->result = src_reg(dst);
2282
2283 emit_constant_values(&dst, ir);
2284 }
2285
2286 void
2287 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2288 {
2289 ir_dereference *deref = static_cast<ir_dereference *>(
2290 ir->actual_parameters.get_head());
2291 ir_variable *location = deref->variable_referenced();
2292 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2293 location->data.binding);
2294
2295 /* Calculate the surface offset */
2296 src_reg offset(this, glsl_type::uint_type);
2297 ir_dereference_array *deref_array = deref->as_dereference_array();
2298 if (deref_array) {
2299 deref_array->array_index->accept(this);
2300
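/* Scale the array index by the size of one counter and add this counter's
 * offset within the buffer.
 */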
2301 src_reg tmp(this, glsl_type::uint_type);
2302 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2303 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2304 } else {
2305 offset = location->data.atomic.offset;
2306 }
2307
2308 /* Emit the appropriate machine instruction */
2309 const char *callee = ir->callee->function_name();
2310 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2311
2312 if (!strcmp("__intrinsic_atomic_read", callee)) {
2313 emit_untyped_surface_read(surf_index, dst, offset);
2314
2315 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2316 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2317 src_reg(), src_reg());
2318
2319 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2320 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2321 src_reg(), src_reg());
2322 }
2323 }
2324
2325 void
2326 vec4_visitor::visit(ir_call *ir)
2327 {
2328 const char *callee = ir->callee->function_name();
2329
2330 if (!strcmp("__intrinsic_atomic_read", callee) ||
2331 !strcmp("__intrinsic_atomic_increment", callee) ||
2332 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2333 visit_atomic_counter_intrinsic(ir);
2334 } else {
2335 unreachable("Unsupported intrinsic.");
2336 }
2337 }
2338
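/**
 * Fetch the MCS (multisample control surface) value for a texel so it can be
 * passed along with the subsequent TXF_CMS message.
 */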
2339 src_reg
2340 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2341 {
2342 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2343 inst->base_mrf = 2;
2344 inst->mlen = 1;
2345 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2346 inst->dst.writemask = WRITEMASK_XYZW;
2347
2348 inst->src[1] = sampler;
2349
2350 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2351 int param_base = inst->base_mrf;
2352 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2353 int zero_mask = 0xf & ~coord_mask;
2354
2355 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2356 coordinate));
2357
2358 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2359 src_reg(0)));
2360
2361 emit(inst);
2362 return src_reg(inst->dst);
2363 }
2364
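/* On Haswell and later, sampler indices that don't fit in the message
 * descriptor's 4-bit field (16 and up, or any non-immediate index) have to
 * be supplied through the message header.
 */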
2365 static bool
2366 is_high_sampler(struct brw_context *brw, src_reg sampler)
2367 {
2368 if (brw->gen < 8 && !brw->is_haswell)
2369 return false;
2370
2371 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2372 }
2373
2374 void
2375 vec4_visitor::visit(ir_texture *ir)
2376 {
2377 uint32_t sampler =
2378 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2379
2380 ir_rvalue *nonconst_sampler_index =
2381 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2382
2383 /* Handle non-constant sampler array indexing */
2384 src_reg sampler_reg;
2385 if (nonconst_sampler_index) {
2386 /* The highest sampler which may be used by this operation is
2387 * the last element of the array. Mark it here, because the generator
2388 * doesn't have enough information to determine the bound.
2389 */
2390 uint32_t array_size = ir->sampler->as_dereference_array()
2391 ->array->type->array_size();
2392
2393 uint32_t max_used = sampler + array_size - 1;
2394 if (ir->op == ir_tg4 && brw->gen < 8) {
2395 max_used += prog_data->base.binding_table.gather_texture_start;
2396 } else {
2397 max_used += prog_data->base.binding_table.texture_start;
2398 }
2399
2400 brw_mark_surface_used(&prog_data->base, max_used);
2401
2402 /* Emit code to evaluate the actual indexing expression */
2403 nonconst_sampler_index->accept(this);
2404 dst_reg temp(this, glsl_type::uint_type);
2405 emit(ADD(temp, this->result, src_reg(sampler)))
2406 ->force_writemask_all = true;
2407 sampler_reg = src_reg(temp);
2408 } else {
2409 /* Single sampler, or constant array index; the indexing expression
2410 * is just an immediate.
2411 */
2412 sampler_reg = src_reg(sampler);
2413 }
2414
2415 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2416 * emitting anything other than setting up the constant result.
2417 */
2418 if (ir->op == ir_tg4) {
2419 ir_constant *chan = ir->lod_info.component->as_constant();
2420 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2421 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2422 dst_reg result(this, ir->type);
2423 this->result = src_reg(result);
2424 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2425 return;
2426 }
2427 }
2428
2429 /* Should be lowered by do_lower_texture_projection */
2430 assert(!ir->projector);
2431
2432 /* Should be lowered */
2433 assert(!ir->offset || !ir->offset->type->is_array());
2434
2435 /* Generate code to compute all the subexpression trees. This has to be
2436 * done before loading any values into MRFs for the sampler message since
2437 * generating these values may involve SEND messages that need the MRFs.
2438 */
2439 src_reg coordinate;
2440 if (ir->coordinate) {
2441 ir->coordinate->accept(this);
2442 coordinate = this->result;
2443 }
2444
2445 src_reg shadow_comparitor;
2446 if (ir->shadow_comparitor) {
2447 ir->shadow_comparitor->accept(this);
2448 shadow_comparitor = this->result;
2449 }
2450
2451 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2452 src_reg offset_value;
2453 if (has_nonconstant_offset) {
2454 ir->offset->accept(this);
2455 offset_value = src_reg(this->result);
2456 }
2457
2458 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2459 src_reg lod, dPdx, dPdy, sample_index, mcs;
2460 switch (ir->op) {
2461 case ir_tex:
2462 lod = src_reg(0.0f);
2463 lod_type = glsl_type::float_type;
2464 break;
2465 case ir_txf:
2466 case ir_txl:
2467 case ir_txs:
2468 ir->lod_info.lod->accept(this);
2469 lod = this->result;
2470 lod_type = ir->lod_info.lod->type;
2471 break;
2472 case ir_query_levels:
2473 lod = src_reg(0);
2474 lod_type = glsl_type::int_type;
2475 break;
2476 case ir_txf_ms:
2477 ir->lod_info.sample_index->accept(this);
2478 sample_index = this->result;
2479 sample_index_type = ir->lod_info.sample_index->type;
2480
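/* Surfaces using the compressed multisample layout need the MCS value
 * fetched up front so the TXF_CMS message can resolve the requested sample;
 * otherwise a zero MCS is sent.
 */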
2481 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2482 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2483 else
2484 mcs = src_reg(0u);
2485 break;
2486 case ir_txd:
2487 ir->lod_info.grad.dPdx->accept(this);
2488 dPdx = this->result;
2489
2490 ir->lod_info.grad.dPdy->accept(this);
2491 dPdy = this->result;
2492
2493 lod_type = ir->lod_info.grad.dPdx->type;
2494 break;
2495 case ir_txb:
2496 case ir_lod:
2497 case ir_tg4:
2498 break;
2499 }
2500
2501 enum opcode opcode;
2502 switch (ir->op) {
2503 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2504 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2505 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2506 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2507 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2508 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2509 case ir_tg4: opcode = has_nonconstant_offset
2510 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2511 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2512 case ir_txb:
2513 unreachable("TXB is not valid for vertex shaders.");
2514 case ir_lod:
2515 unreachable("LOD is not valid for vertex shaders.");
2516 default:
2517 unreachable("Unrecognized tex op");
2518 }
2519
2520 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2521
2522 if (ir->offset != NULL && !has_nonconstant_offset) {
2523 inst->offset =
2524 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2525 ir->offset->type->vector_elements);
2526 }
2527
2528 /* Stuff the channel select bits in the top of the texture offset */
2529 if (ir->op == ir_tg4)
2530 inst->offset |= gather_channel(ir, sampler) << 16;
2531
2532 /* The message header is necessary for:
2533 * - Gen4 (always)
2534 * - Texel offsets
2535 * - Gather channel selection
2536 * - Sampler indices too large to fit in a 4-bit value.
2537 */
2538 inst->header_present =
2539 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2540 is_high_sampler(brw, sampler_reg);
2541 inst->base_mrf = 2;
2542 inst->mlen = inst->header_present + 1; /* always at least one */
2543 inst->dst = dst_reg(this, ir->type);
2544 inst->dst.writemask = WRITEMASK_XYZW;
2545 inst->shadow_compare = ir->shadow_comparitor != NULL;
2546
2547 inst->src[1] = sampler_reg;
2548
2549 /* MRF for the first parameter */
2550 int param_base = inst->base_mrf + inst->header_present;
2551
2552 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2553 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2554 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2555 } else {
2556 /* Load the coordinate */
2557 /* FINISHME: gl_clamp_mask and saturate */
2558 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2559 int zero_mask = 0xf & ~coord_mask;
2560
2561 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2562 coordinate));
2563
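/* Zero out the coordinate channels that the source type doesn't provide. */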
2564 if (zero_mask != 0) {
2565 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2566 src_reg(0)));
2567 }
2568 /* Load the shadow comparitor */
2569 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2570 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2571 WRITEMASK_X),
2572 shadow_comparitor));
2573 inst->mlen++;
2574 }
2575
2576 /* Load the LOD info */
2577 if (ir->op == ir_tex || ir->op == ir_txl) {
2578 int mrf, writemask;
2579 if (brw->gen >= 5) {
2580 mrf = param_base + 1;
2581 if (ir->shadow_comparitor) {
2582 writemask = WRITEMASK_Y;
2583 /* mlen already incremented */
2584 } else {
2585 writemask = WRITEMASK_X;
2586 inst->mlen++;
2587 }
2588 } else /* brw->gen == 4 */ {
2589 mrf = param_base;
2590 writemask = WRITEMASK_W;
2591 }
2592 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2593 } else if (ir->op == ir_txf) {
2594 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2595 } else if (ir->op == ir_txf_ms) {
2596 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2597 sample_index));
2598 if (brw->gen >= 7) {
2599 /* MCS data is in the first channel of `mcs`, but we need to get it into
2600 * the .y channel of the second vec4 of params, so replicate .x across
2601 * the whole vec4 and then mask off everything except .y
2602 */
2603 mcs.swizzle = BRW_SWIZZLE_XXXX;
2604 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2605 mcs));
2606 }
2607 inst->mlen++;
2608 } else if (ir->op == ir_txd) {
2609 const glsl_type *type = lod_type;
2610
2611 if (brw->gen >= 5) {
2612 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2613 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2614 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2616 inst->mlen++;
2617
2618 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2619 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2620 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2621 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2622 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2623 inst->mlen++;
2624
2625 if (ir->shadow_comparitor) {
2626 emit(MOV(dst_reg(MRF, param_base + 2,
2627 ir->shadow_comparitor->type, WRITEMASK_Z),
2628 shadow_comparitor));
2629 }
2630 }
2631 } else /* brw->gen == 4 */ {
2632 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2633 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2634 inst->mlen += 2;
2635 }
2636 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2637 if (ir->shadow_comparitor) {
2638 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2639 shadow_comparitor));
2640 }
2641
2642 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2643 offset_value));
2644 inst->mlen++;
2645 }
2646 }
2647
2648 emit(inst);
2649
2650 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2651 * spec requires layers.
2652 */
2653 if (ir->op == ir_txs) {
2654 glsl_type const *type = ir->sampler->type;
2655 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2656 type->sampler_array) {
2657 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2658 writemask(inst->dst, WRITEMASK_Z),
2659 src_reg(inst->dst), src_reg(6));
2660 }
2661 }
2662
2663 if (brw->gen == 6 && ir->op == ir_tg4) {
2664 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2665 }
2666
2667 swizzle_result(ir, src_reg(inst->dst), sampler);
2668 }
2669
2670 /**
2671 * Apply workarounds for Gen6 gather with UINT/SINT
2672 */
2673 void
2674 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2675 {
2676 if (!wa)
2677 return;
2678
2679 int width = (wa & WA_8BIT) ? 8 : 16;
2680 dst_reg dst_f = dst;
2681 dst_f.type = BRW_REGISTER_TYPE_F;
2682
2683 /* Convert from UNORM to UINT */
2684 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2685 emit(MOV(dst, src_reg(dst_f)));
2686
2687 if (wa & WA_SIGN) {
2688 /* Reinterpret the UINT value as a signed INT value by
2689 * shifting the sign bit into place, then shifting back
2690 * preserving sign.
2691 */
2692 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2693 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2694 }
2695 }
2696
2697 /**
2698 * Set up the gather channel based on the swizzle, for gather4.
2699 */
2700 uint32_t
2701 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2702 {
2703 ir_constant *chan = ir->lod_info.component->as_constant();
2704 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2705 switch (swiz) {
2706 case SWIZZLE_X: return 0;
2707 case SWIZZLE_Y:
2708 /* gather4 sampler is broken for green channel on RG32F --
2709 * we must ask for blue instead.
2710 */
2711 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2712 return 2;
2713 return 1;
2714 case SWIZZLE_Z: return 2;
2715 case SWIZZLE_W: return 3;
2716 default:
2717 unreachable("Not reached"); /* zero, one swizzles handled already */
2718 }
2719 }
2720
2721 void
2722 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2723 {
2724 int s = key->tex.swizzles[sampler];
2725
2726 this->result = src_reg(this, ir->type);
2727 dst_reg swizzled_result(this->result);
2728
2729 if (ir->op == ir_query_levels) {
2730 /* # levels is in .w */
2731 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2732 emit(MOV(swizzled_result, orig_val));
2733 return;
2734 }
2735
2736 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2737 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2738 emit(MOV(swizzled_result, orig_val));
2739 return;
2740 }
2741
2742
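/* Split the GL texture swizzle into channels copied from the sampler result,
 * channels forced to zero, and channels forced to one, so that each group
 * can be written with a single MOV.
 */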
2743 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2744 int swizzle[4] = {0};
2745
2746 for (int i = 0; i < 4; i++) {
2747 switch (GET_SWZ(s, i)) {
2748 case SWIZZLE_ZERO:
2749 zero_mask |= (1 << i);
2750 break;
2751 case SWIZZLE_ONE:
2752 one_mask |= (1 << i);
2753 break;
2754 default:
2755 copy_mask |= (1 << i);
2756 swizzle[i] = GET_SWZ(s, i);
2757 break;
2758 }
2759 }
2760
2761 if (copy_mask) {
2762 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2763 swizzled_result.writemask = copy_mask;
2764 emit(MOV(swizzled_result, orig_val));
2765 }
2766
2767 if (zero_mask) {
2768 swizzled_result.writemask = zero_mask;
2769 emit(MOV(swizzled_result, src_reg(0.0f)));
2770 }
2771
2772 if (one_mask) {
2773 swizzled_result.writemask = one_mask;
2774 emit(MOV(swizzled_result, src_reg(1.0f)));
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::visit(ir_return *)
2780 {
2781 unreachable("not reached");
2782 }
2783
2784 void
2785 vec4_visitor::visit(ir_discard *)
2786 {
2787 unreachable("not reached");
2788 }
2789
2790 void
2791 vec4_visitor::visit(ir_if *ir)
2792 {
2793 /* Don't point the annotation at the if statement, because then it plus
2794 * the then and else blocks get printed.
2795 */
2796 this->base_ir = ir->condition;
2797
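/* Gen6 gets a dedicated path (its IF instruction can carry the comparison
 * directly); other generations lower the condition to a flag register first
 * and predicate the IF on it.
 */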
2798 if (brw->gen == 6) {
2799 emit_if_gen6(ir);
2800 } else {
2801 enum brw_predicate predicate;
2802 emit_bool_to_cond_code(ir->condition, &predicate);
2803 emit(IF(predicate));
2804 }
2805
2806 visit_instructions(&ir->then_instructions);
2807
2808 if (!ir->else_instructions.is_empty()) {
2809 this->base_ir = ir->condition;
2810 emit(BRW_OPCODE_ELSE);
2811
2812 visit_instructions(&ir->else_instructions);
2813 }
2814
2815 this->base_ir = ir->condition;
2816 emit(BRW_OPCODE_ENDIF);
2817 }
2818
2819 void
2820 vec4_visitor::visit(ir_emit_vertex *)
2821 {
2822 unreachable("not reached");
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_end_primitive *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2833 dst_reg dst, src_reg offset,
2834 src_reg src0, src_reg src1)
2835 {
2836 unsigned mlen = 0;
2837
2838 /* Set the atomic operation offset. */
2839 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2840 mlen++;
2841
2842 /* Set the atomic operation arguments. */
2843 if (src0.file != BAD_FILE) {
2844 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2845 mlen++;
2846 }
2847
2848 if (src1.file != BAD_FILE) {
2849 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2850 mlen++;
2851 }
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped atomic message on Ivy Bridge, but that's OK because
2855 * unused channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2858 src_reg(atomic_op), src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = mlen;
2861 }
2862
2863 void
2864 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2865 src_reg offset)
2866 {
2867 /* Set the surface read offset. */
2868 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2869
2870 /* Emit the instruction. Note that this maps to the normal SIMD8
2871 * untyped surface read message, but that's OK because unused
2872 * channels will be masked out.
2873 */
2874 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2875 dst, src_reg(surf_index));
2876 inst->base_mrf = 0;
2877 inst->mlen = 1;
2878 }
2879
2880 void
2881 vec4_visitor::emit_ndc_computation()
2882 {
2883 /* Get the position */
2884 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2885
2886 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2887 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2888 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2889
2890 current_annotation = "NDC";
2891 dst_reg ndc_w = ndc;
2892 ndc_w.writemask = WRITEMASK_W;
2893 src_reg pos_w = pos;
2894 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2895 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2896
2897 dst_reg ndc_xyz = ndc;
2898 ndc_xyz.writemask = WRITEMASK_XYZ;
2899
2900 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2901 }
2902
2903 void
2904 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2905 {
2906 if (brw->gen < 6 &&
2907 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2908 key->userclip_active || brw->has_negative_rhw_bug)) {
2909 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2910 dst_reg header1_w = header1;
2911 header1_w.writemask = WRITEMASK_W;
2912
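/* Build the point size and clip/negative-RHW flag bits in the .w channel of
 * a temporary header register before copying them into the URB slot.
 */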
2913 emit(MOV(header1, 0u));
2914
2915 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2916 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2917
2918 current_annotation = "Point size";
2919 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2920 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2921 }
2922
2923 if (key->userclip_active) {
2924 current_annotation = "Clipping flags";
2925 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2926 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2927
2928 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2929 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2930 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2934 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2935 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2936 }
2937
2938 /* i965 clipping workaround:
2939 * 1) Test for negative RHW
2940 * 2) If set,
2941 * set ndc = (0,0,0,0)
2942 * set ucp[6] = 1
2943 *
2944 * Later, clipping will detect ucp[6] and ensure the primitive is
2945 * clipped against all fixed planes.
2946 */
2947 if (brw->has_negative_rhw_bug) {
2948 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2949 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2950 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2951 vec4_instruction *inst;
2952 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2953 inst->predicate = BRW_PREDICATE_NORMAL;
2954 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2955 inst->predicate = BRW_PREDICATE_NORMAL;
2956 }
2957
2958 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2959 } else if (brw->gen < 6) {
2960 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2961 } else {
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 dst_reg reg_w = reg;
2965 reg_w.writemask = WRITEMASK_W;
2966 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2967 }
2968 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2969 dst_reg reg_y = reg;
2970 reg_y.writemask = WRITEMASK_Y;
2971 reg_y.type = BRW_REGISTER_TYPE_D;
2972 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
2973 }
2974 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2975 dst_reg reg_z = reg;
2976 reg_z.writemask = WRITEMASK_Z;
2977 reg_z.type = BRW_REGISTER_TYPE_D;
2978 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2979 }
2980 }
2981 }
2982
2983 void
2984 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2985 {
2986 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2987 *
2988 * "If a linked set of shaders forming the vertex stage contains no
2989 * static write to gl_ClipVertex or gl_ClipDistance, but the
2990 * application has requested clipping against user clip planes through
2991 * the API, then the coordinate written to gl_Position is used for
2992 * comparison against the user clip planes."
2993 *
2994 * This function is only called if the shader didn't write to
2995 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2996 * if the user wrote to it; otherwise we use gl_Position.
2997 */
2998 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2999 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3000 clip_vertex = VARYING_SLOT_POS;
3001 }
3002
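/* Each clip distance is the dot product of the chosen clip vertex with the
 * corresponding user clip plane, written to one channel of the output per
 * plane.
 */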
3003 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3004 ++i) {
3005 reg.writemask = 1 << i;
3006 emit(DP4(reg,
3007 src_reg(output_reg[clip_vertex]),
3008 src_reg(this->userplane[i + offset])));
3009 }
3010 }
3011
3012 void
3013 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3014 {
3015 assert (varying < VARYING_SLOT_MAX);
3016 reg.type = output_reg[varying].type;
3017 current_annotation = output_reg_annotation[varying];
3018 /* Copy the register, saturating if necessary */
3019 vec4_instruction *inst = emit(MOV(reg,
3020 src_reg(output_reg[varying])));
3021 if ((varying == VARYING_SLOT_COL0 ||
3022 varying == VARYING_SLOT_COL1 ||
3023 varying == VARYING_SLOT_BFC0 ||
3024 varying == VARYING_SLOT_BFC1) &&
3025 key->clamp_vertex_color) {
3026 inst->saturate = true;
3027 }
3028 }
3029
3030 void
3031 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3032 {
3033 reg.type = BRW_REGISTER_TYPE_F;
3034
3035 switch (varying) {
3036 case VARYING_SLOT_PSIZ:
3037 {
3038 /* PSIZ is always in slot 0, and is coupled with other flags. */
3039 current_annotation = "indices, point width, clip flags";
3040 emit_psiz_and_flags(reg);
3041 break;
3042 }
3043 case BRW_VARYING_SLOT_NDC:
3044 current_annotation = "NDC";
3045 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3046 break;
3047 case VARYING_SLOT_POS:
3048 current_annotation = "gl_Position";
3049 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3050 break;
3051 case VARYING_SLOT_EDGE:
3052 /* This is present when doing unfilled polygons. We're supposed to copy
3053 * the edge flag from the user-provided vertex array
3054 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3055 * of that attribute (starts as 1.0f). This is then used in clipping to
3056 * determine which edges should be drawn as wireframe.
3057 */
3058 current_annotation = "edge flag";
3059 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3060 glsl_type::float_type, WRITEMASK_XYZW))));
3061 break;
3062 case BRW_VARYING_SLOT_PAD:
3063 /* No need to write to this slot */
3064 break;
3065 default:
3066 emit_generic_urb_slot(reg, varying);
3067 break;
3068 }
3069 }
3070
3071 static int
3072 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3073 {
3074 if (brw->gen >= 6) {
3075 /* URB data written (does not include the message header reg) must
3076 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3077 * section 5.4.3.2.2: URB_INTERLEAVED.
3078 *
3079 * URB entries are allocated on a multiple of 1024 bits, so an
3080 * extra 128 bits written here to make the end align to 256 is
3081 * no problem.
3082 */
3083 if ((mlen % 2) != 1)
3084 mlen++;
3085 }
3086
3087 return mlen;
3088 }
3089
3090
3091 /**
3092 * Generates the VUE payload plus the necessary URB write instructions to
3093 * output it.
3094 *
3095 * The VUE layout is documented in Volume 2a.
3096 */
3097 void
3098 vec4_visitor::emit_vertex()
3099 {
3100 /* MRF 0 is reserved for the debugger, so start with message header
3101 * in MRF 1.
3102 */
3103 int base_mrf = 1;
3104 int mrf = base_mrf;
3105 /* In the process of generating our URB write message contents, we
3106 * may need to unspill a register or load from an array. Those
3107 * reads would use MRFs 14-15.
3108 */
3109 int max_usable_mrf = 13;
3110
3111 /* The following assertion verifies that max_usable_mrf causes an
3112 * even-numbered amount of URB write data, which will meet gen6's
3113 * requirements for length alignment.
3114 */
3115 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3116
3117 /* First mrf is the g0-based message header containing URB handles and
3118 * such.
3119 */
3120 emit_urb_write_header(mrf++);
3121
3122 if (brw->gen < 6) {
3123 emit_ndc_computation();
3124 }
3125
3126 /* Lower legacy ff and ClipVertex clipping to clip distances */
3127 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3128 current_annotation = "user clip distances";
3129
3130 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3131 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3132
3133 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3134 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3135 }
3136
3137 /* We may need to split this up into several URB writes, so do them in a
3138 * loop.
3139 */
3140 int slot = 0;
3141 bool complete = false;
3142 do {
3143 /* URB offset is in URB row increments, and each of our MRFs is half of
3144 * one of those, since we're doing interleaved writes.
3145 */
3146 int offset = slot / 2;
3147
3148 mrf = base_mrf + 1;
3149 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3150 emit_urb_slot(dst_reg(MRF, mrf++),
3151 prog_data->vue_map.slot_to_varying[slot]);
3152
3153 /* If this was max_usable_mrf, we can't fit anything more into this
3154 * URB WRITE.
3155 */
3156 if (mrf > max_usable_mrf) {
3157 slot++;
3158 break;
3159 }
3160 }
3161
3162 complete = slot >= prog_data->vue_map.num_slots;
3163 current_annotation = "URB write";
3164 vec4_instruction *inst = emit_urb_write_opcode(complete);
3165 inst->base_mrf = base_mrf;
3166 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3167 inst->offset += offset;
3168 } while (!complete);
3169 }
3170
3171
3172 src_reg
3173 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3174 src_reg *reladdr, int reg_offset)
3175 {
3176 /* Because we store the values to scratch interleaved like our
3177 * vertex data, we need to scale the vec4 index by 2.
3178 */
3179 int message_header_scale = 2;
3180
3181 /* Pre-gen6, the message header uses byte offsets instead of vec4
3182 * (16-byte) offset units.
3183 */
3184 if (brw->gen < 6)
3185 message_header_scale *= 16;
3186
3187 if (reladdr) {
3188 src_reg index = src_reg(this, glsl_type::int_type);
3189
3190 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3191 src_reg(reg_offset)));
3192 emit_before(block, inst, MUL(dst_reg(index), index,
3193 src_reg(message_header_scale)));
3194
3195 return index;
3196 } else {
3197 return src_reg(reg_offset * message_header_scale);
3198 }
3199 }
3200
3201 src_reg
3202 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3203 src_reg *reladdr, int reg_offset)
3204 {
3205 if (reladdr) {
3206 src_reg index = src_reg(this, glsl_type::int_type);
3207
3208 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3209 src_reg(reg_offset)));
3210
3211 /* Pre-gen6, the message header uses byte offsets instead of vec4
3212 * (16-byte) offset units.
3213 */
3214 if (brw->gen < 6) {
3215 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3216 }
3217
3218 return index;
3219 } else if (brw->gen >= 8) {
3220 /* Store the offset in a GRF so we can send-from-GRF. */
3221 src_reg offset = src_reg(this, glsl_type::int_type);
3222 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3223 return offset;
3224 } else {
3225 int message_header_scale = brw->gen < 6 ? 16 : 1;
3226 return src_reg(reg_offset * message_header_scale);
3227 }
3228 }
3229
3230 /**
3231 * Emits an instruction before @inst to load the value named by @orig_src
3232 * from scratch space at @base_offset to @temp.
3233 *
3234 * @base_offset is measured in 32-byte units (the size of a register).
3235 */
3236 void
3237 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3238 dst_reg temp, src_reg orig_src,
3239 int base_offset)
3240 {
3241 int reg_offset = base_offset + orig_src.reg_offset;
3242 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3243 reg_offset);
3244
3245 emit_before(block, inst, SCRATCH_READ(temp, index));
3246 }
3247
3248 /**
3249 * Emits an instruction after @inst to store the value to be written
3250 * to @orig_dst to scratch space at @base_offset, from @temp.
3251 *
3252 * @base_offset is measured in 32-byte units (the size of a register).
3253 */
3254 void
3255 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3256 int base_offset)
3257 {
3258 int reg_offset = base_offset + inst->dst.reg_offset;
3259 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3260 reg_offset);
3261
3262 /* Create a temporary register to store *inst's result in.
3263 *
3264 * We have to be careful in MOVing from our temporary result register in
3265 * the scratch write. If we swizzle from channels of the temporary that
3266 * weren't initialized, it will confuse live interval analysis, which will
3267 * make spilling fail to make progress.
3268 */
3269 src_reg temp = src_reg(this, glsl_type::vec4_type);
3270 temp.type = inst->dst.type;
3271 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3272 int swizzles[4];
3273 for (int i = 0; i < 4; i++)
3274 if (inst->dst.writemask & (1 << i))
3275 swizzles[i] = i;
3276 else
3277 swizzles[i] = first_writemask_chan;
3278 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3279 swizzles[2], swizzles[3]);
3280
3281 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3282 inst->dst.writemask));
3283 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3284 write->predicate = inst->predicate;
3285 write->ir = inst->ir;
3286 write->annotation = inst->annotation;
3287 inst->insert_after(block, write);
3288
3289 inst->dst.file = temp.file;
3290 inst->dst.reg = temp.reg;
3291 inst->dst.reg_offset = temp.reg_offset;
3292 inst->dst.reladdr = NULL;
3293 }
3294
3295 /**
3296 * We can't generally support array access in GRF space, because a
3297 * single instruction's destination can only span 2 contiguous
3298 * registers. So, we send all GRF arrays that get variable index
3299 * access to scratch space.
3300 */
3301 void
3302 vec4_visitor::move_grf_array_access_to_scratch()
3303 {
3304 int scratch_loc[this->virtual_grf_count];
3305 memset(scratch_loc, -1, sizeof(scratch_loc));
3306
3307 /* First, calculate the set of virtual GRFs that need to be punted
3308 * to scratch due to having any array access on them, and where in
3309 * scratch.
3310 */
3311 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3312 if (inst->dst.file == GRF && inst->dst.reladdr &&
3313 scratch_loc[inst->dst.reg] == -1) {
3314 scratch_loc[inst->dst.reg] = c->last_scratch;
3315 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3316 }
3317
3318 for (int i = 0 ; i < 3; i++) {
3319 src_reg *src = &inst->src[i];
3320
3321 if (src->file == GRF && src->reladdr &&
3322 scratch_loc[src->reg] == -1) {
3323 scratch_loc[src->reg] = c->last_scratch;
3324 c->last_scratch += this->virtual_grf_sizes[src->reg];
3325 }
3326 }
3327 }
3328
3329 /* Now, for anything that will be accessed through scratch, rewrite
3330 * it to load/store. Note that this is a _safe list walk, because
3331 * we may generate a new scratch_write instruction after the one
3332 * we're processing.
3333 */
3334 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3335 /* Set up the annotation tracking for new generated instructions. */
3336 base_ir = inst->ir;
3337 current_annotation = inst->annotation;
3338
3339 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3340 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3341 }
3342
3343 for (int i = 0 ; i < 3; i++) {
3344 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3345 continue;
3346
3347 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3348
3349 emit_scratch_read(block, inst, temp, inst->src[i],
3350 scratch_loc[inst->src[i].reg]);
3351
3352 inst->src[i].file = temp.file;
3353 inst->src[i].reg = temp.reg;
3354 inst->src[i].reg_offset = temp.reg_offset;
3355 inst->src[i].reladdr = NULL;
3356 }
3357 }
3358 }
3359
3360 /**
3361 * Emits an instruction before @inst to load the value named by @orig_src
3362 * from the pull constant buffer (surface) at @base_offset to @temp.
3363 */
3364 void
3365 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3366 dst_reg temp, src_reg orig_src,
3367 int base_offset)
3368 {
3369 int reg_offset = base_offset + orig_src.reg_offset;
3370 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3371 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3372 reg_offset);
3373 vec4_instruction *load;
3374
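/* Gen7+ loads pull constants with a send-from-GRF message, so the offset is
 * staged in a GRF first; older generations send it through an MRF payload
 * instead.
 */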
3375 if (brw->gen >= 7) {
3376 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3377 grf_offset.type = offset.type;
3378 emit_before(block, inst, MOV(grf_offset, offset));
3379
3380 load = new(mem_ctx) vec4_instruction(this,
3381 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3382 temp, index, src_reg(grf_offset));
3383 } else {
3384 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3385 temp, index, offset);
3386 load->base_mrf = 14;
3387 load->mlen = 1;
3388 }
3389 emit_before(block, inst, load);
3390 }
3391
3392 /**
3393 * Implements array access of uniforms by inserting a
3394 * PULL_CONSTANT_LOAD instruction.
3395 *
3396 * Unlike temporary GRF array access (where we don't support it due to
3397 * the difficulty of doing relative addressing on instruction
3398 * destinations), we could potentially do array access of uniforms
3399 * that were loaded in GRF space as push constants. In real-world
3400 * usage we've seen, though, the arrays being used are always larger
3401 * than we could load as push constants, so just always move all
3402 * uniform array access out to a pull constant buffer.
3403 */
3404 void
3405 vec4_visitor::move_uniform_array_access_to_pull_constants()
3406 {
3407 int pull_constant_loc[this->uniforms];
3408 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3409 bool nested_reladdr;
3410
3411 /* Walk through and find array access of uniforms. Put a copy of that
3412 * uniform in the pull constant buffer.
3413 *
3414 * Note that we don't move constant-indexed accesses to arrays. No
3415 * testing has been done of the performance impact of this choice.
3416 */
3417 do {
3418 nested_reladdr = false;
3419
3420 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3421 for (int i = 0 ; i < 3; i++) {
3422 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3423 continue;
3424
3425 int uniform = inst->src[i].reg;
3426
3427 if (inst->src[i].reladdr->reladdr)
3428 nested_reladdr = true; /* will need another pass */
3429
3430 /* If this array isn't already present in the pull constant buffer,
3431 * add it.
3432 */
3433 if (pull_constant_loc[uniform] == -1) {
3434 const gl_constant_value **values =
3435 &stage_prog_data->param[uniform * 4];
3436
3437 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3438
3439 assert(uniform < uniform_array_size);
3440 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3441 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3442 = values[j];
3443 }
3444 }
3445
3446 /* Set up the annotation tracking for new generated instructions. */
3447 base_ir = inst->ir;
3448 current_annotation = inst->annotation;
3449
3450 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3451
3452 emit_pull_constant_load(block, inst, temp, inst->src[i],
3453 pull_constant_loc[uniform]);
3454
3455 inst->src[i].file = temp.file;
3456 inst->src[i].reg = temp.reg;
3457 inst->src[i].reg_offset = temp.reg_offset;
3458 inst->src[i].reladdr = NULL;
3459 }
3460 }
3461 } while (nested_reladdr);
3462
3463 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3464 * no need to track them as larger-than-vec4 objects. This will be
3465 * relied on in cutting out unused uniform vectors from push
3466 * constants.
3467 */
3468 split_uniform_registers();
3469 }
3470
3471 void
3472 vec4_visitor::resolve_ud_negate(src_reg *reg)
3473 {
3474 if (reg->type != BRW_REGISTER_TYPE_UD ||
3475 !reg->negate)
3476 return;
3477
3478 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3479 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3480 *reg = temp;
3481 }
3482
3483 vec4_visitor::vec4_visitor(struct brw_context *brw,
3484 struct brw_vec4_compile *c,
3485 struct gl_program *prog,
3486 const struct brw_vec4_prog_key *key,
3487 struct brw_vec4_prog_data *prog_data,
3488 struct gl_shader_program *shader_prog,
3489 gl_shader_stage stage,
3490 void *mem_ctx,
3491 bool debug_flag,
3492 bool no_spills,
3493 shader_time_shader_type st_base,
3494 shader_time_shader_type st_written,
3495 shader_time_shader_type st_reset)
3496 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3497 c(c),
3498 key(key),
3499 prog_data(prog_data),
3500 sanity_param_count(0),
3501 fail_msg(NULL),
3502 first_non_payload_grf(0),
3503 need_all_constants_in_pull_buffer(false),
3504 debug_flag(debug_flag),
3505 no_spills(no_spills),
3506 st_base(st_base),
3507 st_written(st_written),
3508 st_reset(st_reset)
3509 {
3510 this->mem_ctx = mem_ctx;
3511 this->failed = false;
3512
3513 this->base_ir = NULL;
3514 this->current_annotation = NULL;
3515 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3516
3517 this->variable_ht = hash_table_ctor(0,
3518 hash_table_pointer_hash,
3519 hash_table_pointer_compare);
3520
3521 this->virtual_grf_start = NULL;
3522 this->virtual_grf_end = NULL;
3523 this->virtual_grf_sizes = NULL;
3524 this->virtual_grf_count = 0;
3525 this->virtual_grf_reg_map = NULL;
3526 this->virtual_grf_reg_count = 0;
3527 this->virtual_grf_array_size = 0;
3528 this->live_intervals_valid = false;
3529
3530 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3531
3532 this->uniforms = 0;
3533
3534 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3535 * at least one. See setup_uniforms() in brw_vec4.cpp.
3536 */
3537 this->uniform_array_size = 1;
3538 if (prog_data) {
3539 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3540 }
3541
3542 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3543 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3544 }
3545
3546 vec4_visitor::~vec4_visitor()
3547 {
3548 hash_table_dtor(this->variable_ht);
3549 }
3550
3551
3552 void
3553 vec4_visitor::fail(const char *format, ...)
3554 {
3555 va_list va;
3556 char *msg;
3557
3558 if (failed)
3559 return;
3560
3561 failed = true;
3562
3563 va_start(va, format);
3564 msg = ralloc_vasprintf(mem_ctx, format, va);
3565 va_end(va);
3566 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3567
3568 this->fail_msg = msg;
3569
3570 if (debug_flag) {
3571 fprintf(stderr, "%s", msg);
3572 }
3573 }
3574
3575 } /* namespace brw */