i965: Provide means to create registers of a given size.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
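/* Editor's sketch (not part of the original source): what fix_3src_operand()
 * does to a vec4 uniform operand of a three-source instruction, assuming u0
 * names the uniform and tmp the freshly allocated GRF:
 *
 *    src_reg op = fix_3src_operand(uniform_src);   // emits: MOV tmp, u0
 *    emit(MAD(dst, op2, op1, op));                 // MAD now reads tmp<4;4,1>
 *
 * A uniform that already uses a single-value swizzle (e.g. u0.xxxx) is
 * returned unchanged, since scalar replication is representable in the
 * three-source encoding.
 */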
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
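/* Worked example (editor's addition, not from the original commit): for
 * src0 = vec2(1.0, -2.0) the sequence above produces
 *
 *    F32TO16  tmp.xy, src0        ->  tmp.x = 0x00003C00, tmp.y = 0x0000C000
 *    SHL      dst, tmp.yyyy, 16   ->  dst   = 0xC0000000
 *    OR       dst, dst, tmp.xxxx  ->  dst   = 0xC0003C00
 *
 * which matches packHalf2x16(): the first component lands in the low 16 bits.
 */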
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549

550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up no register space, since they're baked in at
592 * link time.
593 */
594 return 0;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
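/* A few example values (editor's note), following the rules above: every
 * scalar or vector costs one vec4 slot, matrices cost one slot per column,
 * and aggregates are the sum of their members:
 *
 *    float     -> 1       mat4                        -> 4
 *    vec2      -> 1       vec2[10]                    -> 10
 *    mat3      -> 3       struct { vec3 a; float b; } -> 2
 *    sampler2D -> 0 (baked in at link time)
 */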
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
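/* Editor's sketch of the bookkeeping above: after
 *
 *    virtual_grf_alloc(1);   // returns 0
 *    virtual_grf_alloc(4);   // returns 1
 *    virtual_grf_alloc(2);   // returns 2
 *
 * we have virtual_grf_sizes   = { 1, 4, 2 },
 *         virtual_grf_reg_map = { 0, 1, 5 } and virtual_grf_reg_count = 7,
 * so reg_map[n] gives the first flat register index of virtual GRF n.
 */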
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
643 {
644 assert(size > 0);
645
646 init();
647
648 this->file = GRF;
649 this->reg = v->virtual_grf_alloc(type_size(type) * size);
650
651 this->swizzle = BRW_SWIZZLE_NOOP;
652
653 this->type = brw_type_for_base_type(type);
654 }
655
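/* Usage sketch for the sized constructor above (editor's illustration; the
 * names are hypothetical): a temporary covering a whole array can now be
 * allocated in one go instead of element by element, e.g.
 *
 *    src_reg array_tmp(this, glsl_type::vec4_type, 8);
 *    // array_tmp.reg names a virtual GRF spanning type_size(vec4) * 8 = 8
 *    // registers; the swizzle stays BRW_SWIZZLE_NOOP since the register is
 *    // addressed with reg_offset rather than read as a single vec4.
 */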
656 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
657 {
658 init();
659
660 this->file = GRF;
661 this->reg = v->virtual_grf_alloc(type_size(type));
662
663 if (type->is_array() || type->is_record()) {
664 this->writemask = WRITEMASK_XYZW;
665 } else {
666 this->writemask = (1 << type->vector_elements) - 1;
667 }
668
669 this->type = brw_type_for_base_type(type);
670 }
671
672 /* Our support for uniforms is piggy-backed on the struct
673 * gl_fragment_program, because that's where the values actually
674 * get stored, rather than in some global gl_shader_program uniform
675 * store.
676 */
677 void
678 vec4_visitor::setup_uniform_values(ir_variable *ir)
679 {
680 int namelen = strlen(ir->name);
681
682 /* The data for our (non-builtin) uniforms is stored in a series of
683 * gl_uniform_driver_storage structs for each subcomponent that
684 * glGetUniformLocation() could name. We know it's been set up in the same
685 * order we'd walk the type, so walk the list of storage and find anything
686 * with our name, or the prefix of a component that starts with our name.
687 */
688 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
689 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
690
691 if (strncmp(ir->name, storage->name, namelen) != 0 ||
692 (storage->name[namelen] != 0 &&
693 storage->name[namelen] != '.' &&
694 storage->name[namelen] != '[')) {
695 continue;
696 }
697
698 gl_constant_value *components = storage->storage;
699 unsigned vector_count = (MAX2(storage->array_elements, 1) *
700 storage->type->matrix_columns);
701
702 for (unsigned s = 0; s < vector_count; s++) {
703 assert(uniforms < uniform_array_size);
704 uniform_vector_size[uniforms] = storage->type->vector_elements;
705
706 int i;
707 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
708 stage_prog_data->param[uniforms * 4 + i] = components;
709 components++;
710 }
711 for (; i < 4; i++) {
712 static gl_constant_value zero = { 0.0 };
713 stage_prog_data->param[uniforms * 4 + i] = &zero;
714 }
715
716 uniforms++;
717 }
718 }
719 }
720
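/* Example (editor's note): a "uniform mat2 m;" has one gl_uniform_storage
 * entry with matrix_columns == 2, so vector_count is 2 and two vec4 uniform
 * slots are set up.  For each column, param[uniforms * 4 + 0..1] point at the
 * two stored components while param[uniforms * 4 + 2..3] point at the shared
 * zero constant, and uniform_vector_size records 2.
 */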
721 void
722 vec4_visitor::setup_uniform_clipplane_values()
723 {
724 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
725
726 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
727 assert(this->uniforms < uniform_array_size);
728 this->uniform_vector_size[this->uniforms] = 4;
729 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
730 this->userplane[i].type = BRW_REGISTER_TYPE_F;
731 for (int j = 0; j < 4; ++j) {
732 stage_prog_data->param[this->uniforms * 4 + j] =
733 (gl_constant_value *) &clip_planes[i][j];
734 }
735 ++this->uniforms;
736 }
737 }
738
739 /* Our support for builtin uniforms is even scarier than non-builtin.
740 * It sits on top of the PROG_STATE_VAR parameters that are
741 * automatically updated from GL context state.
742 */
743 void
744 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
745 {
746 const ir_state_slot *const slots = ir->state_slots;
747 assert(ir->state_slots != NULL);
748
749 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
750 /* This state reference has already been setup by ir_to_mesa,
751 * but we'll get the same index back here. We can reference
752 * ParameterValues directly, since unlike brw_fs.cpp, we never
753 * add new state references during compile.
754 */
755 int index = _mesa_add_state_reference(this->prog->Parameters,
756 (gl_state_index *)slots[i].tokens);
757 gl_constant_value *values =
758 &this->prog->Parameters->ParameterValues[index][0];
759
760 assert(this->uniforms < uniform_array_size);
761 this->uniform_vector_size[this->uniforms] = 0;
762 /* Add each of the unique swizzled channels of the element.
763 * This will end up matching the size of the glsl_type of this field.
764 */
765 int last_swiz = -1;
766 for (unsigned int j = 0; j < 4; j++) {
767 int swiz = GET_SWZ(slots[i].swizzle, j);
768 last_swiz = swiz;
769
770 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
771 assert(this->uniforms < uniform_array_size);
772 if (swiz <= last_swiz)
773 this->uniform_vector_size[this->uniforms]++;
774 }
775 this->uniforms++;
776 }
777 }
778
779 dst_reg *
780 vec4_visitor::variable_storage(ir_variable *var)
781 {
782 return (dst_reg *)hash_table_find(this->variable_ht, var);
783 }
784
785 void
786 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
787 enum brw_predicate *predicate)
788 {
789 ir_expression *expr = ir->as_expression();
790
791 *predicate = BRW_PREDICATE_NORMAL;
792
793 if (expr && expr->operation != ir_binop_ubo_load) {
794 src_reg op[3];
795 vec4_instruction *inst;
796
797 assert(expr->get_num_operands() <= 3);
798 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
799 expr->operands[i]->accept(this);
800 op[i] = this->result;
801
802 resolve_ud_negate(&op[i]);
803 }
804
805 switch (expr->operation) {
806 case ir_unop_logic_not:
807 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
808 inst->conditional_mod = BRW_CONDITIONAL_Z;
809 break;
810
811 case ir_binop_logic_xor:
812 inst = emit(XOR(dst_null_d(), op[0], op[1]));
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 inst = emit(OR(dst_null_d(), op[0], op[1]));
818 inst->conditional_mod = BRW_CONDITIONAL_NZ;
819 break;
820
821 case ir_binop_logic_and:
822 inst = emit(AND(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_unop_f2b:
827 if (brw->gen >= 6) {
828 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
829 } else {
830 inst = emit(MOV(dst_null_f(), op[0]));
831 inst->conditional_mod = BRW_CONDITIONAL_NZ;
832 }
833 break;
834
835 case ir_unop_i2b:
836 if (brw->gen >= 6) {
837 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
838 } else {
839 inst = emit(MOV(dst_null_d(), op[0]));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 }
842 break;
843
844 case ir_binop_all_equal:
845 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
846 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
847 break;
848
849 case ir_binop_any_nequal:
850 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
851 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
852 break;
853
854 case ir_unop_any:
855 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
857 break;
858
859 case ir_binop_greater:
860 case ir_binop_gequal:
861 case ir_binop_less:
862 case ir_binop_lequal:
863 case ir_binop_equal:
864 case ir_binop_nequal:
865 emit(CMP(dst_null_d(), op[0], op[1],
866 brw_conditional_for_comparison(expr->operation)));
867 break;
868
869 case ir_triop_csel: {
870 /* Expand the boolean condition into the flag register. */
871 inst = emit(MOV(dst_null_d(), op[0]));
872 inst->conditional_mod = BRW_CONDITIONAL_NZ;
873
874 /* Select which boolean to return. */
875 dst_reg temp(this, expr->operands[1]->type);
876 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
877 inst->predicate = BRW_PREDICATE_NORMAL;
878
879 /* Expand the result to a condition code. */
880 inst = emit(MOV(dst_null_d(), src_reg(temp)));
881 inst->conditional_mod = BRW_CONDITIONAL_NZ;
882 break;
883 }
884
885 default:
886 unreachable("not reached");
887 }
888 return;
889 }
890
891 ir->accept(this);
892
893 resolve_ud_negate(&this->result);
894
895 if (brw->gen >= 6) {
896 vec4_instruction *inst = emit(AND(dst_null_d(),
897 this->result, src_reg(1)));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899 } else {
900 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902 }
903 }
904
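/* Editor's sketch of two typical lowerings performed above (the assembly
 * mnemonics and register names are illustrative only):
 *
 *    a < b        ->  CMP.l.f0   null, a, b     *predicate = NORMAL
 *    any(v != w)  ->  CMP.nz.f0  null, v, w     *predicate = ALIGN16_ANY4H
 *
 * The comparison only updates the flag register; the caller then predicates
 * the following IF or SEL on it.
 */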
905 /**
906 * Emit a gen6 IF statement with the comparison folded into the IF
907 * instruction.
908 */
909 void
910 vec4_visitor::emit_if_gen6(ir_if *ir)
911 {
912 ir_expression *expr = ir->condition->as_expression();
913
914 if (expr && expr->operation != ir_binop_ubo_load) {
915 src_reg op[3];
916 dst_reg temp;
917
918 assert(expr->get_num_operands() <= 3);
919 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
920 expr->operands[i]->accept(this);
921 op[i] = this->result;
922 }
923
924 switch (expr->operation) {
925 case ir_unop_logic_not:
926 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
927 return;
928
929 case ir_binop_logic_xor:
930 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
931 return;
932
933 case ir_binop_logic_or:
934 temp = dst_reg(this, glsl_type::bool_type);
935 emit(OR(temp, op[0], op[1]));
936 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_logic_and:
940 temp = dst_reg(this, glsl_type::bool_type);
941 emit(AND(temp, op[0], op[1]));
942 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
943 return;
944
945 case ir_unop_f2b:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_unop_i2b:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_greater:
954 case ir_binop_gequal:
955 case ir_binop_less:
956 case ir_binop_lequal:
957 case ir_binop_equal:
958 case ir_binop_nequal:
959 emit(IF(op[0], op[1],
960 brw_conditional_for_comparison(expr->operation)));
961 return;
962
963 case ir_binop_all_equal:
964 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
965 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
966 return;
967
968 case ir_binop_any_nequal:
969 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
970 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
971 return;
972
973 case ir_unop_any:
974 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
976 return;
977
978 case ir_triop_csel: {
979 /* Expand the boolean condition into the flag register. */
980 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
981 inst->conditional_mod = BRW_CONDITIONAL_NZ;
982
983 /* Select which boolean to return. */
984 dst_reg temp(this, expr->operands[1]->type);
985 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
986 inst->predicate = BRW_PREDICATE_NORMAL;
987
988 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
989 return;
990 }
991
992 default:
993 unreachable("not reached");
994 }
995 return;
996 }
997
998 ir->condition->accept(this);
999
1000 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1001 }
1002
1003 void
1004 vec4_visitor::visit(ir_variable *ir)
1005 {
1006 dst_reg *reg = NULL;
1007
1008 if (variable_storage(ir))
1009 return;
1010
1011 switch (ir->data.mode) {
1012 case ir_var_shader_in:
1013 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1014 break;
1015
1016 case ir_var_shader_out:
1017 reg = new(mem_ctx) dst_reg(this, ir->type);
1018
1019 for (int i = 0; i < type_size(ir->type); i++) {
1020 output_reg[ir->data.location + i] = *reg;
1021 output_reg[ir->data.location + i].reg_offset = i;
1022 output_reg[ir->data.location + i].type =
1023 brw_type_for_base_type(ir->type->get_scalar_type());
1024 output_reg_annotation[ir->data.location + i] = ir->name;
1025 }
1026 break;
1027
1028 case ir_var_auto:
1029 case ir_var_temporary:
1030 reg = new(mem_ctx) dst_reg(this, ir->type);
1031 break;
1032
1033 case ir_var_uniform:
1034 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1035
1036 /* Thanks to the lower_ubo_reference pass, we will see only
1037 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1038 * variables, so no need for them to be in variable_ht.
1039 *
1040 * Some uniforms, such as samplers and atomic counters, have no actual
1041 * storage, so we should ignore them.
1042 */
1043 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1044 return;
1045
1046 /* Track how big the whole uniform variable is, in case we need to put a
1047 * copy of its data into pull constants for array access.
1048 */
1049 assert(this->uniforms < uniform_array_size);
1050 this->uniform_size[this->uniforms] = type_size(ir->type);
1051
1052 if (!strncmp(ir->name, "gl_", 3)) {
1053 setup_builtin_uniform_values(ir);
1054 } else {
1055 setup_uniform_values(ir);
1056 }
1057 break;
1058
1059 case ir_var_system_value:
1060 reg = make_reg_for_system_value(ir);
1061 break;
1062
1063 default:
1064 unreachable("not reached");
1065 }
1066
1067 reg->type = brw_type_for_base_type(ir->type);
1068 hash_table_insert(this->variable_ht, reg, ir);
1069 }
1070
1071 void
1072 vec4_visitor::visit(ir_loop *ir)
1073 {
1074 /* We don't want debugging output to print the whole body of the
1075 * loop as the annotation.
1076 */
1077 this->base_ir = NULL;
1078
1079 emit(BRW_OPCODE_DO);
1080
1081 visit_instructions(&ir->body_instructions);
1082
1083 emit(BRW_OPCODE_WHILE);
1084 }
1085
1086 void
1087 vec4_visitor::visit(ir_loop_jump *ir)
1088 {
1089 switch (ir->mode) {
1090 case ir_loop_jump::jump_break:
1091 emit(BRW_OPCODE_BREAK);
1092 break;
1093 case ir_loop_jump::jump_continue:
1094 emit(BRW_OPCODE_CONTINUE);
1095 break;
1096 }
1097 }
1098
1099
1100 void
1101 vec4_visitor::visit(ir_function_signature *)
1102 {
1103 unreachable("not reached");
1104 }
1105
1106 void
1107 vec4_visitor::visit(ir_function *ir)
1108 {
1109 /* Ignore function bodies other than main() -- we shouldn't see calls to
1110 * them since they should all be inlined.
1111 */
1112 if (strcmp(ir->name, "main") == 0) {
1113 const ir_function_signature *sig;
1114 exec_list empty;
1115
1116 sig = ir->matching_signature(NULL, &empty, false);
1117
1118 assert(sig);
1119
1120 visit_instructions(&sig->body);
1121 }
1122 }
1123
1124 bool
1125 vec4_visitor::try_emit_mad(ir_expression *ir)
1126 {
1127 /* 3-src instructions were introduced in gen6. */
1128 if (brw->gen < 6)
1129 return false;
1130
1131 /* MAD can only handle floating-point data. */
1132 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1133 return false;
1134
1135 ir_rvalue *nonmul = ir->operands[1];
1136 ir_expression *mul = ir->operands[0]->as_expression();
1137
1138 if (!mul || mul->operation != ir_binop_mul) {
1139 nonmul = ir->operands[0];
1140 mul = ir->operands[1]->as_expression();
1141
1142 if (!mul || mul->operation != ir_binop_mul)
1143 return false;
1144 }
1145
1146 nonmul->accept(this);
1147 src_reg src0 = fix_3src_operand(this->result);
1148
1149 mul->operands[0]->accept(this);
1150 src_reg src1 = fix_3src_operand(this->result);
1151
1152 mul->operands[1]->accept(this);
1153 src_reg src2 = fix_3src_operand(this->result);
1154
1155 this->result = src_reg(this, ir->type);
1156 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1157
1158 return true;
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1163 {
1164 /* This optimization relies on CMP setting the destination to 0 when
1165 * false. Early hardware only sets the least significant bit, and
1166 * leaves the other bits undefined. So we can't use it.
1167 */
1168 if (brw->gen < 6)
1169 return false;
1170
1171 ir_expression *const cmp = ir->operands[0]->as_expression();
1172
1173 if (cmp == NULL)
1174 return false;
1175
1176 switch (cmp->operation) {
1177 case ir_binop_less:
1178 case ir_binop_greater:
1179 case ir_binop_lequal:
1180 case ir_binop_gequal:
1181 case ir_binop_equal:
1182 case ir_binop_nequal:
1183 break;
1184
1185 default:
1186 return false;
1187 }
1188
1189 cmp->operands[0]->accept(this);
1190 const src_reg cmp_src0 = this->result;
1191
1192 cmp->operands[1]->accept(this);
1193 const src_reg cmp_src1 = this->result;
1194
1195 this->result = src_reg(this, ir->type);
1196
1197 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1198 brw_conditional_for_comparison(cmp->operation)));
1199
1200 /* If the comparison is false, this->result will just happen to be zero.
1201 */
1202 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1203 this->result, src_reg(1.0f));
1204 inst->predicate = BRW_PREDICATE_NORMAL;
1205 inst->predicate_inverse = true;
1206
1207 return true;
1208 }
1209
1210 void
1211 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1212 src_reg src0, src_reg src1)
1213 {
1214 vec4_instruction *inst;
1215
1216 if (brw->gen >= 6) {
1217 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1218 inst->conditional_mod = conditionalmod;
1219 } else {
1220 emit(CMP(dst, src0, src1, conditionalmod));
1221
1222 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1223 inst->predicate = BRW_PREDICATE_NORMAL;
1224 }
1225 }
1226
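/* Editor's sketch of the two paths above for min(a, b), i.e.
 * conditionalmod == BRW_CONDITIONAL_L:
 *
 *    gen6+:   SEL.l  dst, a, b        // SEL with embedded comparison
 *    gen4/5:  CMP.l  dst, a, b        // sets the flag register
 *             (+f0) SEL dst, a, b     // predicated SEL picks a or b
 */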
1227 void
1228 vec4_visitor::emit_lrp(const dst_reg &dst,
1229 const src_reg &x, const src_reg &y, const src_reg &a)
1230 {
1231 if (brw->gen >= 6) {
1232 /* Note that the instruction's argument order is reversed from GLSL
1233 * and the IR.
1234 */
1235 emit(LRP(dst,
1236 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1237 } else {
1238 /* Earlier generations don't support three source operations, so we
1239 * need to emit x*(1-a) + y*a.
1240 */
1241 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1242 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1243 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1244 y_times_a.writemask = dst.writemask;
1245 one_minus_a.writemask = dst.writemask;
1246 x_times_one_minus_a.writemask = dst.writemask;
1247
1248 emit(MUL(y_times_a, y, a));
1249 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1250 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1251 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1252 }
1253 }
1254
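/* Worked example for the pre-gen6 path (editor's addition): for
 * lrp(x = 2.0, y = 10.0, a = 0.25) the emitted sequence computes
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * matching the gen6+ LRP instruction (note its reversed argument order).
 */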
1255 void
1256 vec4_visitor::visit(ir_expression *ir)
1257 {
1258 unsigned int operand;
1259 src_reg op[Elements(ir->operands)];
1260 src_reg result_src;
1261 dst_reg result_dst;
1262 vec4_instruction *inst;
1263
1264 if (ir->operation == ir_binop_add) {
1265 if (try_emit_mad(ir))
1266 return;
1267 }
1268
1269 if (ir->operation == ir_unop_b2f) {
1270 if (try_emit_b2f_of_compare(ir))
1271 return;
1272 }
1273
1274 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1275 this->result.file = BAD_FILE;
1276 ir->operands[operand]->accept(this);
1277 if (this->result.file == BAD_FILE) {
1278 fprintf(stderr, "Failed to get tree for expression operand:\n");
1279 ir->operands[operand]->fprint(stderr);
1280 exit(1);
1281 }
1282 op[operand] = this->result;
1283
1284 /* Matrix expression operands should have been broken down to vector
1285 * operations already.
1286 */
1287 assert(!ir->operands[operand]->type->is_matrix());
1288 }
1289
1290 int vector_elements = ir->operands[0]->type->vector_elements;
1291 if (ir->operands[1]) {
1292 vector_elements = MAX2(vector_elements,
1293 ir->operands[1]->type->vector_elements);
1294 }
1295
1296 this->result.file = BAD_FILE;
1297
1298 /* Storage for our result. Ideally for an assignment we'd be using
1299 * the actual storage for the result here, instead.
1300 */
1301 result_src = src_reg(this, ir->type);
1302 /* convenience for the emit functions below. */
1303 result_dst = dst_reg(result_src);
1304 /* If nothing special happens, this is the result. */
1305 this->result = result_src;
1306 /* Limit writes to the channels that will be used by result_src later.
1307 * This does limit this temp's use as a temporary for multi-instruction
1308 * sequences.
1309 */
1310 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1311
1312 switch (ir->operation) {
1313 case ir_unop_logic_not:
1314 if (ctx->Const.UniformBooleanTrue != 1) {
1315 emit(NOT(result_dst, op[0]));
1316 } else {
1317 emit(XOR(result_dst, op[0], src_reg(1)));
1318 }
1319 break;
1320 case ir_unop_neg:
1321 op[0].negate = !op[0].negate;
1322 emit(MOV(result_dst, op[0]));
1323 break;
1324 case ir_unop_abs:
1325 op[0].abs = true;
1326 op[0].negate = false;
1327 emit(MOV(result_dst, op[0]));
1328 break;
1329
1330 case ir_unop_sign:
1331 if (ir->type->is_float()) {
1332 /* AND(val, 0x80000000) gives the sign bit.
1333 *
1334 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1335 * zero.
1336 */
1337 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1338
1339 op[0].type = BRW_REGISTER_TYPE_UD;
1340 result_dst.type = BRW_REGISTER_TYPE_UD;
1341 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1342
1343 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1344 inst->predicate = BRW_PREDICATE_NORMAL;
1345
1346 this->result.type = BRW_REGISTER_TYPE_F;
1347 } else {
1348 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1349 * -> non-negative val generates 0x00000000.
1350 * Predicated OR sets 1 if val is positive.
1351 */
1352 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1353
1354 emit(ASR(result_dst, op[0], src_reg(31)));
1355
1356 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1357 inst->predicate = BRW_PREDICATE_NORMAL;
1358 }
1359 break;
1360
1361 case ir_unop_rcp:
1362 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1363 break;
1364
1365 case ir_unop_exp2:
1366 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1367 break;
1368 case ir_unop_log2:
1369 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1370 break;
1371 case ir_unop_exp:
1372 case ir_unop_log:
1373 unreachable("not reached: should be handled by ir_explog_to_explog2");
1374 case ir_unop_sin:
1375 case ir_unop_sin_reduced:
1376 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1377 break;
1378 case ir_unop_cos:
1379 case ir_unop_cos_reduced:
1380 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1381 break;
1382
1383 case ir_unop_dFdx:
1384 case ir_unop_dFdx_coarse:
1385 case ir_unop_dFdx_fine:
1386 case ir_unop_dFdy:
1387 case ir_unop_dFdy_coarse:
1388 case ir_unop_dFdy_fine:
1389 unreachable("derivatives not valid in vertex shader");
1390
1391 case ir_unop_bitfield_reverse:
1392 emit(BFREV(result_dst, op[0]));
1393 break;
1394 case ir_unop_bit_count:
1395 emit(CBIT(result_dst, op[0]));
1396 break;
1397 case ir_unop_find_msb: {
1398 src_reg temp = src_reg(this, glsl_type::uint_type);
1399
1400 inst = emit(FBH(dst_reg(temp), op[0]));
1401 inst->dst.writemask = WRITEMASK_XYZW;
1402
1403 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1404 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1405 * subtract the result from 31 to convert the MSB count into an LSB count.
1406 */
1407
1408 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1409 temp.swizzle = BRW_SWIZZLE_NOOP;
1410 emit(MOV(result_dst, temp));
1411
1412 src_reg src_tmp = src_reg(result_dst);
1413 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1414
1415 src_tmp.negate = true;
1416 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1417 inst->predicate = BRW_PREDICATE_NORMAL;
1418 break;
1419 }
1420 case ir_unop_find_lsb:
1421 emit(FBL(result_dst, op[0]));
1422 break;
1423 case ir_unop_saturate:
1424 inst = emit(MOV(result_dst, op[0]));
1425 inst->saturate = true;
1426 break;
1427
1428 case ir_unop_noise:
1429 unreachable("not reached: should be handled by lower_noise");
1430
1431 case ir_binop_add:
1432 emit(ADD(result_dst, op[0], op[1]));
1433 break;
1434 case ir_binop_sub:
1435 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1436
1437 case ir_binop_mul:
1438 if (brw->gen < 8 && ir->type->is_integer()) {
1439 /* For integer multiplication, the MUL uses the low 16 bits of one of
1440 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1441 * accumulates in the contribution of the upper 16 bits of that
1442 * operand. If we can determine that one of the args is in the low
1443 * 16 bits, though, we can just emit a single MUL.
1444 */
1445 if (ir->operands[0]->is_uint16_constant()) {
1446 if (brw->gen < 7)
1447 emit(MUL(result_dst, op[0], op[1]));
1448 else
1449 emit(MUL(result_dst, op[1], op[0]));
1450 } else if (ir->operands[1]->is_uint16_constant()) {
1451 if (brw->gen < 7)
1452 emit(MUL(result_dst, op[1], op[0]));
1453 else
1454 emit(MUL(result_dst, op[0], op[1]));
1455 } else {
1456 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1457
1458 emit(MUL(acc, op[0], op[1]));
1459 emit(MACH(dst_null_d(), op[0], op[1]));
1460 emit(MOV(result_dst, src_reg(acc)));
1461 }
1462 } else {
1463 emit(MUL(result_dst, op[0], op[1]));
1464 }
1465 break;
1466 case ir_binop_imul_high: {
1467 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1468
1469 emit(MUL(acc, op[0], op[1]));
1470 emit(MACH(result_dst, op[0], op[1]));
1471 break;
1472 }
1473 case ir_binop_div:
1474 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1475 assert(ir->type->is_integer());
1476 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1477 break;
1478 case ir_binop_carry: {
1479 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1480
1481 emit(ADDC(dst_null_ud(), op[0], op[1]));
1482 emit(MOV(result_dst, src_reg(acc)));
1483 break;
1484 }
1485 case ir_binop_borrow: {
1486 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1487
1488 emit(SUBB(dst_null_ud(), op[0], op[1]));
1489 emit(MOV(result_dst, src_reg(acc)));
1490 break;
1491 }
1492 case ir_binop_mod:
1493 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1494 assert(ir->type->is_integer());
1495 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1496 break;
1497
1498 case ir_binop_less:
1499 case ir_binop_greater:
1500 case ir_binop_lequal:
1501 case ir_binop_gequal:
1502 case ir_binop_equal:
1503 case ir_binop_nequal: {
1504 emit(CMP(result_dst, op[0], op[1],
1505 brw_conditional_for_comparison(ir->operation)));
1506 if (ctx->Const.UniformBooleanTrue == 1) {
1507 emit(AND(result_dst, result_src, src_reg(1)));
1508 }
1509 break;
1510 }
1511
1512 case ir_binop_all_equal:
1513 /* "==" operator producing a scalar boolean. */
1514 if (ir->operands[0]->type->is_vector() ||
1515 ir->operands[1]->type->is_vector()) {
1516 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1517 emit(MOV(result_dst, src_reg(0)));
1518 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1519 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1520 } else {
1521 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1522 if (ctx->Const.UniformBooleanTrue == 1) {
1523 emit(AND(result_dst, result_src, src_reg(1)));
1524 }
1525 }
1526 break;
1527 case ir_binop_any_nequal:
1528 /* "!=" operator producing a scalar boolean. */
1529 if (ir->operands[0]->type->is_vector() ||
1530 ir->operands[1]->type->is_vector()) {
1531 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1532
1533 emit(MOV(result_dst, src_reg(0)));
1534 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1535 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1536 } else {
1537 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1538 if (ctx->Const.UniformBooleanTrue == 1) {
1539 emit(AND(result_dst, result_src, src_reg(1)));
1540 }
1541 }
1542 break;
1543
1544 case ir_unop_any:
1545 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1546 emit(MOV(result_dst, src_reg(0)));
1547
1548 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1549 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1550 break;
1551
1552 case ir_binop_logic_xor:
1553 emit(XOR(result_dst, op[0], op[1]));
1554 break;
1555
1556 case ir_binop_logic_or:
1557 emit(OR(result_dst, op[0], op[1]));
1558 break;
1559
1560 case ir_binop_logic_and:
1561 emit(AND(result_dst, op[0], op[1]));
1562 break;
1563
1564 case ir_binop_dot:
1565 assert(ir->operands[0]->type->is_vector());
1566 assert(ir->operands[0]->type == ir->operands[1]->type);
1567 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1568 break;
1569
1570 case ir_unop_sqrt:
1571 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1572 break;
1573 case ir_unop_rsq:
1574 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1575 break;
1576
1577 case ir_unop_bitcast_i2f:
1578 case ir_unop_bitcast_u2f:
1579 this->result = op[0];
1580 this->result.type = BRW_REGISTER_TYPE_F;
1581 break;
1582
1583 case ir_unop_bitcast_f2i:
1584 this->result = op[0];
1585 this->result.type = BRW_REGISTER_TYPE_D;
1586 break;
1587
1588 case ir_unop_bitcast_f2u:
1589 this->result = op[0];
1590 this->result.type = BRW_REGISTER_TYPE_UD;
1591 break;
1592
1593 case ir_unop_i2f:
1594 case ir_unop_i2u:
1595 case ir_unop_u2i:
1596 case ir_unop_u2f:
1597 case ir_unop_f2i:
1598 case ir_unop_f2u:
1599 emit(MOV(result_dst, op[0]));
1600 break;
1601 case ir_unop_b2i:
1602 if (ctx->Const.UniformBooleanTrue != 1) {
1603 emit(AND(result_dst, op[0], src_reg(1)));
1604 } else {
1605 emit(MOV(result_dst, op[0]));
1606 }
1607 break;
1608 case ir_unop_b2f:
1609 if (ctx->Const.UniformBooleanTrue != 1) {
1610 op[0].type = BRW_REGISTER_TYPE_UD;
1611 result_dst.type = BRW_REGISTER_TYPE_UD;
1612 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1613 result_dst.type = BRW_REGISTER_TYPE_F;
1614 } else {
1615 emit(MOV(result_dst, op[0]));
1616 }
1617 break;
1618 case ir_unop_f2b:
1619 case ir_unop_i2b:
1620 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1621 if (ctx->Const.UniformBooleanTrue == 1) {
1622 emit(AND(result_dst, result_src, src_reg(1)));
1623 }
1624 break;
1625
1626 case ir_unop_trunc:
1627 emit(RNDZ(result_dst, op[0]));
1628 break;
1629 case ir_unop_ceil:
1630 op[0].negate = !op[0].negate;
1631 inst = emit(RNDD(result_dst, op[0]));
1632 this->result.negate = true;
1633 break;
1634 case ir_unop_floor:
1635 inst = emit(RNDD(result_dst, op[0]));
1636 break;
1637 case ir_unop_fract:
1638 inst = emit(FRC(result_dst, op[0]));
1639 break;
1640 case ir_unop_round_even:
1641 emit(RNDE(result_dst, op[0]));
1642 break;
1643
1644 case ir_binop_min:
1645 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1646 break;
1647 case ir_binop_max:
1648 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1649 break;
1650
1651 case ir_binop_pow:
1652 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1653 break;
1654
1655 case ir_unop_bit_not:
1656 inst = emit(NOT(result_dst, op[0]));
1657 break;
1658 case ir_binop_bit_and:
1659 inst = emit(AND(result_dst, op[0], op[1]));
1660 break;
1661 case ir_binop_bit_xor:
1662 inst = emit(XOR(result_dst, op[0], op[1]));
1663 break;
1664 case ir_binop_bit_or:
1665 inst = emit(OR(result_dst, op[0], op[1]));
1666 break;
1667
1668 case ir_binop_lshift:
1669 inst = emit(SHL(result_dst, op[0], op[1]));
1670 break;
1671
1672 case ir_binop_rshift:
1673 if (ir->type->base_type == GLSL_TYPE_INT)
1674 inst = emit(ASR(result_dst, op[0], op[1]));
1675 else
1676 inst = emit(SHR(result_dst, op[0], op[1]));
1677 break;
1678
1679 case ir_binop_bfm:
1680 emit(BFI1(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_ubo_load: {
1684 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1685 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1686 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1687 src_reg offset;
1688
1689 /* Now, load the vector from that offset. */
1690 assert(ir->type->is_vector() || ir->type->is_scalar());
1691
1692 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1693 packed_consts.type = result.type;
1694 src_reg surf_index;
1695
1696 if (const_uniform_block) {
1697 /* The block index is a constant, so just emit the binding table entry
1698 * as an immediate.
1699 */
1700 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1701 const_uniform_block->value.u[0]);
1702 } else {
1703 /* The block index is not a constant. Evaluate the index expression
1704 * per-channel and add the base UBO index; the generator will select
1705 * a value from any live channel.
1706 */
1707 surf_index = src_reg(this, glsl_type::uint_type);
1708 emit(ADD(dst_reg(surf_index), op[0],
1709 src_reg(prog_data->base.binding_table.ubo_start)));
1710
1711 /* Assume this may touch any UBO. It would be nice to provide
1712 * a tighter bound, but the array information is already lowered away.
1713 */
1714 brw_mark_surface_used(&prog_data->base,
1715 prog_data->base.binding_table.ubo_start +
1716 shader_prog->NumUniformBlocks - 1);
1717 }
1718
1719 if (const_offset_ir) {
1720 if (brw->gen >= 8) {
1721 /* Store the offset in a GRF so we can send-from-GRF. */
1722 offset = src_reg(this, glsl_type::int_type);
1723 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1724 } else {
1725 /* Immediates are fine on older generations since they'll be moved
1726 * to a (potentially fake) MRF at the generator level.
1727 */
1728 offset = src_reg(const_offset / 16);
1729 }
1730 } else {
1731 offset = src_reg(this, glsl_type::uint_type);
1732 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1733 }
1734
1735 if (brw->gen >= 7) {
1736 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1737 grf_offset.type = offset.type;
1738
1739 emit(MOV(grf_offset, offset));
1740
1741 emit(new(mem_ctx) vec4_instruction(this,
1742 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1743 dst_reg(packed_consts),
1744 surf_index,
1745 src_reg(grf_offset)));
1746 } else {
1747 vec4_instruction *pull =
1748 emit(new(mem_ctx) vec4_instruction(this,
1749 VS_OPCODE_PULL_CONSTANT_LOAD,
1750 dst_reg(packed_consts),
1751 surf_index,
1752 offset));
1753 pull->base_mrf = 14;
1754 pull->mlen = 1;
1755 }
1756
1757 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1758 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1759 const_offset % 16 / 4,
1760 const_offset % 16 / 4,
1761 const_offset % 16 / 4);
1762
1763 /* UBO bools are any nonzero int. We need to convert them to use the
1764 * value of true stored in ctx->Const.UniformBooleanTrue.
1765 */
1766 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1767 emit(CMP(result_dst, packed_consts, src_reg(0u),
1768 BRW_CONDITIONAL_NZ));
1769 if (ctx->Const.UniformBooleanTrue == 1) {
1770 emit(AND(result_dst, result, src_reg(1)));
1771 }
1772 } else {
1773 emit(MOV(result_dst, packed_consts));
1774 }
1775 break;
1776 }
1777
1778 case ir_binop_vector_extract:
1779 unreachable("should have been lowered by vec_index_to_cond_assign");
1780
1781 case ir_triop_fma:
1782 op[0] = fix_3src_operand(op[0]);
1783 op[1] = fix_3src_operand(op[1]);
1784 op[2] = fix_3src_operand(op[2]);
1785 /* Note that the instruction's argument order is reversed from GLSL
1786 * and the IR.
1787 */
1788 emit(MAD(result_dst, op[2], op[1], op[0]));
1789 break;
1790
1791 case ir_triop_lrp:
1792 emit_lrp(result_dst, op[0], op[1], op[2]);
1793 break;
1794
1795 case ir_triop_csel:
1796 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1797 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1798 inst->predicate = BRW_PREDICATE_NORMAL;
1799 break;
1800
1801 case ir_triop_bfi:
1802 op[0] = fix_3src_operand(op[0]);
1803 op[1] = fix_3src_operand(op[1]);
1804 op[2] = fix_3src_operand(op[2]);
1805 emit(BFI2(result_dst, op[0], op[1], op[2]));
1806 break;
1807
1808 case ir_triop_bitfield_extract:
1809 op[0] = fix_3src_operand(op[0]);
1810 op[1] = fix_3src_operand(op[1]);
1811 op[2] = fix_3src_operand(op[2]);
1812 /* Note that the instruction's argument order is reversed from GLSL
1813 * and the IR.
1814 */
1815 emit(BFE(result_dst, op[2], op[1], op[0]));
1816 break;
1817
1818 case ir_triop_vector_insert:
1819 unreachable("should have been lowered by lower_vector_insert");
1820
1821 case ir_quadop_bitfield_insert:
1822 unreachable("not reached: should be handled by "
1823 "bitfield_insert_to_bfm_bfi\n");
1824
1825 case ir_quadop_vector:
1826 unreachable("not reached: should be handled by lower_quadop_vector");
1827
1828 case ir_unop_pack_half_2x16:
1829 emit_pack_half_2x16(result_dst, op[0]);
1830 break;
1831 case ir_unop_unpack_half_2x16:
1832 emit_unpack_half_2x16(result_dst, op[0]);
1833 break;
1834 case ir_unop_pack_snorm_2x16:
1835 case ir_unop_pack_snorm_4x8:
1836 case ir_unop_pack_unorm_2x16:
1837 case ir_unop_pack_unorm_4x8:
1838 case ir_unop_unpack_snorm_2x16:
1839 case ir_unop_unpack_snorm_4x8:
1840 case ir_unop_unpack_unorm_2x16:
1841 case ir_unop_unpack_unorm_4x8:
1842 unreachable("not reached: should be handled by lower_packing_builtins");
1843 case ir_unop_unpack_half_2x16_split_x:
1844 case ir_unop_unpack_half_2x16_split_y:
1845 case ir_binop_pack_half_2x16_split:
1846 case ir_unop_interpolate_at_centroid:
1847 case ir_binop_interpolate_at_sample:
1848 case ir_binop_interpolate_at_offset:
1849 unreachable("not reached: should not occur in vertex shader");
1850 case ir_binop_ldexp:
1851 unreachable("not reached: should be handled by ldexp_to_arith()");
1852 }
1853 }
1854
1855
1856 void
1857 vec4_visitor::visit(ir_swizzle *ir)
1858 {
1859 src_reg src;
1860 int i = 0;
1861 int swizzle[4];
1862
1863 /* Note that this is only swizzles in expressions, not those on the left
1864 * hand side of an assignment, which do write masking. See ir_assignment
1865 * for that.
1866 */
1867
1868 ir->val->accept(this);
1869 src = this->result;
1870 assert(src.file != BAD_FILE);
1871
1872 for (i = 0; i < ir->type->vector_elements; i++) {
1873 switch (i) {
1874 case 0:
1875 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1876 break;
1877 case 1:
1878 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1879 break;
1880 case 2:
1881 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1882 break;
1883 case 3:
1884 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1885 break;
1886 }
1887 }
1888 for (; i < 4; i++) {
1889 /* Replicate the last channel out. */
1890 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1891 }
1892
1893 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1894
1895 this->result = src;
1896 }
1897
1898 void
1899 vec4_visitor::visit(ir_dereference_variable *ir)
1900 {
1901 const struct glsl_type *type = ir->type;
1902 dst_reg *reg = variable_storage(ir->var);
1903
1904 if (!reg) {
1905 fail("Failed to find variable storage for %s\n", ir->var->name);
1906 this->result = src_reg(brw_null_reg());
1907 return;
1908 }
1909
1910 this->result = src_reg(*reg);
1911
1912 /* System values get their swizzle from the dst_reg writemask */
1913 if (ir->var->data.mode == ir_var_system_value)
1914 return;
1915
1916 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1917 this->result.swizzle = swizzle_for_size(type->vector_elements);
1918 }
1919
1920
1921 int
1922 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1923 {
1924 /* Under normal circumstances array elements are stored consecutively, so
1925 * the stride is equal to the size of the array element.
1926 */
1927 return type_size(ir->type);
1928 }
1929
1930
1931 void
1932 vec4_visitor::visit(ir_dereference_array *ir)
1933 {
1934 ir_constant *constant_index;
1935 src_reg src;
1936 int array_stride = compute_array_stride(ir);
1937
1938 constant_index = ir->array_index->constant_expression_value();
1939
1940 ir->array->accept(this);
1941 src = this->result;
1942
1943 if (constant_index) {
1944 src.reg_offset += constant_index->value.i[0] * array_stride;
1945 } else {
1946 /* Variable index array dereference. It eats the "vec4" of the
1947 * base of the array and an index that offsets the Mesa register
1948 * index.
1949 */
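/* For example (illustrative), for arr[i] where each element occupies two
 * vec4s, the index is first multiplied by the stride (2) and then folded
 * into any reladdr the array base already carried.
 */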
1950 ir->array_index->accept(this);
1951
1952 src_reg index_reg;
1953
1954 if (array_stride == 1) {
1955 index_reg = this->result;
1956 } else {
1957 index_reg = src_reg(this, glsl_type::int_type);
1958
1959 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1960 }
1961
1962 if (src.reladdr) {
1963 src_reg temp = src_reg(this, glsl_type::int_type);
1964
1965 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1966
1967 index_reg = temp;
1968 }
1969
1970 src.reladdr = ralloc(mem_ctx, src_reg);
1971 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1972 }
1973
1974 /* If the type is smaller than a vec4, replicate the last channel out. */
1975 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1976 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1977 else
1978 src.swizzle = BRW_SWIZZLE_NOOP;
1979 src.type = brw_type_for_base_type(ir->type);
1980
1981 this->result = src;
1982 }
1983
1984 void
1985 vec4_visitor::visit(ir_dereference_record *ir)
1986 {
1987 unsigned int i;
1988 const glsl_type *struct_type = ir->record->type;
1989 int offset = 0;
1990
1991 ir->record->accept(this);
1992
1993 for (i = 0; i < struct_type->length; i++) {
1994 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1995 break;
1996 offset += type_size(struct_type->fields.structure[i].type);
1997 }
1998
1999 /* If the type is smaller than a vec4, replicate the last channel out. */
2000 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2001 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2002 else
2003 this->result.swizzle = BRW_SWIZZLE_NOOP;
2004 this->result.type = brw_type_for_base_type(ir->type);
2005
2006 this->result.reg_offset += offset;
2007 }
2008
2009 /**
2010 * We want to be careful in assignment setup to hit the actual storage
2011 * instead of potentially using a temporary like we might with the
2012 * ir_dereference handler.
2013 */
2014 static dst_reg
2015 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2016 {
2017 /* The LHS must be a dereference. If the LHS is a variably-indexed array
2018 * access of a vector, it must be separated into a series of conditional moves
2019 * before reaching this point (see ir_vec_index_to_cond_assign).
2020 */
2021 assert(ir->as_dereference());
2022 ir_dereference_array *deref_array = ir->as_dereference_array();
2023 if (deref_array) {
2024 assert(!deref_array->array->type->is_vector());
2025 }
2026
2027 /* Use the rvalue deref handler for the most part. We'll ignore its
2028 * swizzles here and express any LHS swizzle through the writemask instead.
2029 */
2030 ir->accept(v);
2031 return dst_reg(v->result);
2032 }
2033
2034 void
2035 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2036 const struct glsl_type *type,
2037 enum brw_predicate predicate)
2038 {
2039 if (type->base_type == GLSL_TYPE_STRUCT) {
2040 for (unsigned int i = 0; i < type->length; i++) {
2041 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2042 }
2043 return;
2044 }
2045
2046 if (type->is_array()) {
2047 for (unsigned int i = 0; i < type->length; i++) {
2048 emit_block_move(dst, src, type->fields.array, predicate);
2049 }
2050 return;
2051 }
2052
2053 if (type->is_matrix()) {
2054 const struct glsl_type *vec_type;
2055
2056 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2057 type->vector_elements, 1);
2058
2059 for (int i = 0; i < type->matrix_columns; i++) {
2060 emit_block_move(dst, src, vec_type, predicate);
2061 }
2062 return;
2063 }
2064
2065 assert(type->is_scalar() || type->is_vector());
2066
2067 dst->type = brw_type_for_base_type(type);
2068 src->type = dst->type;
2069
2070 dst->writemask = (1 << type->vector_elements) - 1;
2071
2072 src->swizzle = swizzle_for_size(type->vector_elements);
2073
2074 vec4_instruction *inst = emit(MOV(*dst, *src));
2075 inst->predicate = predicate;
2076
2077 dst->reg_offset++;
2078 src->reg_offset++;
2079 }
2080
2081
2082 /* If the RHS processing resulted in an instruction generating a
2083 * temporary value, and it would be easy to rewrite the instruction to
2084 * generate its result right into the LHS instead, do so. This ends
2085 * up reliably removing instructions where it can be tricky to do so
2086 * later without real UD chain information.
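 *
 * For example (illustrative): for `a = b + c;` the RHS emits an ADD into a
 * temporary GRF; if the checks below pass, that ADD's destination is
 * retargeted to `a` and the trailing MOV never has to be emitted.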
2087 */
2088 bool
2089 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2090 dst_reg dst,
2091 src_reg src,
2092 vec4_instruction *pre_rhs_inst,
2093 vec4_instruction *last_rhs_inst)
2094 {
2095 /* This could be supported, but it would take more smarts. */
2096 if (ir->condition)
2097 return false;
2098
2099 if (pre_rhs_inst == last_rhs_inst)
2100 return false; /* No instructions generated to work with. */
2101
2102 /* Make sure the last instruction generated our source reg. */
2103 if (src.file != GRF ||
2104 src.file != last_rhs_inst->dst.file ||
2105 src.reg != last_rhs_inst->dst.reg ||
2106 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2107 src.reladdr ||
2108 src.abs ||
2109 src.negate ||
2110 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2111 return false;
2112
2113 /* Check that the last instruction fully initialized the channels
2114 * we want to use, in the order we want to use them. We could
2115 * potentially reswizzle the operands of many instructions so that
2116 * we could handle out of order channels, but don't yet.
2117 */
2118
2119 for (unsigned i = 0; i < 4; i++) {
2120 if (dst.writemask & (1 << i)) {
2121 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2122 return false;
2123
2124 if (BRW_GET_SWZ(src.swizzle, i) != i)
2125 return false;
2126 }
2127 }
2128
2129 /* Success! Rewrite the instruction. */
2130 last_rhs_inst->dst.file = dst.file;
2131 last_rhs_inst->dst.reg = dst.reg;
2132 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2133 last_rhs_inst->dst.reladdr = dst.reladdr;
2134 last_rhs_inst->dst.writemask &= dst.writemask;
2135
2136 return true;
2137 }
2138
2139 void
2140 vec4_visitor::visit(ir_assignment *ir)
2141 {
2142 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2143 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2144
2145 if (!ir->lhs->type->is_scalar() &&
2146 !ir->lhs->type->is_vector()) {
2147 ir->rhs->accept(this);
2148 src_reg src = this->result;
2149
2150 if (ir->condition) {
2151 emit_bool_to_cond_code(ir->condition, &predicate);
2152 }
2153
2154 /* emit_block_move doesn't account for swizzles in the source register.
2155 * This should be ok, since the source register is a structure or an
2156 * array, and those can't be swizzled. But double-check to be sure.
2157 */
2158 assert(src.swizzle ==
2159 (ir->rhs->type->is_matrix()
2160 ? swizzle_for_size(ir->rhs->type->vector_elements)
2161 : BRW_SWIZZLE_NOOP));
2162
2163 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2164 return;
2165 }
2166
2167 /* Now we're down to just a scalar/vector with writemasks. */
2168 int i;
2169
2170 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2171 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2172
2173 ir->rhs->accept(this);
2174
2175 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2176
2177 src_reg src = this->result;
2178
2179 int swizzles[4];
2180 int first_enabled_chan = 0;
2181 int src_chan = 0;
2182
2183 assert(ir->lhs->type->is_vector() ||
2184 ir->lhs->type->is_scalar());
2185 dst.writemask = ir->write_mask;
2186
2187 for (int i = 0; i < 4; i++) {
2188 if (dst.writemask & (1 << i)) {
2189 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2190 break;
2191 }
2192 }
2193
2194 /* Swizzle a small RHS vector into the channels being written.
2195 *
2196 * GLSL IR treats write_mask as dictating how many channels are
2197 * present on the RHS, while in our instructions we need to make
2198 * those channels appear in the slots of the vec4 they're written to.
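 *
 * For example (illustrative), writing a vec2 RHS (swizzle .xyyy) into the
 * .zw channels of a vec4 produces the swizzle (y, y, x, y): the RHS .x
 * lands in channel z, the RHS .y lands in channel w, and the unwritten
 * channels just replicate an already-enabled channel.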
2199 */
2200 for (int i = 0; i < 4; i++) {
2201 if (dst.writemask & (1 << i))
2202 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2203 else
2204 swizzles[i] = first_enabled_chan;
2205 }
2206 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2207 swizzles[2], swizzles[3]);
2208
2209 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2210 return;
2211 }
2212
2213 if (ir->condition) {
2214 emit_bool_to_cond_code(ir->condition, &predicate);
2215 }
2216
2217 for (i = 0; i < type_size(ir->lhs->type); i++) {
2218 vec4_instruction *inst = emit(MOV(dst, src));
2219 inst->predicate = predicate;
2220
2221 dst.reg_offset++;
2222 src.reg_offset++;
2223 }
2224 }
2225
2226 void
2227 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2228 {
2229 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2230 foreach_in_list(ir_constant, field_value, &ir->components) {
2231 emit_constant_values(dst, field_value);
2232 }
2233 return;
2234 }
2235
2236 if (ir->type->is_array()) {
2237 for (unsigned int i = 0; i < ir->type->length; i++) {
2238 emit_constant_values(dst, ir->array_elements[i]);
2239 }
2240 return;
2241 }
2242
2243 if (ir->type->is_matrix()) {
2244 for (int i = 0; i < ir->type->matrix_columns; i++) {
2245 float *vec = &ir->value.f[i * ir->type->vector_elements];
2246
2247 for (int j = 0; j < ir->type->vector_elements; j++) {
2248 dst->writemask = 1 << j;
2249 dst->type = BRW_REGISTER_TYPE_F;
2250
2251 emit(MOV(*dst, src_reg(vec[j])));
2252 }
2253 dst->reg_offset++;
2254 }
2255 return;
2256 }
2257
2258 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2259
2260 for (int i = 0; i < ir->type->vector_elements; i++) {
2261 if (!(remaining_writemask & (1 << i)))
2262 continue;
2263
2264 dst->writemask = 1 << i;
2265 dst->type = brw_type_for_base_type(ir->type);
2266
2267 /* Find other components that match the one we're about to
2268 * write. Emits fewer instructions for things like vec4(0.5,
2269 * 1.5, 1.5, 1.5).
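 * In that case only two MOVs are emitted: one with writemask .x for the
 * 0.5 and one with writemask .yzw for the repeated 1.5.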
2270 */
2271 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2272 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2273 if (ir->value.b[i] == ir->value.b[j])
2274 dst->writemask |= (1 << j);
2275 } else {
2276 /* u, i, and f storage all line up, so no need for a
2277 * switch case for comparing each type.
2278 */
2279 if (ir->value.u[i] == ir->value.u[j])
2280 dst->writemask |= (1 << j);
2281 }
2282 }
2283
2284 switch (ir->type->base_type) {
2285 case GLSL_TYPE_FLOAT:
2286 emit(MOV(*dst, src_reg(ir->value.f[i])));
2287 break;
2288 case GLSL_TYPE_INT:
2289 emit(MOV(*dst, src_reg(ir->value.i[i])));
2290 break;
2291 case GLSL_TYPE_UINT:
2292 emit(MOV(*dst, src_reg(ir->value.u[i])));
2293 break;
2294 case GLSL_TYPE_BOOL:
2295 emit(MOV(*dst,
2296 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2297 : 0)));
2298 break;
2299 default:
2300 unreachable("Non-float/uint/int/bool constant");
2301 }
2302
2303 remaining_writemask &= ~dst->writemask;
2304 }
2305 dst->reg_offset++;
2306 }
2307
2308 void
2309 vec4_visitor::visit(ir_constant *ir)
2310 {
2311 dst_reg dst = dst_reg(this, ir->type);
2312 this->result = src_reg(dst);
2313
2314 emit_constant_values(&dst, ir);
2315 }
2316
2317 void
2318 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2319 {
2320 ir_dereference *deref = static_cast<ir_dereference *>(
2321 ir->actual_parameters.get_head());
2322 ir_variable *location = deref->variable_referenced();
2323 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2324 location->data.binding);
2325
2326 /* Calculate the surface offset */
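/* For a dereference like counters[i] (illustrative), the offset is the
 * array index times ATOMIC_COUNTER_SIZE plus the counter's declared byte
 * offset within the buffer; a non-array counter just uses its offset.
 */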
2327 src_reg offset(this, glsl_type::uint_type);
2328 ir_dereference_array *deref_array = deref->as_dereference_array();
2329 if (deref_array) {
2330 deref_array->array_index->accept(this);
2331
2332 src_reg tmp(this, glsl_type::uint_type);
2333 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2334 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2335 } else {
2336 offset = location->data.atomic.offset;
2337 }
2338
2339 /* Emit the appropriate machine instruction */
2340 const char *callee = ir->callee->function_name();
2341 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2342
2343 if (!strcmp("__intrinsic_atomic_read", callee)) {
2344 emit_untyped_surface_read(surf_index, dst, offset);
2345
2346 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2347 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2348 src_reg(), src_reg());
2349
2350 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2351 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2352 src_reg(), src_reg());
2353 }
2354 }
2355
2356 void
2357 vec4_visitor::visit(ir_call *ir)
2358 {
2359 const char *callee = ir->callee->function_name();
2360
2361 if (!strcmp("__intrinsic_atomic_read", callee) ||
2362 !strcmp("__intrinsic_atomic_increment", callee) ||
2363 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2364 visit_atomic_counter_intrinsic(ir);
2365 } else {
2366 unreachable("Unsupported intrinsic.");
2367 }
2368 }
2369
2370 src_reg
2371 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2372 {
2373 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2374 inst->base_mrf = 2;
2375 inst->mlen = 1;
2376 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2377 inst->dst.writemask = WRITEMASK_XYZW;
2378
2379 inst->src[1] = sampler;
2380
2381 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2382 int param_base = inst->base_mrf;
2383 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2384 int zero_mask = 0xf & ~coord_mask;
2385
2386 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2387 coordinate));
2388
2389 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2390 src_reg(0)));
2391
2392 emit(inst);
2393 return src_reg(inst->dst);
2394 }
2395
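/* Only Haswell and Gen8+ can address more than 16 samplers; on those
 * platforms a non-immediate sampler index, or an index of 16 or higher,
 * has to take the "high sampler" message path (summary of the check below).
 */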
2396 static bool
2397 is_high_sampler(struct brw_context *brw, src_reg sampler)
2398 {
2399 if (brw->gen < 8 && !brw->is_haswell)
2400 return false;
2401
2402 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_texture *ir)
2407 {
2408 uint32_t sampler =
2409 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2410
2411 ir_rvalue *nonconst_sampler_index =
2412 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2413
2414 /* Handle non-constant sampler array indexing */
2415 src_reg sampler_reg;
2416 if (nonconst_sampler_index) {
2417 /* The highest sampler which may be used by this operation is
2418 * the last element of the array. Mark it here, because the generator
2419 * doesn't have enough information to determine the bound.
2420 */
2421 uint32_t array_size = ir->sampler->as_dereference_array()
2422 ->array->type->array_size();
2423
2424 uint32_t max_used = sampler + array_size - 1;
2425 if (ir->op == ir_tg4 && brw->gen < 8) {
2426 max_used += prog_data->base.binding_table.gather_texture_start;
2427 } else {
2428 max_used += prog_data->base.binding_table.texture_start;
2429 }
2430
2431 brw_mark_surface_used(&prog_data->base, max_used);
2432
2433 /* Emit code to evaluate the actual indexing expression */
2434 nonconst_sampler_index->accept(this);
2435 dst_reg temp(this, glsl_type::uint_type);
2436 emit(ADD(temp, this->result, src_reg(sampler)))
2437 ->force_writemask_all = true;
2438 sampler_reg = src_reg(temp);
2439 } else {
2440 /* Single sampler, or constant array index; the indexing expression
2441 * is just an immediate.
2442 */
2443 sampler_reg = src_reg(sampler);
2444 }
2445
2446 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2447 * emitting anything other than setting up the constant result.
2448 */
2449 if (ir->op == ir_tg4) {
2450 ir_constant *chan = ir->lod_info.component->as_constant();
2451 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2452 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2453 dst_reg result(this, ir->type);
2454 this->result = src_reg(result);
2455 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2456 return;
2457 }
2458 }
2459
2460 /* Should be lowered by do_lower_texture_projection */
2461 assert(!ir->projector);
2462
2463 /* Should be lowered */
2464 assert(!ir->offset || !ir->offset->type->is_array());
2465
2466 /* Generate code to compute all the subexpression trees. This has to be
2467 * done before loading any values into MRFs for the sampler message since
2468 * generating these values may involve SEND messages that need the MRFs.
2469 */
2470 src_reg coordinate;
2471 if (ir->coordinate) {
2472 ir->coordinate->accept(this);
2473 coordinate = this->result;
2474 }
2475
2476 src_reg shadow_comparitor;
2477 if (ir->shadow_comparitor) {
2478 ir->shadow_comparitor->accept(this);
2479 shadow_comparitor = this->result;
2480 }
2481
2482 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2483 src_reg offset_value;
2484 if (has_nonconstant_offset) {
2485 ir->offset->accept(this);
2486 offset_value = src_reg(this->result);
2487 }
2488
2489 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2490 src_reg lod, dPdx, dPdy, sample_index, mcs;
2491 switch (ir->op) {
2492 case ir_tex:
2493 lod = src_reg(0.0f);
2494 lod_type = glsl_type::float_type;
2495 break;
2496 case ir_txf:
2497 case ir_txl:
2498 case ir_txs:
2499 ir->lod_info.lod->accept(this);
2500 lod = this->result;
2501 lod_type = ir->lod_info.lod->type;
2502 break;
2503 case ir_query_levels:
2504 lod = src_reg(0);
2505 lod_type = glsl_type::int_type;
2506 break;
2507 case ir_txf_ms:
2508 ir->lod_info.sample_index->accept(this);
2509 sample_index = this->result;
2510 sample_index_type = ir->lod_info.sample_index->type;
2511
2512 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2513 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2514 else
2515 mcs = src_reg(0u);
2516 break;
2517 case ir_txd:
2518 ir->lod_info.grad.dPdx->accept(this);
2519 dPdx = this->result;
2520
2521 ir->lod_info.grad.dPdy->accept(this);
2522 dPdy = this->result;
2523
2524 lod_type = ir->lod_info.grad.dPdx->type;
2525 break;
2526 case ir_txb:
2527 case ir_lod:
2528 case ir_tg4:
2529 break;
2530 }
2531
2532 enum opcode opcode;
2533 switch (ir->op) {
2534 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2535 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2536 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2537 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2538 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2539 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2540 case ir_tg4: opcode = has_nonconstant_offset
2541 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2542 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2543 case ir_txb:
2544 unreachable("TXB is not valid for vertex shaders.");
2545 case ir_lod:
2546 unreachable("LOD is not valid for vertex shaders.");
2547 default:
2548 unreachable("Unrecognized tex op");
2549 }
2550
2551 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2552
2553 if (ir->offset != NULL && ir->op != ir_txf)
2554 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2555
2556 /* Stuff the channel select bits in the top of the texture offset */
2557 if (ir->op == ir_tg4)
2558 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2559
2560 /* The message header is necessary for:
2561 * - Gen4 (always)
2562 * - Texel offsets
2563 * - Gather channel selection
2564 * - Sampler indices too large to fit in a 4-bit value.
2565 */
2566 inst->header_present =
2567 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2568 is_high_sampler(brw, sampler_reg);
2569 inst->base_mrf = 2;
2570 inst->mlen = inst->header_present + 1; /* always at least one */
2571 inst->dst = dst_reg(this, ir->type);
2572 inst->dst.writemask = WRITEMASK_XYZW;
2573 inst->shadow_compare = ir->shadow_comparitor != NULL;
2574
2575 inst->src[1] = sampler_reg;
2576
2577 /* MRF for the first parameter */
2578 int param_base = inst->base_mrf + inst->header_present;
2579
2580 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2581 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2582 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2583 } else {
2584 /* Load the coordinate */
2585 /* FINISHME: gl_clamp_mask and saturate */
2586 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2587 int zero_mask = 0xf & ~coord_mask;
2588
2589 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2590 coordinate));
2591
2592 if (zero_mask != 0) {
2593 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2594 src_reg(0)));
2595 }
2596 /* Load the shadow comparitor */
2597 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2598 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2599 WRITEMASK_X),
2600 shadow_comparitor));
2601 inst->mlen++;
2602 }
2603
2604 /* Load the LOD info */
2605 if (ir->op == ir_tex || ir->op == ir_txl) {
2606 int mrf, writemask;
2607 if (brw->gen >= 5) {
2608 mrf = param_base + 1;
2609 if (ir->shadow_comparitor) {
2610 writemask = WRITEMASK_Y;
2611 /* mlen already incremented */
2612 } else {
2613 writemask = WRITEMASK_X;
2614 inst->mlen++;
2615 }
2616 } else /* brw->gen == 4 */ {
2617 mrf = param_base;
2618 writemask = WRITEMASK_W;
2619 }
2620 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2621 } else if (ir->op == ir_txf) {
2622 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2623 } else if (ir->op == ir_txf_ms) {
2624 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2625 sample_index));
2626 if (brw->gen >= 7) {
2627 /* MCS data is in the first channel of `mcs`, but we need to get it into
2628 * the .y channel of the second vec4 of params, so replicate .x across
2629 * the whole vec4 and then mask off everything except .y
2630 */
2631 mcs.swizzle = BRW_SWIZZLE_XXXX;
2632 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2633 mcs));
2634 }
2635 inst->mlen++;
2636 } else if (ir->op == ir_txd) {
2637 const glsl_type *type = lod_type;
2638
2639 if (brw->gen >= 5) {
2640 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2641 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2642 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2643 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2644 inst->mlen++;
2645
2646 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2647 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2648 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2649 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2650 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2651 inst->mlen++;
2652
2653 if (ir->shadow_comparitor) {
2654 emit(MOV(dst_reg(MRF, param_base + 2,
2655 ir->shadow_comparitor->type, WRITEMASK_Z),
2656 shadow_comparitor));
2657 }
2658 }
2659 } else /* brw->gen == 4 */ {
2660 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2661 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2662 inst->mlen += 2;
2663 }
2664 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2665 if (ir->shadow_comparitor) {
2666 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2667 shadow_comparitor));
2668 }
2669
2670 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2671 offset_value));
2672 inst->mlen++;
2673 }
2674 }
2675
2676 emit(inst);
2677
2678 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2679 * faces * layers, but the spec requires just the layer count.
2680 */
2681 if (ir->op == ir_txs) {
2682 glsl_type const *type = ir->sampler->type;
2683 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2684 type->sampler_array) {
2685 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2686 writemask(inst->dst, WRITEMASK_Z),
2687 src_reg(inst->dst), src_reg(6));
2688 }
2689 }
2690
2691 if (brw->gen == 6 && ir->op == ir_tg4) {
2692 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2693 }
2694
2695 swizzle_result(ir, src_reg(inst->dst), sampler);
2696 }
2697
2698 /**
2699 * Apply workarounds for Gen6 gather with UINT/SINT
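 *
 * For example (illustrative), for an 8-bit format with the sign workaround
 * the UNORM result is scaled by 255, converted back to integer, and then
 * shifted left and right by 24 bits to sign-extend the low byte.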
2700 */
2701 void
2702 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2703 {
2704 if (!wa)
2705 return;
2706
2707 int width = (wa & WA_8BIT) ? 8 : 16;
2708 dst_reg dst_f = dst;
2709 dst_f.type = BRW_REGISTER_TYPE_F;
2710
2711 /* Convert from UNORM to UINT */
2712 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2713 emit(MOV(dst, src_reg(dst_f)));
2714
2715 if (wa & WA_SIGN) {
2716 /* Reinterpret the UINT value as a signed INT value by
2717 * shifting the sign bit into place, then shifting back
2718 * preserving sign.
2719 */
2720 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2721 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2722 }
2723 }
2724
2725 /**
2726 * Set up the gather channel based on the swizzle, for gather4.
2727 */
2728 uint32_t
2729 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2730 {
2731 ir_constant *chan = ir->lod_info.component->as_constant();
2732 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2733 switch (swiz) {
2734 case SWIZZLE_X: return 0;
2735 case SWIZZLE_Y:
2736 /* gather4 sampler is broken for green channel on RG32F --
2737 * we must ask for blue instead.
2738 */
2739 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2740 return 2;
2741 return 1;
2742 case SWIZZLE_Z: return 2;
2743 case SWIZZLE_W: return 3;
2744 default:
2745 unreachable("Not reached"); /* zero, one swizzles handled already */
2746 }
2747 }
2748
2749 void
2750 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2751 {
2752 int s = key->tex.swizzles[sampler];
2753
2754 this->result = src_reg(this, ir->type);
2755 dst_reg swizzled_result(this->result);
2756
2757 if (ir->op == ir_query_levels) {
2758 /* # levels is in .w */
2759 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2760 emit(MOV(swizzled_result, orig_val));
2761 return;
2762 }
2763
2764 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2765 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2766 emit(MOV(swizzled_result, orig_val));
2767 return;
2768 }
2769
2770
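/* Apply the texture swizzle.  For example (illustrative), a swizzle of
 * (R, R, R, ONE) copies the sampler's .x into .xyz of the result and
 * writes 1.0f into .w.
 */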
2771 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2772 int swizzle[4] = {0};
2773
2774 for (int i = 0; i < 4; i++) {
2775 switch (GET_SWZ(s, i)) {
2776 case SWIZZLE_ZERO:
2777 zero_mask |= (1 << i);
2778 break;
2779 case SWIZZLE_ONE:
2780 one_mask |= (1 << i);
2781 break;
2782 default:
2783 copy_mask |= (1 << i);
2784 swizzle[i] = GET_SWZ(s, i);
2785 break;
2786 }
2787 }
2788
2789 if (copy_mask) {
2790 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2791 swizzled_result.writemask = copy_mask;
2792 emit(MOV(swizzled_result, orig_val));
2793 }
2794
2795 if (zero_mask) {
2796 swizzled_result.writemask = zero_mask;
2797 emit(MOV(swizzled_result, src_reg(0.0f)));
2798 }
2799
2800 if (one_mask) {
2801 swizzled_result.writemask = one_mask;
2802 emit(MOV(swizzled_result, src_reg(1.0f)));
2803 }
2804 }
2805
2806 void
2807 vec4_visitor::visit(ir_return *)
2808 {
2809 unreachable("not reached");
2810 }
2811
2812 void
2813 vec4_visitor::visit(ir_discard *)
2814 {
2815 unreachable("not reached");
2816 }
2817
2818 void
2819 vec4_visitor::visit(ir_if *ir)
2820 {
2821 /* Don't point the annotation at the if statement, because then it, plus
2822 * the then and else blocks, would all get printed.
2823 */
2824 this->base_ir = ir->condition;
2825
2826 if (brw->gen == 6) {
2827 emit_if_gen6(ir);
2828 } else {
2829 enum brw_predicate predicate;
2830 emit_bool_to_cond_code(ir->condition, &predicate);
2831 emit(IF(predicate));
2832 }
2833
2834 visit_instructions(&ir->then_instructions);
2835
2836 if (!ir->else_instructions.is_empty()) {
2837 this->base_ir = ir->condition;
2838 emit(BRW_OPCODE_ELSE);
2839
2840 visit_instructions(&ir->else_instructions);
2841 }
2842
2843 this->base_ir = ir->condition;
2844 emit(BRW_OPCODE_ENDIF);
2845 }
2846
2847 void
2848 vec4_visitor::visit(ir_emit_vertex *)
2849 {
2850 unreachable("not reached");
2851 }
2852
2853 void
2854 vec4_visitor::visit(ir_end_primitive *)
2855 {
2856 unreachable("not reached");
2857 }
2858
2859 void
2860 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2861 dst_reg dst, src_reg offset,
2862 src_reg src0, src_reg src1)
2863 {
2864 unsigned mlen = 0;
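/* The payload is built one message register at a time: the offset goes in
 * the .x channel of the first MRF, followed by src0 and src1 in their own
 * MRFs when the operation needs them; mlen records how many were used
 * (illustrative summary).
 */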
2865
2866 /* Set the atomic operation offset. */
2867 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2868 mlen++;
2869
2870 /* Set the atomic operation arguments. */
2871 if (src0.file != BAD_FILE) {
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2873 mlen++;
2874 }
2875
2876 if (src1.file != BAD_FILE) {
2877 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2878 mlen++;
2879 }
2880
2881 /* Emit the instruction. Note that this maps to the normal SIMD8
2882 * untyped atomic message on Ivy Bridge, but that's OK because
2883 * unused channels will be masked out.
2884 */
2885 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2886 src_reg(atomic_op), src_reg(surf_index));
2887 inst->base_mrf = 0;
2888 inst->mlen = mlen;
2889 }
2890
2891 void
2892 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2893 src_reg offset)
2894 {
2895 /* Set the surface read offset. */
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2897
2898 /* Emit the instruction. Note that this maps to the normal SIMD8
2899 * untyped surface read message, but that's OK because unused
2900 * channels will be masked out.
2901 */
2902 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2903 dst, src_reg(surf_index));
2904 inst->base_mrf = 0;
2905 inst->mlen = 1;
2906 }
2907
2908 void
2909 vec4_visitor::emit_ndc_computation()
2910 {
2911 /* Get the position */
2912 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2913
2914 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2915 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2916 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2917
2918 current_annotation = "NDC";
2919 dst_reg ndc_w = ndc;
2920 ndc_w.writemask = WRITEMASK_W;
2921 src_reg pos_w = pos;
2922 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2923 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2924
2925 dst_reg ndc_xyz = ndc;
2926 ndc_xyz.writemask = WRITEMASK_XYZ;
2927
2928 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2929 }
2930
2931 void
2932 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2933 {
2934 if (brw->gen < 6 &&
2935 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2936 key->userclip_active || brw->has_negative_rhw_bug)) {
2937 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2938 dst_reg header1_w = header1;
2939 header1_w.writemask = WRITEMASK_W;
2940
2941 emit(MOV(header1, 0u));
2942
2943 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2944 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2945
2946 current_annotation = "Point size";
2947 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2948 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2949 }
2950
2951 if (key->userclip_active) {
2952 current_annotation = "Clipping flags";
2953 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2954 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2955
2956 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2957 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2958 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2959
2960 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2961 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2962 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2963 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2964 }
2965
2966 /* i965 clipping workaround:
2967 * 1) Test for -ve rhw
2968 * 2) If set,
2969 * set ndc = (0,0,0,0)
2970 * set ucp[6] = 1
2971 *
2972 * Later, clipping will detect ucp[6] and ensure the primitive is
2973 * clipped against all fixed planes.
2974 */
2975 if (brw->has_negative_rhw_bug) {
2976 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2977 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2978 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2979 vec4_instruction *inst;
2980 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2981 inst->predicate = BRW_PREDICATE_NORMAL;
2982 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2983 inst->predicate = BRW_PREDICATE_NORMAL;
2984 }
2985
2986 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2987 } else if (brw->gen < 6) {
2988 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2989 } else {
2990 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2991 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2992 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2993 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2994 }
2995 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2996 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2997 src_reg(output_reg[VARYING_SLOT_LAYER])));
2998 }
2999 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3000 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
3001 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3002 }
3003 }
3004 }
3005
3006 void
3007 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3008 {
3009 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3010 *
3011 * "If a linked set of shaders forming the vertex stage contains no
3012 * static write to gl_ClipVertex or gl_ClipDistance, but the
3013 * application has requested clipping against user clip planes through
3014 * the API, then the coordinate written to gl_Position is used for
3015 * comparison against the user clip planes."
3016 *
3017 * This function is only called if the shader didn't write to
3018 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3019 * if the user wrote to it; otherwise we use gl_Position.
3020 */
3021 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3022 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3023 clip_vertex = VARYING_SLOT_POS;
3024 }
3025
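/* Each call emits up to four DP4s.  As used by emit_vertex() below, offset
 * 0 fills CLIP_DIST0 with distances against user planes 0-3 and offset 4
 * fills CLIP_DIST1 for planes 4-7.
 */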
3026 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3027 ++i) {
3028 reg.writemask = 1 << i;
3029 emit(DP4(reg,
3030 src_reg(output_reg[clip_vertex]),
3031 src_reg(this->userplane[i + offset])));
3032 }
3033 }
3034
3035 void
3036 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3037 {
3038 assert (varying < VARYING_SLOT_MAX);
3039 reg.type = output_reg[varying].type;
3040 current_annotation = output_reg_annotation[varying];
3041 /* Copy the register, saturating if necessary */
3042 vec4_instruction *inst = emit(MOV(reg,
3043 src_reg(output_reg[varying])));
3044 if ((varying == VARYING_SLOT_COL0 ||
3045 varying == VARYING_SLOT_COL1 ||
3046 varying == VARYING_SLOT_BFC0 ||
3047 varying == VARYING_SLOT_BFC1) &&
3048 key->clamp_vertex_color) {
3049 inst->saturate = true;
3050 }
3051 }
3052
3053 void
3054 vec4_visitor::emit_urb_slot(int mrf, int varying)
3055 {
3056 struct brw_reg hw_reg = brw_message_reg(mrf);
3057 dst_reg reg = dst_reg(MRF, mrf);
3058 reg.type = BRW_REGISTER_TYPE_F;
3059
3060 switch (varying) {
3061 case VARYING_SLOT_PSIZ:
3062 /* PSIZ is always in slot 0, and is coupled with other flags. */
3063 current_annotation = "indices, point width, clip flags";
3064 emit_psiz_and_flags(hw_reg);
3065 break;
3066 case BRW_VARYING_SLOT_NDC:
3067 current_annotation = "NDC";
3068 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3069 break;
3070 case VARYING_SLOT_POS:
3071 current_annotation = "gl_Position";
3072 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3073 break;
3074 case VARYING_SLOT_EDGE:
3075 /* This is present when doing unfilled polygons. We're supposed to copy
3076 * the edge flag from the user-provided vertex array
3077 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3078 * of that attribute (starts as 1.0f). This is then used in clipping to
3079 * determine which edges should be drawn as wireframe.
3080 */
3081 current_annotation = "edge flag";
3082 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3083 glsl_type::float_type, WRITEMASK_XYZW))));
3084 break;
3085 case BRW_VARYING_SLOT_PAD:
3086 /* No need to write to this slot */
3087 break;
3088 default:
3089 emit_generic_urb_slot(reg, varying);
3090 break;
3091 }
3092 }
3093
3094 static int
3095 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3096 {
3097 if (brw->gen >= 6) {
3098 /* URB data written (does not include the message header reg) must
3099 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3100 * section 5.4.3.2.2: URB_INTERLEAVED.
3101 *
3102 * URB entries are allocated on a multiple of 1024 bits, so an
3103 * extra 128 bits written here to make the end align to 256 is
3104 * no problem.
3105 */
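/* For example (illustrative), an mlen of 4 (header plus 3 data registers)
 * is bumped to 5 so that an even 4 data registers are written.
 */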
3106 if ((mlen % 2) != 1)
3107 mlen++;
3108 }
3109
3110 return mlen;
3111 }
3112
3113
3114 /**
3115 * Generates the VUE payload plus the necessary URB write instructions to
3116 * output it.
3117 *
3118 * The VUE layout is documented in Volume 2a.
3119 */
3120 void
3121 vec4_visitor::emit_vertex()
3122 {
3123 /* MRF 0 is reserved for the debugger, so start with message header
3124 * in MRF 1.
3125 */
3126 int base_mrf = 1;
3127 int mrf = base_mrf;
3128 /* In the process of generating our URB write message contents, we
3129 * may need to unspill a register or load from an array. Those
3130 * reads would use MRFs 14-15.
3131 */
3132 int max_usable_mrf = 13;
3133
3134 /* The following assertion verifies that max_usable_mrf causes an
3135 * even number of URB write data registers, which will meet gen6's
3136 * requirements for length alignment.
3137 */
3138 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3139
3140 /* First mrf is the g0-based message header containing URB handles and
3141 * such.
3142 */
3143 emit_urb_write_header(mrf++);
3144
3145 if (brw->gen < 6) {
3146 emit_ndc_computation();
3147 }
3148
3149 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3150 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3151 current_annotation = "user clip distances";
3152
3153 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3154 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3155
3156 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3157 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3158 }
3159
3160 /* We may need to split this up into several URB writes, so do them in a
3161 * loop.
3162 */
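/* For example (illustrative), with 20 VUE slots and data MRFs 2..13
 * available per write, the first URB write covers slots 0-11 and a second
 * write at URB row offset 6 covers the remaining slots.
 */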
3163 int slot = 0;
3164 bool complete = false;
3165 do {
3166 /* URB offset is in URB row increments, and each of our MRFs is half of
3167 * one of those, since we're doing interleaved writes.
3168 */
3169 int offset = slot / 2;
3170
3171 mrf = base_mrf + 1;
3172 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3173 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3174
3175 /* If this was max_usable_mrf, we can't fit anything more into this
3176 * URB WRITE.
3177 */
3178 if (mrf > max_usable_mrf) {
3179 slot++;
3180 break;
3181 }
3182 }
3183
3184 complete = slot >= prog_data->vue_map.num_slots;
3185 current_annotation = "URB write";
3186 vec4_instruction *inst = emit_urb_write_opcode(complete);
3187 inst->base_mrf = base_mrf;
3188 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3189 inst->offset += offset;
3190 } while(!complete);
3191 }
3192
3193
3194 src_reg
3195 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3196 src_reg *reladdr, int reg_offset)
3197 {
3198 /* Because we store the values to scratch interleaved like our
3199 * vertex data, we need to scale the vec4 index by 2.
3200 */
3201 int message_header_scale = 2;
3202
3203 /* Pre-gen6, the message header uses byte offsets instead of vec4
3204 * (16-byte) offset units.
3205 */
3206 if (brw->gen < 6)
3207 message_header_scale *= 16;
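/* For example (illustrative), scratch vec4 index 3 becomes a message
 * offset of 3 * 2 = 6 on Gen6+, or 3 * 2 * 16 = 96 bytes before Gen6.
 */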
3208
3209 if (reladdr) {
3210 src_reg index = src_reg(this, glsl_type::int_type);
3211
3212 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3213 emit_before(inst, MUL(dst_reg(index),
3214 index, src_reg(message_header_scale)));
3215
3216 return index;
3217 } else {
3218 return src_reg(reg_offset * message_header_scale);
3219 }
3220 }
3221
3222 src_reg
3223 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3224 src_reg *reladdr, int reg_offset)
3225 {
3226 if (reladdr) {
3227 src_reg index = src_reg(this, glsl_type::int_type);
3228
3229 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3230
3231 /* Pre-gen6, the message header uses byte offsets instead of vec4
3232 * (16-byte) offset units.
3233 */
3234 if (brw->gen < 6) {
3235 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3236 }
3237
3238 return index;
3239 } else if (brw->gen >= 8) {
3240 /* Store the offset in a GRF so we can send-from-GRF. */
3241 src_reg offset = src_reg(this, glsl_type::int_type);
3242 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3243 return offset;
3244 } else {
3245 int message_header_scale = brw->gen < 6 ? 16 : 1;
3246 return src_reg(reg_offset * message_header_scale);
3247 }
3248 }
3249
3250 /**
3251 * Emits an instruction before @inst to load the value named by @orig_src
3252 * from scratch space at @base_offset to @temp.
3253 *
3254 * @base_offset is measured in 32-byte units (the size of a register).
3255 */
3256 void
3257 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3258 dst_reg temp, src_reg orig_src,
3259 int base_offset)
3260 {
3261 int reg_offset = base_offset + orig_src.reg_offset;
3262 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3263
3264 emit_before(inst, SCRATCH_READ(temp, index));
3265 }
3266
3267 /**
3268 * Emits an instruction after @inst to store the value to be written
3269 * to @orig_dst to scratch space at @base_offset, from @temp.
3270 *
3271 * @base_offset is measured in 32-byte units (the size of a register).
3272 */
3273 void
3274 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3275 {
3276 int reg_offset = base_offset + inst->dst.reg_offset;
3277 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3278
3279 /* Create a temporary register to store *inst's result in.
3280 *
3281 * We have to be careful in MOVing from our temporary result register in
3282 * the scratch write. If we swizzle from channels of the temporary that
3283 * weren't initialized, it will confuse live interval analysis, which will
3284 * make spilling fail to make progress.
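 *
 * For example (illustrative), if the instruction writes only .y, the
 * scratch MOV below reads temp.yyyy so no uninitialized channel is
 * referenced.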
3285 */
3286 src_reg temp = src_reg(this, glsl_type::vec4_type);
3287 temp.type = inst->dst.type;
3288 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3289 int swizzles[4];
3290 for (int i = 0; i < 4; i++)
3291 if (inst->dst.writemask & (1 << i))
3292 swizzles[i] = i;
3293 else
3294 swizzles[i] = first_writemask_chan;
3295 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3296 swizzles[2], swizzles[3]);
3297
3298 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3299 inst->dst.writemask));
3300 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3301 write->predicate = inst->predicate;
3302 write->ir = inst->ir;
3303 write->annotation = inst->annotation;
3304 inst->insert_after(write);
3305
3306 inst->dst.file = temp.file;
3307 inst->dst.reg = temp.reg;
3308 inst->dst.reg_offset = temp.reg_offset;
3309 inst->dst.reladdr = NULL;
3310 }
3311
3312 /**
3313 * We can't generally support array access in GRF space, because a
3314 * single instruction's destination can only span 2 contiguous
3315 * registers. So, we send all GRF arrays that get variable index
3316 * access to scratch space.
3317 */
3318 void
3319 vec4_visitor::move_grf_array_access_to_scratch()
3320 {
3321 int scratch_loc[this->virtual_grf_count];
3322
3323 for (int i = 0; i < this->virtual_grf_count; i++) {
3324 scratch_loc[i] = -1;
3325 }
3326
3327 /* First, calculate the set of virtual GRFs that need to be punted
3328 * to scratch due to having any array access on them, and where in
3329 * scratch.
3330 */
3331 foreach_in_list(vec4_instruction, inst, &instructions) {
3332 if (inst->dst.file == GRF && inst->dst.reladdr &&
3333 scratch_loc[inst->dst.reg] == -1) {
3334 scratch_loc[inst->dst.reg] = c->last_scratch;
3335 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3336 }
3337
3338 for (int i = 0 ; i < 3; i++) {
3339 src_reg *src = &inst->src[i];
3340
3341 if (src->file == GRF && src->reladdr &&
3342 scratch_loc[src->reg] == -1) {
3343 scratch_loc[src->reg] = c->last_scratch;
3344 c->last_scratch += this->virtual_grf_sizes[src->reg];
3345 }
3346 }
3347 }
3348
3349 /* Now, for anything that will be accessed through scratch, rewrite
3350 * it to load/store. Note that this is a _safe list walk, because
3351 * we may generate a new scratch_write instruction after the one
3352 * we're processing.
3353 */
3354 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3355 /* Set up the annotation tracking for newly generated instructions. */
3356 base_ir = inst->ir;
3357 current_annotation = inst->annotation;
3358
3359 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3360 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3361 }
3362
3363 for (int i = 0 ; i < 3; i++) {
3364 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3365 continue;
3366
3367 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3368
3369 emit_scratch_read(inst, temp, inst->src[i],
3370 scratch_loc[inst->src[i].reg]);
3371
3372 inst->src[i].file = temp.file;
3373 inst->src[i].reg = temp.reg;
3374 inst->src[i].reg_offset = temp.reg_offset;
3375 inst->src[i].reladdr = NULL;
3376 }
3377 }
3378 }
3379
3380 /**
3381 * Emits an instruction before @inst to load the value named by @orig_src
3382 * from the pull constant buffer (surface) at @base_offset to @temp.
3383 */
3384 void
3385 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3386 dst_reg temp, src_reg orig_src,
3387 int base_offset)
3388 {
3389 int reg_offset = base_offset + orig_src.reg_offset;
3390 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3391 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3392 vec4_instruction *load;
3393
3394 if (brw->gen >= 7) {
3395 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3396 grf_offset.type = offset.type;
3397 emit_before(inst, MOV(grf_offset, offset));
3398
3399 load = new(mem_ctx) vec4_instruction(this,
3400 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3401 temp, index, src_reg(grf_offset));
3402 } else {
3403 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3404 temp, index, offset);
3405 load->base_mrf = 14;
3406 load->mlen = 1;
3407 }
3408 emit_before(inst, load);
3409 }
3410
3411 /**
3412 * Implements array access of uniforms by inserting a
3413 * PULL_CONSTANT_LOAD instruction.
3414 *
3415 * Unlike temporary GRF array access (where we don't support it due to
3416 * the difficulty of doing relative addressing on instruction
3417 * destinations), we could potentially do array access of uniforms
3418 * that were loaded in GRF space as push constants. In real-world
3419 * usage we've seen, though, the arrays being used are always larger
3420 * than we could load as push constants, so just always move all
3421 * uniform array access out to a pull constant buffer.
3422 */
3423 void
3424 vec4_visitor::move_uniform_array_access_to_pull_constants()
3425 {
3426 int pull_constant_loc[this->uniforms];
3427
3428 for (int i = 0; i < this->uniforms; i++) {
3429 pull_constant_loc[i] = -1;
3430 }
3431
3432 /* Walk through and find array access of uniforms. Put a copy of that
3433 * uniform in the pull constant buffer.
3434 *
3435 * Note that we don't move constant-indexed accesses to arrays. No
3436 * testing has been done of the performance impact of this choice.
3437 */
3438 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3439 for (int i = 0 ; i < 3; i++) {
3440 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3441 continue;
3442
3443 int uniform = inst->src[i].reg;
3444
3445 /* If this array isn't already present in the pull constant buffer,
3446 * add it.
3447 */
3448 if (pull_constant_loc[uniform] == -1) {
3449 const gl_constant_value **values =
3450 &stage_prog_data->param[uniform * 4];
3451
3452 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3453
3454 assert(uniform < uniform_array_size);
3455 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3456 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3457 = values[j];
3458 }
3459 }
3460
3461 /* Set up the annotation tracking for newly generated instructions. */
3462 base_ir = inst->ir;
3463 current_annotation = inst->annotation;
3464
3465 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3466
3467 emit_pull_constant_load(inst, temp, inst->src[i],
3468 pull_constant_loc[uniform]);
3469
3470 inst->src[i].file = temp.file;
3471 inst->src[i].reg = temp.reg;
3472 inst->src[i].reg_offset = temp.reg_offset;
3473 inst->src[i].reladdr = NULL;
3474 }
3475 }
3476
3477 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3478 * no need to track them as larger-than-vec4 objects. This will be
3479 * relied on in cutting out unused uniform vectors from push
3480 * constants.
3481 */
3482 split_uniform_registers();
3483 }
3484
3485 void
3486 vec4_visitor::resolve_ud_negate(src_reg *reg)
3487 {
3488 if (reg->type != BRW_REGISTER_TYPE_UD ||
3489 !reg->negate)
3490 return;
3491
3492 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3493 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3494 *reg = temp;
3495 }
3496
3497 vec4_visitor::vec4_visitor(struct brw_context *brw,
3498 struct brw_vec4_compile *c,
3499 struct gl_program *prog,
3500 const struct brw_vec4_prog_key *key,
3501 struct brw_vec4_prog_data *prog_data,
3502 struct gl_shader_program *shader_prog,
3503 gl_shader_stage stage,
3504 void *mem_ctx,
3505 bool debug_flag,
3506 bool no_spills,
3507 shader_time_shader_type st_base,
3508 shader_time_shader_type st_written,
3509 shader_time_shader_type st_reset)
3510 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3511 c(c),
3512 key(key),
3513 prog_data(prog_data),
3514 sanity_param_count(0),
3515 fail_msg(NULL),
3516 first_non_payload_grf(0),
3517 need_all_constants_in_pull_buffer(false),
3518 debug_flag(debug_flag),
3519 no_spills(no_spills),
3520 st_base(st_base),
3521 st_written(st_written),
3522 st_reset(st_reset)
3523 {
3524 this->mem_ctx = mem_ctx;
3525 this->failed = false;
3526
3527 this->base_ir = NULL;
3528 this->current_annotation = NULL;
3529 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3530
3531 this->variable_ht = hash_table_ctor(0,
3532 hash_table_pointer_hash,
3533 hash_table_pointer_compare);
3534
3535 this->virtual_grf_start = NULL;
3536 this->virtual_grf_end = NULL;
3537 this->virtual_grf_sizes = NULL;
3538 this->virtual_grf_count = 0;
3539 this->virtual_grf_reg_map = NULL;
3540 this->virtual_grf_reg_count = 0;
3541 this->virtual_grf_array_size = 0;
3542 this->live_intervals_valid = false;
3543
3544 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3545
3546 this->uniforms = 0;
3547
3548 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3549 * at least one. See setup_uniforms() in brw_vec4.cpp.
3550 */
3551 this->uniform_array_size = 1;
3552 if (prog_data) {
3553 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3554 }
3555
3556 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3557 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3558 }
3559
3560 vec4_visitor::~vec4_visitor()
3561 {
3562 hash_table_dtor(this->variable_ht);
3563 }
3564
3565
3566 void
3567 vec4_visitor::fail(const char *format, ...)
3568 {
3569 va_list va;
3570 char *msg;
3571
3572 if (failed)
3573 return;
3574
3575 failed = true;
3576
3577 va_start(va, format);
3578 msg = ralloc_vasprintf(mem_ctx, format, va);
3579 va_end(va);
3580 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3581
3582 this->fail_msg = msg;
3583
3584 if (debug_flag) {
3585 fprintf(stderr, "%s", msg);
3586 }
3587 }
3588
3589 } /* namespace brw */