i965: Handle ir_triop_csel in emit_bool_to_cond_code().
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
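/* A usage sketch (hedged; the operand names are hypothetical, but the pattern
 * mirrors emit_minmax() later in this file):
 *
 *    emit(CMP(dst_null_d(), a, b, BRW_CONDITIONAL_GE));
 *    inst = emit(BRW_OPCODE_SEL, dst, a, b);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * i.e. run the comparison for its flag side effect, then let a predicated
 * instruction consume the result.
 */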
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
 226     * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
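   /* Illustrative sketch (operand names are hypothetical): a caller lowering
    * fma(u, a, b) with a uniform first argument does
    *
    *    src_reg op0 = fix_3src_operand(u);   // may emit MOV tmp, u
    *    src_reg op1 = fix_3src_operand(a);   // plain GRFs come back unchanged
    *    src_reg op2 = fix_3src_operand(b);
    *    emit(MAD(dst, op2, op1, op0));
    *
    * which is the pattern the ir_triop_fma case below follows.
    */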
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
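   /* As a concrete (hypothetical) example: on gen6, exp2(-x) becomes
    *
    *    mov       tmp, -x
    *    math.exp2 dst, tmp
    *
    * so the negate is resolved by the MOV instead of being silently dropped
    * by the math unit; on gen7 only immediate operands get this treatment.
    */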
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
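   /* In rough pseudocode, the sequence below computes
    *
    *    lo  = f32to16(src0.x);            // tmp.x
    *    hi  = f32to16(src0.y);            // tmp.y
    *    dst = (hi << 16) | lo;            // SHL + OR
    *
    * which is packHalf2x16's defined result.
    */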
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
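   /* Roughly, the three instructions below compute
    *
    *    tmp.x  = src0 & 0xffff;           // low half  -> result .x
    *    tmp.y  = src0 >> 16;              // high half -> result .y
    *    dst.xy = f16to32(tmp.xy);
    */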
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
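	 /* For example, float, vec2 and vec4 all count as one slot here; a
	  * mat3 returns matrix_columns (3) above, and an array like vec2[8]
	  * yields 8 via the GLSL_TYPE_ARRAY case below.
	  */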
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
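   /* For instance, a (hypothetical) declaration
    *
    *    uniform struct { vec4 a; vec4 b; } s;
    *
    * yields storage entries named "s.a" and "s.b", both of which pass the
    * name-prefix-plus-'.'/'[' test below when ir->name is "s".
    */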
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[3];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 3);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 case ir_triop_csel: {
856 /* Expand the boolean condition into the flag register. */
857 inst = emit(MOV(dst_null_d(), op[0]));
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859
860 /* Select which boolean to return. */
861 dst_reg temp(this, expr->operands[1]->type);
862 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
863 inst->predicate = BRW_PREDICATE_NORMAL;
864
865 /* Expand the result to a condition code. */
866 inst = emit(MOV(dst_null_d(), src_reg(temp)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
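         /* Putting the three steps together, a (hypothetical) condition like
          *
          *    if (b ? x : y) ...
          *
          * with bool b, x, y lowers to roughly
          *
          *    mov.nz.f0  null, b      // condition -> flag
          *    (+f0) sel  tmp, x, y    // pick one of the booleans
          *    mov.nz.f0  null, tmp    // selected value -> flag for the branch
          */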
868 break;
869 }
870
871 default:
872 unreachable("not reached");
873 }
874 return;
875 }
876
877 ir->accept(this);
878
879 resolve_ud_negate(&this->result);
880
881 if (brw->gen >= 6) {
882 vec4_instruction *inst = emit(AND(dst_null_d(),
883 this->result, src_reg(1)));
884 inst->conditional_mod = BRW_CONDITIONAL_NZ;
885 } else {
886 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
887 inst->conditional_mod = BRW_CONDITIONAL_NZ;
888 }
889 }
890
891 /**
892 * Emit a gen6 IF statement with the comparison folded into the IF
893 * instruction.
894 */
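/* For a condition like "a < b" this produces, roughly,
 *
 *    if.l.f0  a, b
 *
 * via the two-source IF() helper above, rather than the separate flag-setting
 * compare plus predicated IF that the pre-gen6 path needs.
 */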
895 void
896 vec4_visitor::emit_if_gen6(ir_if *ir)
897 {
898 ir_expression *expr = ir->condition->as_expression();
899
900 if (expr) {
901 src_reg op[2];
902 dst_reg temp;
903
904 assert(expr->get_num_operands() <= 2);
905 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
906 expr->operands[i]->accept(this);
907 op[i] = this->result;
908 }
909
910 switch (expr->operation) {
911 case ir_unop_logic_not:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
913 return;
914
915 case ir_binop_logic_xor:
916 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_logic_or:
920 temp = dst_reg(this, glsl_type::bool_type);
921 emit(OR(temp, op[0], op[1]));
922 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924
925 case ir_binop_logic_and:
926 temp = dst_reg(this, glsl_type::bool_type);
927 emit(AND(temp, op[0], op[1]));
928 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
929 return;
930
931 case ir_unop_f2b:
932 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_unop_i2b:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_greater:
940 case ir_binop_gequal:
941 case ir_binop_less:
942 case ir_binop_lequal:
943 case ir_binop_equal:
944 case ir_binop_nequal:
945 emit(IF(op[0], op[1],
946 brw_conditional_for_comparison(expr->operation)));
947 return;
948
949 case ir_binop_all_equal:
950 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
951 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
952 return;
953
954 case ir_binop_any_nequal:
955 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
956 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
957 return;
958
959 case ir_unop_any:
960 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
962 return;
963
964 default:
965 unreachable("not reached");
966 }
967 return;
968 }
969
970 ir->condition->accept(this);
971
972 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
973 }
974
975 void
976 vec4_visitor::visit(ir_variable *ir)
977 {
978 dst_reg *reg = NULL;
979
980 if (variable_storage(ir))
981 return;
982
983 switch (ir->data.mode) {
984 case ir_var_shader_in:
985 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
986 break;
987
988 case ir_var_shader_out:
989 reg = new(mem_ctx) dst_reg(this, ir->type);
990
991 for (int i = 0; i < type_size(ir->type); i++) {
992 output_reg[ir->data.location + i] = *reg;
993 output_reg[ir->data.location + i].reg_offset = i;
994 output_reg[ir->data.location + i].type =
995 brw_type_for_base_type(ir->type->get_scalar_type());
996 output_reg_annotation[ir->data.location + i] = ir->name;
997 }
998 break;
999
1000 case ir_var_auto:
1001 case ir_var_temporary:
1002 reg = new(mem_ctx) dst_reg(this, ir->type);
1003 break;
1004
1005 case ir_var_uniform:
1006 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1007
1008 /* Thanks to the lower_ubo_reference pass, we will see only
1009 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1010 * variables, so no need for them to be in variable_ht.
1011 *
1012 * Atomic counters take no uniform storage, no need to do
1013 * anything here.
1014 */
1015 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1016 return;
1017
1018 /* Track how big the whole uniform variable is, in case we need to put a
1019 * copy of its data into pull constants for array access.
1020 */
1021 assert(this->uniforms < uniform_array_size);
1022 this->uniform_size[this->uniforms] = type_size(ir->type);
1023
1024 if (!strncmp(ir->name, "gl_", 3)) {
1025 setup_builtin_uniform_values(ir);
1026 } else {
1027 setup_uniform_values(ir);
1028 }
1029 break;
1030
1031 case ir_var_system_value:
1032 reg = make_reg_for_system_value(ir);
1033 break;
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038
1039 reg->type = brw_type_for_base_type(ir->type);
1040 hash_table_insert(this->variable_ht, reg, ir);
1041 }
1042
1043 void
1044 vec4_visitor::visit(ir_loop *ir)
1045 {
1046 /* We don't want debugging output to print the whole body of the
1047 * loop as the annotation.
1048 */
1049 this->base_ir = NULL;
1050
1051 emit(BRW_OPCODE_DO);
1052
1053 visit_instructions(&ir->body_instructions);
1054
1055 emit(BRW_OPCODE_WHILE);
1056 }
1057
1058 void
1059 vec4_visitor::visit(ir_loop_jump *ir)
1060 {
1061 switch (ir->mode) {
1062 case ir_loop_jump::jump_break:
1063 emit(BRW_OPCODE_BREAK);
1064 break;
1065 case ir_loop_jump::jump_continue:
1066 emit(BRW_OPCODE_CONTINUE);
1067 break;
1068 }
1069 }
1070
1071
1072 void
1073 vec4_visitor::visit(ir_function_signature *)
1074 {
1075 unreachable("not reached");
1076 }
1077
1078 void
1079 vec4_visitor::visit(ir_function *ir)
1080 {
1081 /* Ignore function bodies other than main() -- we shouldn't see calls to
1082 * them since they should all be inlined.
1083 */
1084 if (strcmp(ir->name, "main") == 0) {
1085 const ir_function_signature *sig;
1086 exec_list empty;
1087
1088 sig = ir->matching_signature(NULL, &empty, false);
1089
1090 assert(sig);
1091
1092 visit_instructions(&sig->body);
1093 }
1094 }
1095
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
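   /* At this point, for an expression of the form (a * b) + c or c + (a * b),
    * "mul" is the ir_binop_mul subtree and "nonmul" is the other operand, so
    * the MAD emitted below computes nonmul + mul0 * mul1.
    */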
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 /* This optimization relies on CMP setting the destination to 0 when
1137 * false. Early hardware only sets the least significant bit, and
1138 * leaves the other bits undefined. So we can't use it.
1139 */
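   /* A hedged sketch of the output for b2f(x < y):
    *
    *    cmp.l.f0  result, x, y          // failing channels are written as 0.0
    *    (-f0) sel result, result, 1.0f
    *
    * Passing channels end up as 1.0f and failing channels keep the 0.0 the
    * CMP already wrote, so no separate AND/convert step is needed.
    */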
1140 if (brw->gen < 6)
1141 return false;
1142
1143 ir_expression *const cmp = ir->operands[0]->as_expression();
1144
1145 if (cmp == NULL)
1146 return false;
1147
1148 switch (cmp->operation) {
1149 case ir_binop_less:
1150 case ir_binop_greater:
1151 case ir_binop_lequal:
1152 case ir_binop_gequal:
1153 case ir_binop_equal:
1154 case ir_binop_nequal:
1155 break;
1156
1157 default:
1158 return false;
1159 }
1160
1161 cmp->operands[0]->accept(this);
1162 const src_reg cmp_src0 = this->result;
1163
1164 cmp->operands[1]->accept(this);
1165 const src_reg cmp_src1 = this->result;
1166
1167 this->result = src_reg(this, ir->type);
1168
1169 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1170 brw_conditional_for_comparison(cmp->operation)));
1171
1172 /* If the comparison is false, this->result will just happen to be zero.
1173 */
1174 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1175 this->result, src_reg(1.0f));
1176 inst->predicate = BRW_PREDICATE_NORMAL;
1177 inst->predicate_inverse = true;
1178
1179 return true;
1180 }
1181
1182 void
1183 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1184 src_reg src0, src_reg src1)
1185 {
1186 vec4_instruction *inst;
1187
1188 if (brw->gen >= 6) {
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->conditional_mod = conditionalmod;
1191 } else {
1192 emit(CMP(dst, src0, src1, conditionalmod));
1193
1194 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1195 inst->predicate = BRW_PREDICATE_NORMAL;
1196 }
1197 }
1198
1199 void
1200 vec4_visitor::emit_lrp(const dst_reg &dst,
1201 const src_reg &x, const src_reg &y, const src_reg &a)
1202 {
1203 if (brw->gen >= 6) {
1204 /* Note that the instruction's argument order is reversed from GLSL
1205 * and the IR.
1206 */
1207 emit(LRP(dst,
1208 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1209 } else {
1210 /* Earlier generations don't support three source operations, so we
1211 * need to emit x*(1-a) + y*a.
1212 */
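      /* This is the usual mix() identity,
       *
       *    lrp(x, y, a) = x*(1 - a) + y*a,
       *
       * expanded into a MUL/ADD/MUL/ADD chain, whereas the gen6+ path above
       * hands a, y and x directly to the LRP instruction.
       */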
1213 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1214 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1215 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1216 y_times_a.writemask = dst.writemask;
1217 one_minus_a.writemask = dst.writemask;
1218 x_times_one_minus_a.writemask = dst.writemask;
1219
1220 emit(MUL(y_times_a, y, a));
1221 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1222 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1223 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1224 }
1225 }
1226
1227 void
1228 vec4_visitor::visit(ir_expression *ir)
1229 {
1230 unsigned int operand;
1231 src_reg op[Elements(ir->operands)];
1232 src_reg result_src;
1233 dst_reg result_dst;
1234 vec4_instruction *inst;
1235
1236 if (ir->operation == ir_binop_add) {
1237 if (try_emit_mad(ir))
1238 return;
1239 }
1240
1241 if (ir->operation == ir_unop_b2f) {
1242 if (try_emit_b2f_of_compare(ir))
1243 return;
1244 }
1245
1246 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1247 this->result.file = BAD_FILE;
1248 ir->operands[operand]->accept(this);
1249 if (this->result.file == BAD_FILE) {
1250 fprintf(stderr, "Failed to get tree for expression operand:\n");
1251 ir->operands[operand]->fprint(stderr);
1252 exit(1);
1253 }
1254 op[operand] = this->result;
1255
1256 /* Matrix expression operands should have been broken down to vector
1257 * operations already.
1258 */
1259 assert(!ir->operands[operand]->type->is_matrix());
1260 }
1261
1262 int vector_elements = ir->operands[0]->type->vector_elements;
1263 if (ir->operands[1]) {
1264 vector_elements = MAX2(vector_elements,
1265 ir->operands[1]->type->vector_elements);
1266 }
1267
1268 this->result.file = BAD_FILE;
1269
1270 /* Storage for our result. Ideally for an assignment we'd be using
1271 * the actual storage for the result here, instead.
1272 */
1273 result_src = src_reg(this, ir->type);
1274 /* convenience for the emit functions below. */
1275 result_dst = dst_reg(result_src);
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278 /* Limit writes to the channels that will be used by result_src later.
1279 * This does limit this temp's use as a temporary for multi-instruction
1280 * sequences.
1281 */
1282 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1283
1284 switch (ir->operation) {
1285 case ir_unop_logic_not:
1286 if (ctx->Const.UniformBooleanTrue != 1) {
1287 emit(NOT(result_dst, op[0]));
1288 } else {
1289 emit(XOR(result_dst, op[0], src_reg(1)));
1290 }
1291 break;
1292 case ir_unop_neg:
1293 op[0].negate = !op[0].negate;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296 case ir_unop_abs:
1297 op[0].abs = true;
1298 op[0].negate = false;
1299 emit(MOV(result_dst, op[0]));
1300 break;
1301
1302 case ir_unop_sign:
1303 if (ir->type->is_float()) {
1304 /* AND(val, 0x80000000) gives the sign bit.
1305 *
1306 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1307 * zero.
1308 */
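         /* Worked example: for val == -2.5f (0xc0200000),
          *
          *    AND with 0x80000000            -> 0x80000000
          *    (val != 0) OR with 0x3f800000  -> 0xbf800000 == -1.0f
          *
          * For val == 0.0f the CMP leaves the flag clear, the predicated OR
          * does not write, and the result stays 0x00000000 == 0.0f.
          */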
1309 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1310
1311 op[0].type = BRW_REGISTER_TYPE_UD;
1312 result_dst.type = BRW_REGISTER_TYPE_UD;
1313 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1314
1315 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1316 inst->predicate = BRW_PREDICATE_NORMAL;
1317
1318 this->result.type = BRW_REGISTER_TYPE_F;
1319 } else {
1320 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1321 * -> non-negative val generates 0x00000000.
1322 * Predicated OR sets 1 if val is positive.
1323 */
1324 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1325
1326 emit(ASR(result_dst, op[0], src_reg(31)));
1327
1328 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1329 inst->predicate = BRW_PREDICATE_NORMAL;
1330 }
1331 break;
1332
1333 case ir_unop_rcp:
1334 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1335 break;
1336
1337 case ir_unop_exp2:
1338 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1339 break;
1340 case ir_unop_log2:
1341 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1342 break;
1343 case ir_unop_exp:
1344 case ir_unop_log:
1345 unreachable("not reached: should be handled by ir_explog_to_explog2");
1346 case ir_unop_sin:
1347 case ir_unop_sin_reduced:
1348 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1349 break;
1350 case ir_unop_cos:
1351 case ir_unop_cos_reduced:
1352 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1353 break;
1354
1355 case ir_unop_dFdx:
1356 case ir_unop_dFdx_coarse:
1357 case ir_unop_dFdx_fine:
1358 case ir_unop_dFdy:
1359 case ir_unop_dFdy_coarse:
1360 case ir_unop_dFdy_fine:
1361 unreachable("derivatives not valid in vertex shader");
1362
1363 case ir_unop_bitfield_reverse:
1364 emit(BFREV(result_dst, op[0]));
1365 break;
1366 case ir_unop_bit_count:
1367 emit(CBIT(result_dst, op[0]));
1368 break;
1369 case ir_unop_find_msb: {
1370 src_reg temp = src_reg(this, glsl_type::uint_type);
1371
1372 inst = emit(FBH(dst_reg(temp), op[0]));
1373 inst->dst.writemask = WRITEMASK_XYZW;
1374
1375 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1376 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1377 * subtract the result from 31 to convert the MSB count into an LSB count.
1378 */
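      /* Worked example (treating the input as unsigned): findMSB(0x00000100).
       * FBH counts 23 leading zero bits, the CMP against -1 passes, and the
       * predicated ADD computes 31 - 23 = 8, the bit index GLSL expects.  For
       * an input of 0, FBH returns 0xffffffff (-1 as D), the CMP fails, the
       * ADD is skipped, and the result stays -1, matching findMSB(0) == -1.
       */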
1379
1380 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1381 temp.swizzle = BRW_SWIZZLE_NOOP;
1382 emit(MOV(result_dst, temp));
1383
1384 src_reg src_tmp = src_reg(result_dst);
1385 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1386
1387 src_tmp.negate = true;
1388 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1389 inst->predicate = BRW_PREDICATE_NORMAL;
1390 break;
1391 }
1392 case ir_unop_find_lsb:
1393 emit(FBL(result_dst, op[0]));
1394 break;
1395 case ir_unop_saturate:
1396 inst = emit(MOV(result_dst, op[0]));
1397 inst->saturate = true;
1398 break;
1399
1400 case ir_unop_noise:
1401 unreachable("not reached: should be handled by lower_noise");
1402
1403 case ir_binop_add:
1404 emit(ADD(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_sub:
1407 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1408
1409 case ir_binop_mul:
1410 if (brw->gen < 8 && ir->type->is_integer()) {
1411 /* For integer multiplication, the MUL uses the low 16 bits of one of
1412 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1413 * accumulates in the contribution of the upper 16 bits of that
1414 * operand. If we can determine that one of the args is in the low
1415 * 16 bits, though, we can just emit a single MUL.
1416 */
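	 /* For example, "a * 7" (7 fits in 16 bits) becomes a single
	  *
	  *    mul  dst, a, 7       (operands swapped before IVB)
	  *
	  * while a general "a * b" needs the full
	  *
	  *    mul  acc, a, b
	  *    mach null, a, b
	  *    mov  dst, acc
	  *
	  * sequence emitted in the else branch below.
	  */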
1417 if (ir->operands[0]->is_uint16_constant()) {
1418 if (brw->gen < 7)
1419 emit(MUL(result_dst, op[0], op[1]));
1420 else
1421 emit(MUL(result_dst, op[1], op[0]));
1422 } else if (ir->operands[1]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[1], op[0]));
1425 else
1426 emit(MUL(result_dst, op[0], op[1]));
1427 } else {
1428 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1429
1430 emit(MUL(acc, op[0], op[1]));
1431 emit(MACH(dst_null_d(), op[0], op[1]));
1432 emit(MOV(result_dst, src_reg(acc)));
1433 }
1434 } else {
1435 emit(MUL(result_dst, op[0], op[1]));
1436 }
1437 break;
1438 case ir_binop_imul_high: {
1439 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1440
1441 emit(MUL(acc, op[0], op[1]));
1442 emit(MACH(result_dst, op[0], op[1]));
1443 break;
1444 }
1445 case ir_binop_div:
1446 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1447 assert(ir->type->is_integer());
1448 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1449 break;
1450 case ir_binop_carry: {
1451 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1452
1453 emit(ADDC(dst_null_ud(), op[0], op[1]));
1454 emit(MOV(result_dst, src_reg(acc)));
1455 break;
1456 }
1457 case ir_binop_borrow: {
1458 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1459
1460 emit(SUBB(dst_null_ud(), op[0], op[1]));
1461 emit(MOV(result_dst, src_reg(acc)));
1462 break;
1463 }
1464 case ir_binop_mod:
1465 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1466 assert(ir->type->is_integer());
1467 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1468 break;
1469
1470 case ir_binop_less:
1471 case ir_binop_greater:
1472 case ir_binop_lequal:
1473 case ir_binop_gequal:
1474 case ir_binop_equal:
1475 case ir_binop_nequal: {
1476 emit(CMP(result_dst, op[0], op[1],
1477 brw_conditional_for_comparison(ir->operation)));
1478 if (ctx->Const.UniformBooleanTrue == 1) {
1479 emit(AND(result_dst, result_src, src_reg(1)));
1480 }
1481 break;
1482 }
1483
1484 case ir_binop_all_equal:
1485 /* "==" operator producing a scalar boolean. */
1486 if (ir->operands[0]->type->is_vector() ||
1487 ir->operands[1]->type->is_vector()) {
1488 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1494 if (ctx->Const.UniformBooleanTrue == 1) {
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 }
1497 }
1498 break;
1499 case ir_binop_any_nequal:
1500 /* "!=" operator producing a scalar boolean. */
1501 if (ir->operands[0]->type->is_vector() ||
1502 ir->operands[1]->type->is_vector()) {
1503 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1504
1505 emit(MOV(result_dst, src_reg(0)));
1506 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1507 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1508 } else {
1509 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1510 if (ctx->Const.UniformBooleanTrue == 1) {
1511 emit(AND(result_dst, result_src, src_reg(1)));
1512 }
1513 }
1514 break;
1515
1516 case ir_unop_any:
1517 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1518 emit(MOV(result_dst, src_reg(0)));
1519
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1522 break;
1523
1524 case ir_binop_logic_xor:
1525 emit(XOR(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_logic_or:
1529 emit(OR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_logic_and:
1533 emit(AND(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_dot:
1537 assert(ir->operands[0]->type->is_vector());
1538 assert(ir->operands[0]->type == ir->operands[1]->type);
1539 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1540 break;
1541
1542 case ir_unop_sqrt:
1543 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1544 break;
1545 case ir_unop_rsq:
1546 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1547 break;
1548
1549 case ir_unop_bitcast_i2f:
1550 case ir_unop_bitcast_u2f:
1551 this->result = op[0];
1552 this->result.type = BRW_REGISTER_TYPE_F;
1553 break;
1554
1555 case ir_unop_bitcast_f2i:
1556 this->result = op[0];
1557 this->result.type = BRW_REGISTER_TYPE_D;
1558 break;
1559
1560 case ir_unop_bitcast_f2u:
1561 this->result = op[0];
1562 this->result.type = BRW_REGISTER_TYPE_UD;
1563 break;
1564
1565 case ir_unop_i2f:
1566 case ir_unop_i2u:
1567 case ir_unop_u2i:
1568 case ir_unop_u2f:
1569 case ir_unop_f2i:
1570 case ir_unop_f2u:
1571 emit(MOV(result_dst, op[0]));
1572 break;
1573 case ir_unop_b2i:
1574 if (ctx->Const.UniformBooleanTrue != 1) {
1575 emit(AND(result_dst, op[0], src_reg(1)));
1576 } else {
1577 emit(MOV(result_dst, op[0]));
1578 }
1579 break;
1580 case ir_unop_b2f:
1581 if (ctx->Const.UniformBooleanTrue != 1) {
1582 op[0].type = BRW_REGISTER_TYPE_UD;
1583 result_dst.type = BRW_REGISTER_TYPE_UD;
1584 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1585 result_dst.type = BRW_REGISTER_TYPE_F;
1586 } else {
1587 emit(MOV(result_dst, op[0]));
1588 }
1589 break;
1590 case ir_unop_f2b:
1591 case ir_unop_i2b:
1592 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1593 if (ctx->Const.UniformBooleanTrue == 1) {
1594 emit(AND(result_dst, result_src, src_reg(1)));
1595 }
1596 break;
1597
1598 case ir_unop_trunc:
1599 emit(RNDZ(result_dst, op[0]));
1600 break;
1601 case ir_unop_ceil:
1602 op[0].negate = !op[0].negate;
1603 inst = emit(RNDD(result_dst, op[0]));
1604 this->result.negate = true;
1605 break;
1606 case ir_unop_floor:
1607 inst = emit(RNDD(result_dst, op[0]));
1608 break;
1609 case ir_unop_fract:
1610 inst = emit(FRC(result_dst, op[0]));
1611 break;
1612 case ir_unop_round_even:
1613 emit(RNDE(result_dst, op[0]));
1614 break;
1615
1616 case ir_binop_min:
1617 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1618 break;
1619 case ir_binop_max:
1620 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1621 break;
1622
1623 case ir_binop_pow:
1624 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1625 break;
1626
1627 case ir_unop_bit_not:
1628 inst = emit(NOT(result_dst, op[0]));
1629 break;
1630 case ir_binop_bit_and:
1631 inst = emit(AND(result_dst, op[0], op[1]));
1632 break;
1633 case ir_binop_bit_xor:
1634 inst = emit(XOR(result_dst, op[0], op[1]));
1635 break;
1636 case ir_binop_bit_or:
1637 inst = emit(OR(result_dst, op[0], op[1]));
1638 break;
1639
1640 case ir_binop_lshift:
1641 inst = emit(SHL(result_dst, op[0], op[1]));
1642 break;
1643
1644 case ir_binop_rshift:
1645 if (ir->type->base_type == GLSL_TYPE_INT)
1646 inst = emit(ASR(result_dst, op[0], op[1]));
1647 else
1648 inst = emit(SHR(result_dst, op[0], op[1]));
1649 break;
1650
1651 case ir_binop_bfm:
1652 emit(BFI1(result_dst, op[0], op[1]));
1653 break;
1654
1655 case ir_binop_ubo_load: {
1656 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1657 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1658 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1659 src_reg offset;
1660
1661 /* Now, load the vector from that offset. */
1662 assert(ir->type->is_vector() || ir->type->is_scalar());
1663
1664 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1665 packed_consts.type = result.type;
1666 src_reg surf_index;
1667
1668 if (const_uniform_block) {
1669 /* The block index is a constant, so just emit the binding table entry
1670 * as an immediate.
1671 */
1672 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1673 const_uniform_block->value.u[0]);
1674 } else {
1675 /* The block index is not a constant. Evaluate the index expression
1676 * per-channel and add the base UBO index; the generator will select
1677 * a value from any live channel.
1678 */
1679 surf_index = src_reg(this, glsl_type::uint_type);
1680 emit(ADD(dst_reg(surf_index), op[0],
1681 src_reg(prog_data->base.binding_table.ubo_start)));
1682
1683 /* Assume this may touch any UBO. It would be nice to provide
1684 * a tighter bound, but the array information is already lowered away.
1685 */
1686 brw_mark_surface_used(&prog_data->base,
1687 prog_data->base.binding_table.ubo_start +
1688 shader_prog->NumUniformBlocks - 1);
1689 }
1690
1691 if (const_offset_ir) {
1692 if (brw->gen >= 8) {
1693 /* Store the offset in a GRF so we can send-from-GRF. */
1694 offset = src_reg(this, glsl_type::int_type);
1695 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1696 } else {
1697 /* Immediates are fine on older generations since they'll be moved
1698 * to a (potentially fake) MRF at the generator level.
1699 */
1700 offset = src_reg(const_offset / 16);
1701 }
1702 } else {
1703 offset = src_reg(this, glsl_type::uint_type);
1704 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1705 }
1706
1707 if (brw->gen >= 7) {
1708 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1709 grf_offset.type = offset.type;
1710
1711 emit(MOV(grf_offset, offset));
1712
1713 emit(new(mem_ctx) vec4_instruction(this,
1714 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1715 dst_reg(packed_consts),
1716 surf_index,
1717 src_reg(grf_offset)));
1718 } else {
1719 vec4_instruction *pull =
1720 emit(new(mem_ctx) vec4_instruction(this,
1721 VS_OPCODE_PULL_CONSTANT_LOAD,
1722 dst_reg(packed_consts),
1723 surf_index,
1724 offset));
1725 pull->base_mrf = 14;
1726 pull->mlen = 1;
1727 }
1728
1729 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1730 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1731 const_offset % 16 / 4,
1732 const_offset % 16 / 4,
1733 const_offset % 16 / 4);
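      /* The pull-constant load always fetches the 16-byte-aligned vec4 that
       * contains the requested offset, so the component is picked by
       * swizzling.  E.g. (hypothetically) const_offset == 20 reads the vec4
       * at byte 16, and 20 % 16 / 4 == 1 shifts the base swizzle over by one
       * component so a scalar is taken from .y.
       */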
1734
1735 /* UBO bools are any nonzero int. We need to convert them to use the
1736 * value of true stored in ctx->Const.UniformBooleanTrue.
1737 */
1738 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1739 emit(CMP(result_dst, packed_consts, src_reg(0u),
1740 BRW_CONDITIONAL_NZ));
1741 if (ctx->Const.UniformBooleanTrue == 1) {
1742 emit(AND(result_dst, result, src_reg(1)));
1743 }
1744 } else {
1745 emit(MOV(result_dst, packed_consts));
1746 }
1747 break;
1748 }
1749
1750 case ir_binop_vector_extract:
1751 unreachable("should have been lowered by vec_index_to_cond_assign");
1752
1753 case ir_triop_fma:
1754 op[0] = fix_3src_operand(op[0]);
1755 op[1] = fix_3src_operand(op[1]);
1756 op[2] = fix_3src_operand(op[2]);
1757 /* Note that the instruction's argument order is reversed from GLSL
1758 * and the IR.
1759 */
1760 emit(MAD(result_dst, op[2], op[1], op[0]));
1761 break;
1762
1763 case ir_triop_lrp:
1764 emit_lrp(result_dst, op[0], op[1], op[2]);
1765 break;
1766
1767 case ir_triop_csel:
1768 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1769 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1770 inst->predicate = BRW_PREDICATE_NORMAL;
1771 break;
1772
1773 case ir_triop_bfi:
1774 op[0] = fix_3src_operand(op[0]);
1775 op[1] = fix_3src_operand(op[1]);
1776 op[2] = fix_3src_operand(op[2]);
1777 emit(BFI2(result_dst, op[0], op[1], op[2]));
1778 break;
1779
1780 case ir_triop_bitfield_extract:
1781 op[0] = fix_3src_operand(op[0]);
1782 op[1] = fix_3src_operand(op[1]);
1783 op[2] = fix_3src_operand(op[2]);
1784 /* Note that the instruction's argument order is reversed from GLSL
1785 * and the IR.
1786 */
1787 emit(BFE(result_dst, op[2], op[1], op[0]));
1788 break;
1789
1790 case ir_triop_vector_insert:
1791 unreachable("should have been lowered by lower_vector_insert");
1792
1793 case ir_quadop_bitfield_insert:
1794 unreachable("not reached: should be handled by "
1795 "bitfield_insert_to_bfm_bfi\n");
1796
1797 case ir_quadop_vector:
1798 unreachable("not reached: should be handled by lower_quadop_vector");
1799
1800 case ir_unop_pack_half_2x16:
1801 emit_pack_half_2x16(result_dst, op[0]);
1802 break;
1803 case ir_unop_unpack_half_2x16:
1804 emit_unpack_half_2x16(result_dst, op[0]);
1805 break;
1806 case ir_unop_pack_snorm_2x16:
1807 case ir_unop_pack_snorm_4x8:
1808 case ir_unop_pack_unorm_2x16:
1809 case ir_unop_pack_unorm_4x8:
1810 case ir_unop_unpack_snorm_2x16:
1811 case ir_unop_unpack_snorm_4x8:
1812 case ir_unop_unpack_unorm_2x16:
1813 case ir_unop_unpack_unorm_4x8:
1814 unreachable("not reached: should be handled by lower_packing_builtins");
1815 case ir_unop_unpack_half_2x16_split_x:
1816 case ir_unop_unpack_half_2x16_split_y:
1817 case ir_binop_pack_half_2x16_split:
1818 case ir_unop_interpolate_at_centroid:
1819 case ir_binop_interpolate_at_sample:
1820 case ir_binop_interpolate_at_offset:
1821 unreachable("not reached: should not occur in vertex shader");
1822 case ir_binop_ldexp:
1823 unreachable("not reached: should be handled by ldexp_to_arith()");
1824 }
1825 }
1826
1827
1828 void
1829 vec4_visitor::visit(ir_swizzle *ir)
1830 {
1831 src_reg src;
1832 int i = 0;
1833 int swizzle[4];
1834
1835 /* Note that this is only swizzles in expressions, not those on the left
1836 * hand side of an assignment, which do write masking. See ir_assignment
1837 * for that.
1838 */
1839
1840 ir->val->accept(this);
1841 src = this->result;
1842 assert(src.file != BAD_FILE);
1843
1844 for (i = 0; i < ir->type->vector_elements; i++) {
1845 switch (i) {
1846 case 0:
1847 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1848 break;
1849 case 1:
1850 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1851 break;
1852 case 2:
1853 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1854 break;
1855 case 3:
1856 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1857 break;
1858 }
1859 }
1860 for (; i < 4; i++) {
1861 /* Replicate the last channel out. */
1862 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1863 }
1864
1865 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1866
1867 this->result = src;
1868 }
1869
1870 void
1871 vec4_visitor::visit(ir_dereference_variable *ir)
1872 {
1873 const struct glsl_type *type = ir->type;
1874 dst_reg *reg = variable_storage(ir->var);
1875
1876 if (!reg) {
1877 fail("Failed to find variable storage for %s\n", ir->var->name);
1878 this->result = src_reg(brw_null_reg());
1879 return;
1880 }
1881
1882 this->result = src_reg(*reg);
1883
1884 /* System values get their swizzle from the dst_reg writemask */
1885 if (ir->var->data.mode == ir_var_system_value)
1886 return;
1887
1888 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1889 this->result.swizzle = swizzle_for_size(type->vector_elements);
1890 }
1891
1892
1893 int
1894 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1895 {
1896 /* Under normal circumstances array elements are stored consecutively, so
1897 * the stride is equal to the size of the array element.
1898 */
1899 return type_size(ir->type);
1900 }
1901
1902
1903 void
1904 vec4_visitor::visit(ir_dereference_array *ir)
1905 {
1906 ir_constant *constant_index;
1907 src_reg src;
1908 int array_stride = compute_array_stride(ir);
1909
1910 constant_index = ir->array_index->constant_expression_value();
1911
1912 ir->array->accept(this);
1913 src = this->result;
1914
1915 if (constant_index) {
1916 src.reg_offset += constant_index->value.i[0] * array_stride;
1917 } else {
1918 /* Variable index array dereference. It eats the "vec4" of the
1919 * base of the array and an index that offsets the Mesa register
1920 * index.
1921 */
1922 ir->array_index->accept(this);
1923
1924 src_reg index_reg;
1925
1926 if (array_stride == 1) {
1927 index_reg = this->result;
1928 } else {
1929 index_reg = src_reg(this, glsl_type::int_type);
1930
1931 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1932 }
1933
1934 if (src.reladdr) {
1935 src_reg temp = src_reg(this, glsl_type::int_type);
1936
1937 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1938
1939 index_reg = temp;
1940 }
1941
1942 src.reladdr = ralloc(mem_ctx, src_reg);
1943 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1944 }
1945
1946 /* If the type is smaller than a vec4, replicate the last channel out. */
1947 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1948 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1949 else
1950 src.swizzle = BRW_SWIZZLE_NOOP;
1951 src.type = brw_type_for_base_type(ir->type);
1952
1953 this->result = src;
1954 }
1955
1956 void
1957 vec4_visitor::visit(ir_dereference_record *ir)
1958 {
1959 unsigned int i;
1960 const glsl_type *struct_type = ir->record->type;
1961 int offset = 0;
1962
1963 ir->record->accept(this);
1964
1965 for (i = 0; i < struct_type->length; i++) {
1966 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1967 break;
1968 offset += type_size(struct_type->fields.structure[i].type);
1969 }
1970
1971 /* If the type is smaller than a vec4, replicate the last channel out. */
1972 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1973 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1974 else
1975 this->result.swizzle = BRW_SWIZZLE_NOOP;
1976 this->result.type = brw_type_for_base_type(ir->type);
1977
1978 this->result.reg_offset += offset;
1979 }
1980
1981 /**
1982 * We want to be careful in assignment setup to hit the actual storage
1983 * instead of potentially using a temporary like we might with the
1984 * ir_dereference handler.
1985 */
1986 static dst_reg
1987 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1988 {
1989 /* The LHS must be a dereference. If the LHS is a variable indexed array
1990 * access of a vector, it must be separated into a series of conditional moves
1991 * before reaching this point (see ir_vec_index_to_cond_assign).
1992 */
1993 assert(ir->as_dereference());
1994 ir_dereference_array *deref_array = ir->as_dereference_array();
1995 if (deref_array) {
1996 assert(!deref_array->array->type->is_vector());
1997 }
1998
1999 /* Use the rvalue deref handler for the most part. We ignore its
2000 * swizzles here; LHS swizzles are expressed through the writemask instead.
2001 */
2002 ir->accept(v);
2003 return dst_reg(v->result);
2004 }
2005
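/* Copy an aggregate value by recursing through struct fields, array
* elements and matrix columns, emitting one (optionally predicated) MOV
* per scalar/vector leaf and advancing both reg_offsets as it goes.
*/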
2006 void
2007 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2008 const struct glsl_type *type,
2009 enum brw_predicate predicate)
2010 {
2011 if (type->base_type == GLSL_TYPE_STRUCT) {
2012 for (unsigned int i = 0; i < type->length; i++) {
2013 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2014 }
2015 return;
2016 }
2017
2018 if (type->is_array()) {
2019 for (unsigned int i = 0; i < type->length; i++) {
2020 emit_block_move(dst, src, type->fields.array, predicate);
2021 }
2022 return;
2023 }
2024
2025 if (type->is_matrix()) {
2026 const struct glsl_type *vec_type;
2027
2028 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2029 type->vector_elements, 1);
2030
2031 for (int i = 0; i < type->matrix_columns; i++) {
2032 emit_block_move(dst, src, vec_type, predicate);
2033 }
2034 return;
2035 }
2036
2037 assert(type->is_scalar() || type->is_vector());
2038
2039 dst->type = brw_type_for_base_type(type);
2040 src->type = dst->type;
2041
2042 dst->writemask = (1 << type->vector_elements) - 1;
2043
2044 src->swizzle = swizzle_for_size(type->vector_elements);
2045
2046 vec4_instruction *inst = emit(MOV(*dst, *src));
2047 inst->predicate = predicate;
2048
2049 dst->reg_offset++;
2050 src->reg_offset++;
2051 }
2052
2053
2054 /* If the RHS processing resulted in an instruction generating a
2055 * temporary value, and it would be easy to rewrite the instruction to
2056 * generate its result right into the LHS instead, do so. This ends
2057 * up reliably removing instructions where it can be tricky to do so
2058 * later without real UD chain information.
2059 */
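/* For example, an ADD that wrote a temporary followed by a MOV of that
* temporary into the LHS becomes a single ADD writing the LHS directly,
* with its writemask narrowed to the channels actually assigned.
*/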
2060 bool
2061 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2062 dst_reg dst,
2063 src_reg src,
2064 vec4_instruction *pre_rhs_inst,
2065 vec4_instruction *last_rhs_inst)
2066 {
2067 /* This could be supported, but it would take more smarts. */
2068 if (ir->condition)
2069 return false;
2070
2071 if (pre_rhs_inst == last_rhs_inst)
2072 return false; /* No instructions generated to work with. */
2073
2074 /* Make sure the last instruction generated our source reg. */
2075 if (src.file != GRF ||
2076 src.file != last_rhs_inst->dst.file ||
2077 src.reg != last_rhs_inst->dst.reg ||
2078 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2079 src.reladdr ||
2080 src.abs ||
2081 src.negate ||
2082 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2083 return false;
2084
2085 /* Check that the last instruction fully initialized the channels
2086 * we want to use, in the order we want to use them. We could
2087 * potentially reswizzle the operands of many instructions to handle
2088 * out-of-order channels, but we don't do that yet.
2089 */
2090
2091 for (unsigned i = 0; i < 4; i++) {
2092 if (dst.writemask & (1 << i)) {
2093 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2094 return false;
2095
2096 if (BRW_GET_SWZ(src.swizzle, i) != i)
2097 return false;
2098 }
2099 }
2100
2101 /* Success! Rewrite the instruction. */
2102 last_rhs_inst->dst.file = dst.file;
2103 last_rhs_inst->dst.reg = dst.reg;
2104 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2105 last_rhs_inst->dst.reladdr = dst.reladdr;
2106 last_rhs_inst->dst.writemask &= dst.writemask;
2107
2108 return true;
2109 }
2110
2111 void
2112 vec4_visitor::visit(ir_assignment *ir)
2113 {
2114 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2115 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2116
2117 if (!ir->lhs->type->is_scalar() &&
2118 !ir->lhs->type->is_vector()) {
2119 ir->rhs->accept(this);
2120 src_reg src = this->result;
2121
2122 if (ir->condition) {
2123 emit_bool_to_cond_code(ir->condition, &predicate);
2124 }
2125
2126 /* emit_block_move doesn't account for swizzles in the source register.
2127 * This should be ok, since the source register is a structure or an
2128 * array, and those can't be swizzled. But double-check to be sure.
2129 */
2130 assert(src.swizzle ==
2131 (ir->rhs->type->is_matrix()
2132 ? swizzle_for_size(ir->rhs->type->vector_elements)
2133 : BRW_SWIZZLE_NOOP));
2134
2135 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2136 return;
2137 }
2138
2139 /* Now we're down to just a scalar/vector with writemasks. */
2140 int i;
2141
2142 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2143 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2144
2145 ir->rhs->accept(this);
2146
2147 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2148
2149 src_reg src = this->result;
2150
2151 int swizzles[4];
2152 int first_enabled_chan = 0;
2153 int src_chan = 0;
2154
2155 assert(ir->lhs->type->is_vector() ||
2156 ir->lhs->type->is_scalar());
2157 dst.writemask = ir->write_mask;
2158
2159 for (int i = 0; i < 4; i++) {
2160 if (dst.writemask & (1 << i)) {
2161 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2162 break;
2163 }
2164 }
2165
2166 /* Swizzle a small RHS vector into the channels being written.
2167 *
2168 * GLSL IR treats write_mask as dictating how many channels are
2169 * present on the RHS, while our instructions need those channels
2170 * to appear in the slots of the vec4 they're written to.
2171 */
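/* For example, an assignment to .yw with a two-component RHS puts RHS
* channel 0 into slot y and channel 1 into slot w; the unwritten slots
* get a don't-care value (the first enabled channel), since the writemask
* excludes them anyway.
*/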
2172 for (int i = 0; i < 4; i++) {
2173 if (dst.writemask & (1 << i))
2174 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2175 else
2176 swizzles[i] = first_enabled_chan;
2177 }
2178 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2179 swizzles[2], swizzles[3]);
2180
2181 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2182 return;
2183 }
2184
2185 if (ir->condition) {
2186 emit_bool_to_cond_code(ir->condition, &predicate);
2187 }
2188
2189 for (i = 0; i < type_size(ir->lhs->type); i++) {
2190 vec4_instruction *inst = emit(MOV(dst, src));
2191 inst->predicate = predicate;
2192
2193 dst.reg_offset++;
2194 src.reg_offset++;
2195 }
2196 }
2197
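/* Emit immediate MOVs for an ir_constant, recursing through structs,
* arrays and matrices. For scalar/vector leaves, components with equal
* values share one writemasked MOV (e.g. vec4(0.5, 1.5, 1.5, 1.5) needs
* only two MOVs).
*/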
2198 void
2199 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2200 {
2201 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2202 foreach_in_list(ir_constant, field_value, &ir->components) {
2203 emit_constant_values(dst, field_value);
2204 }
2205 return;
2206 }
2207
2208 if (ir->type->is_array()) {
2209 for (unsigned int i = 0; i < ir->type->length; i++) {
2210 emit_constant_values(dst, ir->array_elements[i]);
2211 }
2212 return;
2213 }
2214
2215 if (ir->type->is_matrix()) {
2216 for (int i = 0; i < ir->type->matrix_columns; i++) {
2217 float *vec = &ir->value.f[i * ir->type->vector_elements];
2218
2219 for (int j = 0; j < ir->type->vector_elements; j++) {
2220 dst->writemask = 1 << j;
2221 dst->type = BRW_REGISTER_TYPE_F;
2222
2223 emit(MOV(*dst, src_reg(vec[j])));
2224 }
2225 dst->reg_offset++;
2226 }
2227 return;
2228 }
2229
2230 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2231
2232 for (int i = 0; i < ir->type->vector_elements; i++) {
2233 if (!(remaining_writemask & (1 << i)))
2234 continue;
2235
2236 dst->writemask = 1 << i;
2237 dst->type = brw_type_for_base_type(ir->type);
2238
2239 /* Find other components that match the one we're about to
2240 * write. Emits fewer instructions for things like vec4(0.5,
2241 * 1.5, 1.5, 1.5).
2242 */
2243 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2244 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2245 if (ir->value.b[i] == ir->value.b[j])
2246 dst->writemask |= (1 << j);
2247 } else {
2248 /* u, i, and f storage all line up, so no need for a
2249 * switch case for comparing each type.
2250 */
2251 if (ir->value.u[i] == ir->value.u[j])
2252 dst->writemask |= (1 << j);
2253 }
2254 }
2255
2256 switch (ir->type->base_type) {
2257 case GLSL_TYPE_FLOAT:
2258 emit(MOV(*dst, src_reg(ir->value.f[i])));
2259 break;
2260 case GLSL_TYPE_INT:
2261 emit(MOV(*dst, src_reg(ir->value.i[i])));
2262 break;
2263 case GLSL_TYPE_UINT:
2264 emit(MOV(*dst, src_reg(ir->value.u[i])));
2265 break;
2266 case GLSL_TYPE_BOOL:
2267 emit(MOV(*dst,
2268 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2269 : 0)));
2270 break;
2271 default:
2272 unreachable("Non-float/uint/int/bool constant");
2273 }
2274
2275 remaining_writemask &= ~dst->writemask;
2276 }
2277 dst->reg_offset++;
2278 }
2279
2280 void
2281 vec4_visitor::visit(ir_constant *ir)
2282 {
2283 dst_reg dst = dst_reg(this, ir->type);
2284 this->result = src_reg(dst);
2285
2286 emit_constant_values(&dst, ir);
2287 }
2288
2289 void
2290 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2291 {
2292 ir_dereference *deref = static_cast<ir_dereference *>(
2293 ir->actual_parameters.get_head());
2294 ir_variable *location = deref->variable_referenced();
2295 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2296 location->data.binding);
2297
2298 /* Calculate the surface offset */
2299 src_reg offset(this, glsl_type::uint_type);
2300 ir_dereference_array *deref_array = deref->as_dereference_array();
2301 if (deref_array) {
2302 deref_array->array_index->accept(this);
2303
2304 src_reg tmp(this, glsl_type::uint_type);
2305 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2306 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2307 } else {
2308 offset = location->data.atomic.offset;
2309 }
2310
2311 /* Emit the appropriate machine instruction */
2312 const char *callee = ir->callee->function_name();
2313 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2314
2315 if (!strcmp("__intrinsic_atomic_read", callee)) {
2316 emit_untyped_surface_read(surf_index, dst, offset);
2317
2318 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2319 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2320 src_reg(), src_reg());
2321
2322 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2323 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2324 src_reg(), src_reg());
2325 }
2326 }
2327
2328 void
2329 vec4_visitor::visit(ir_call *ir)
2330 {
2331 const char *callee = ir->callee->function_name();
2332
2333 if (!strcmp("__intrinsic_atomic_read", callee) ||
2334 !strcmp("__intrinsic_atomic_increment", callee) ||
2335 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2336 visit_atomic_counter_intrinsic(ir);
2337 } else {
2338 unreachable("Unsupported intrinsic.");
2339 }
2340 }
2341
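/* Fetch the MCS (multisample control surface) word for a compressed
* multisample surface by sending a TXF_MCS message with just the texel
* coordinate; the result is later packed into the TXF_CMS payload.
*/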
2342 src_reg
2343 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2344 {
2345 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2346 inst->base_mrf = 2;
2347 inst->mlen = 1;
2348 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2349 inst->dst.writemask = WRITEMASK_XYZW;
2350
2351 inst->src[1] = sampler;
2352
2353 /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2354 int param_base = inst->base_mrf;
2355 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2356 int zero_mask = 0xf & ~coord_mask;
2357
2358 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2359 coordinate));
2360
2361 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2362 src_reg(0)));
2363
2364 emit(inst);
2365 return src_reg(inst->dst);
2366 }
2367
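/* Return true when the sampler index might not fit in the 4-bit sampler
* field of the message descriptor (a non-immediate index, or an immediate
* of 16 or more) and therefore needs the message header. This can only
* happen on Haswell and Gen8+.
*/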
2368 static bool
2369 is_high_sampler(struct brw_context *brw, src_reg sampler)
2370 {
2371 if (brw->gen < 8 && !brw->is_haswell)
2372 return false;
2373
2374 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2375 }
2376
2377 void
2378 vec4_visitor::visit(ir_texture *ir)
2379 {
2380 uint32_t sampler =
2381 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2382
2383 ir_rvalue *nonconst_sampler_index =
2384 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2385
2386 /* Handle non-constant sampler array indexing */
2387 src_reg sampler_reg;
2388 if (nonconst_sampler_index) {
2389 /* The highest sampler which may be used by this operation is
2390 * the last element of the array. Mark it here, because the generator
2391 * doesn't have enough information to determine the bound.
2392 */
2393 uint32_t array_size = ir->sampler->as_dereference_array()
2394 ->array->type->array_size();
2395
2396 uint32_t max_used = sampler + array_size - 1;
2397 if (ir->op == ir_tg4 && brw->gen < 8) {
2398 max_used += prog_data->base.binding_table.gather_texture_start;
2399 } else {
2400 max_used += prog_data->base.binding_table.texture_start;
2401 }
2402
2403 brw_mark_surface_used(&prog_data->base, max_used);
2404
2405 /* Emit code to evaluate the actual indexing expression */
2406 nonconst_sampler_index->accept(this);
2407 dst_reg temp(this, glsl_type::uint_type);
2408 emit(ADD(temp, this->result, src_reg(sampler)))
2409 ->force_writemask_all = true;
2410 sampler_reg = src_reg(temp);
2411 } else {
2412 /* Single sampler, or constant array index; the indexing expression
2413 * is just an immediate.
2414 */
2415 sampler_reg = src_reg(sampler);
2416 }
2417
2418 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2419 * emitting anything other than setting up the constant result.
2420 */
2421 if (ir->op == ir_tg4) {
2422 ir_constant *chan = ir->lod_info.component->as_constant();
2423 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2424 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2425 dst_reg result(this, ir->type);
2426 this->result = src_reg(result);
2427 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2428 return;
2429 }
2430 }
2431
2432 /* Should be lowered by do_lower_texture_projection */
2433 assert(!ir->projector);
2434
2435 /* Should be lowered */
2436 assert(!ir->offset || !ir->offset->type->is_array());
2437
2438 /* Generate code to compute all the subexpression trees. This has to be
2439 * done before loading any values into MRFs for the sampler message since
2440 * generating these values may involve SEND messages that need the MRFs.
2441 */
2442 src_reg coordinate;
2443 if (ir->coordinate) {
2444 ir->coordinate->accept(this);
2445 coordinate = this->result;
2446 }
2447
2448 src_reg shadow_comparitor;
2449 if (ir->shadow_comparitor) {
2450 ir->shadow_comparitor->accept(this);
2451 shadow_comparitor = this->result;
2452 }
2453
2454 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2455 src_reg offset_value;
2456 if (has_nonconstant_offset) {
2457 ir->offset->accept(this);
2458 offset_value = src_reg(this->result);
2459 }
2460
2461 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2462 src_reg lod, dPdx, dPdy, sample_index, mcs;
2463 switch (ir->op) {
2464 case ir_tex:
2465 lod = src_reg(0.0f);
2466 lod_type = glsl_type::float_type;
2467 break;
2468 case ir_txf:
2469 case ir_txl:
2470 case ir_txs:
2471 ir->lod_info.lod->accept(this);
2472 lod = this->result;
2473 lod_type = ir->lod_info.lod->type;
2474 break;
2475 case ir_query_levels:
2476 lod = src_reg(0);
2477 lod_type = glsl_type::int_type;
2478 break;
2479 case ir_txf_ms:
2480 ir->lod_info.sample_index->accept(this);
2481 sample_index = this->result;
2482 sample_index_type = ir->lod_info.sample_index->type;
2483
2484 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2485 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2486 else
2487 mcs = src_reg(0u);
2488 break;
2489 case ir_txd:
2490 ir->lod_info.grad.dPdx->accept(this);
2491 dPdx = this->result;
2492
2493 ir->lod_info.grad.dPdy->accept(this);
2494 dPdy = this->result;
2495
2496 lod_type = ir->lod_info.grad.dPdx->type;
2497 break;
2498 case ir_txb:
2499 case ir_lod:
2500 case ir_tg4:
2501 break;
2502 }
2503
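/* Map the IR texturing op onto a hardware message. Note that ir_tex uses
* TXL with the explicit 0.0 LOD set up above, since implicit-derivative
* sampling isn't available in a vertex shader.
*/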
2504 enum opcode opcode;
2505 switch (ir->op) {
2506 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2507 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2508 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2509 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2510 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2511 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2512 case ir_tg4: opcode = has_nonconstant_offset
2513 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2514 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2515 case ir_txb:
2516 unreachable("TXB is not valid for vertex shaders.");
2517 case ir_lod:
2518 unreachable("LOD is not valid for vertex shaders.");
2519 default:
2520 unreachable("Unrecognized tex op");
2521 }
2522
2523 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2524
2525 if (ir->offset != NULL && ir->op != ir_txf)
2526 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2527
2528 /* Stuff the channel select bits in the top of the texture offset */
2529 if (ir->op == ir_tg4)
2530 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2531
2532 /* The message header is necessary for:
2533 * - Gen4 (always)
2534 * - Texel offsets
2535 * - Gather channel selection
2536 * - Sampler indices too large to fit in a 4-bit value.
2537 */
2538 inst->header_present =
2539 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2540 is_high_sampler(brw, sampler_reg);
2541 inst->base_mrf = 2;
2542 inst->mlen = inst->header_present + 1; /* always at least one */
2543 inst->dst = dst_reg(this, ir->type);
2544 inst->dst.writemask = WRITEMASK_XYZW;
2545 inst->shadow_compare = ir->shadow_comparitor != NULL;
2546
2547 inst->src[1] = sampler_reg;
2548
2549 /* MRF for the first parameter */
2550 int param_base = inst->base_mrf + inst->header_present;
2551
2552 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2553 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2554 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2555 } else {
2556 /* Load the coordinate */
2557 /* FINISHME: gl_clamp_mask and saturate */
2558 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2559 int zero_mask = 0xf & ~coord_mask;
2560
2561 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2562 coordinate));
2563
2564 if (zero_mask != 0) {
2565 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2566 src_reg(0)));
2567 }
2568 /* Load the shadow comparitor */
2569 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2570 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2571 WRITEMASK_X),
2572 shadow_comparitor));
2573 inst->mlen++;
2574 }
2575
2576 /* Load the LOD info */
2577 if (ir->op == ir_tex || ir->op == ir_txl) {
2578 int mrf, writemask;
2579 if (brw->gen >= 5) {
2580 mrf = param_base + 1;
2581 if (ir->shadow_comparitor) {
2582 writemask = WRITEMASK_Y;
2583 /* mlen already incremented */
2584 } else {
2585 writemask = WRITEMASK_X;
2586 inst->mlen++;
2587 }
2588 } else /* brw->gen == 4 */ {
2589 mrf = param_base;
2590 writemask = WRITEMASK_W;
2591 }
2592 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2593 } else if (ir->op == ir_txf) {
2594 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2595 } else if (ir->op == ir_txf_ms) {
2596 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2597 sample_index));
2598 if (brw->gen >= 7) {
2599 /* MCS data is in the first channel of `mcs`, but we need to get it into
2600 * the .y channel of the second vec4 of params, so replicate .x across
2601 * the whole vec4 and then mask off everything except .y
2602 */
2603 mcs.swizzle = BRW_SWIZZLE_XXXX;
2604 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2605 mcs));
2606 }
2607 inst->mlen++;
2608 } else if (ir->op == ir_txd) {
2609 const glsl_type *type = lod_type;
2610
2611 if (brw->gen >= 5) {
2612 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2613 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2614 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2616 inst->mlen++;
2617
2618 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2619 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2620 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2621 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2622 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2623 inst->mlen++;
2624
2625 if (ir->shadow_comparitor) {
2626 emit(MOV(dst_reg(MRF, param_base + 2,
2627 ir->shadow_comparitor->type, WRITEMASK_Z),
2628 shadow_comparitor));
2629 }
2630 }
2631 } else /* brw->gen == 4 */ {
2632 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2633 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2634 inst->mlen += 2;
2635 }
2636 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2637 if (ir->shadow_comparitor) {
2638 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2639 shadow_comparitor));
2640 }
2641
2642 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2643 offset_value));
2644 inst->mlen++;
2645 }
2646 }
2647
2648 emit(inst);
2649
2650 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2651 * faces * layers, but the spec requires just layers.
2652 */
2653 if (ir->op == ir_txs) {
2654 glsl_type const *type = ir->sampler->type;
2655 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2656 type->sampler_array) {
2657 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2658 writemask(inst->dst, WRITEMASK_Z),
2659 src_reg(inst->dst), src_reg(6));
2660 }
2661 }
2662
2663 if (brw->gen == 6 && ir->op == ir_tg4) {
2664 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2665 }
2666
2667 swizzle_result(ir, src_reg(inst->dst), sampler);
2668 }
2669
2670 /**
2671 * Apply workarounds for Gen6 gather with UINT/SINT
2672 */
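/* The sampler returns UNORM data for these formats, so rescale by
* (2^width - 1) back to an integer, and for signed formats sign-extend by
* shifting the sign bit up to bit 31 and arithmetic-shifting it back down.
*/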
2673 void
2674 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2675 {
2676 if (!wa)
2677 return;
2678
2679 int width = (wa & WA_8BIT) ? 8 : 16;
2680 dst_reg dst_f = dst;
2681 dst_f.type = BRW_REGISTER_TYPE_F;
2682
2683 /* Convert from UNORM to UINT */
2684 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2685 emit(MOV(dst, src_reg(dst_f)));
2686
2687 if (wa & WA_SIGN) {
2688 /* Reinterpret the UINT value as a signed INT value by
2689 * shifting the sign bit into place, then shifting back
2690 * preserving sign.
2691 */
2692 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2693 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2694 }
2695 }
2696
2697 /**
2698 * Set up the gather channel based on the swizzle, for gather4.
2699 */
2700 uint32_t
2701 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2702 {
2703 ir_constant *chan = ir->lod_info.component->as_constant();
2704 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2705 switch (swiz) {
2706 case SWIZZLE_X: return 0;
2707 case SWIZZLE_Y:
2708 /* gather4 sampler is broken for green channel on RG32F --
2709 * we must ask for blue instead.
2710 */
2711 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2712 return 2;
2713 return 1;
2714 case SWIZZLE_Z: return 2;
2715 case SWIZZLE_W: return 3;
2716 default:
2717 unreachable("Not reached"); /* zero, one swizzles handled already */
2718 }
2719 }
2720
2721 void
2722 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2723 {
2724 int s = key->tex.swizzles[sampler];
2725
2726 this->result = src_reg(this, ir->type);
2727 dst_reg swizzled_result(this->result);
2728
2729 if (ir->op == ir_query_levels) {
2730 /* # levels is in .w */
2731 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2732 emit(MOV(swizzled_result, orig_val));
2733 return;
2734 }
2735
2736 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2737 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2738 emit(MOV(swizzled_result, orig_val));
2739 return;
2740 }
2741
2742
2743 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2744 int swizzle[4] = {0};
2745
2746 for (int i = 0; i < 4; i++) {
2747 switch (GET_SWZ(s, i)) {
2748 case SWIZZLE_ZERO:
2749 zero_mask |= (1 << i);
2750 break;
2751 case SWIZZLE_ONE:
2752 one_mask |= (1 << i);
2753 break;
2754 default:
2755 copy_mask |= (1 << i);
2756 swizzle[i] = GET_SWZ(s, i);
2757 break;
2758 }
2759 }
2760
2761 if (copy_mask) {
2762 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2763 swizzled_result.writemask = copy_mask;
2764 emit(MOV(swizzled_result, orig_val));
2765 }
2766
2767 if (zero_mask) {
2768 swizzled_result.writemask = zero_mask;
2769 emit(MOV(swizzled_result, src_reg(0.0f)));
2770 }
2771
2772 if (one_mask) {
2773 swizzled_result.writemask = one_mask;
2774 emit(MOV(swizzled_result, src_reg(1.0f)));
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::visit(ir_return *)
2780 {
2781 unreachable("not reached");
2782 }
2783
2784 void
2785 vec4_visitor::visit(ir_discard *)
2786 {
2787 unreachable("not reached");
2788 }
2789
2790 void
2791 vec4_visitor::visit(ir_if *ir)
2792 {
2793 /* Don't point the annotation at the if statement, because printing it
2794 * would then dump the condition plus the entire then and else blocks.
2795 */
2796 this->base_ir = ir->condition;
2797
2798 if (brw->gen == 6) {
2799 emit_if_gen6(ir);
2800 } else {
2801 enum brw_predicate predicate;
2802 emit_bool_to_cond_code(ir->condition, &predicate);
2803 emit(IF(predicate));
2804 }
2805
2806 visit_instructions(&ir->then_instructions);
2807
2808 if (!ir->else_instructions.is_empty()) {
2809 this->base_ir = ir->condition;
2810 emit(BRW_OPCODE_ELSE);
2811
2812 visit_instructions(&ir->else_instructions);
2813 }
2814
2815 this->base_ir = ir->condition;
2816 emit(BRW_OPCODE_ENDIF);
2817 }
2818
2819 void
2820 vec4_visitor::visit(ir_emit_vertex *)
2821 {
2822 unreachable("not reached");
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_end_primitive *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2833 dst_reg dst, src_reg offset,
2834 src_reg src0, src_reg src1)
2835 {
2836 unsigned mlen = 0;
2837
2838 /* Set the atomic operation offset. */
2839 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2840 mlen++;
2841
2842 /* Set the atomic operation arguments. */
2843 if (src0.file != BAD_FILE) {
2844 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2845 mlen++;
2846 }
2847
2848 if (src1.file != BAD_FILE) {
2849 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2850 mlen++;
2851 }
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped atomic message on Ivy Bridge, but that's OK because
2855 * unused channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2858 src_reg(atomic_op), src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = mlen;
2861 }
2862
2863 void
2864 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2865 src_reg offset)
2866 {
2867 /* Set the surface read offset. */
2868 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2869
2870 /* Emit the instruction. Note that this maps to the normal SIMD8
2871 * untyped surface read message, but that's OK because unused
2872 * channels will be masked out.
2873 */
2874 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2875 dst, src_reg(surf_index));
2876 inst->base_mrf = 0;
2877 inst->mlen = 1;
2878 }
2879
2880 void
2881 vec4_visitor::emit_ndc_computation()
2882 {
2883 /* Get the position */
2884 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2885
2886 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2887 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2888 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2889
2890 current_annotation = "NDC";
2891 dst_reg ndc_w = ndc;
2892 ndc_w.writemask = WRITEMASK_W;
2893 src_reg pos_w = pos;
2894 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2895 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2896
2897 dst_reg ndc_xyz = ndc;
2898 ndc_xyz.writemask = WRITEMASK_XYZ;
2899
2900 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2901 }
2902
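/* Fill out the PSIZ/flags slot of the VUE header. On Gen4-5 this packs
* the point size, user clip flags and the negative-rhw workaround bit into
* one header dword; on Gen6+ the point size, layer and viewport each get
* their own channel of the slot.
*/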
2903 void
2904 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2905 {
2906 if (brw->gen < 6 &&
2907 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2908 key->userclip_active || brw->has_negative_rhw_bug)) {
2909 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2910 dst_reg header1_w = header1;
2911 header1_w.writemask = WRITEMASK_W;
2912
2913 emit(MOV(header1, 0u));
2914
2915 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2916 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2917
2918 current_annotation = "Point size";
2919 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2920 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2921 }
2922
2923 if (key->userclip_active) {
2924 current_annotation = "Clipping flags";
2925 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2926 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2927
2928 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2929 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2930 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2931
2932 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2933 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2934 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2935 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2936 }
2937
2938 /* i965 clipping workaround:
2939 * 1) Test for -ve rhw
2940 * 2) If set,
2941 * set ndc = (0,0,0,0)
2942 * set ucp[6] = 1
2943 *
2944 * Later, clipping will detect ucp[6] and ensure the primitive is
2945 * clipped against all fixed planes.
2946 */
2947 if (brw->has_negative_rhw_bug) {
2948 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2949 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2950 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2951 vec4_instruction *inst;
2952 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2953 inst->predicate = BRW_PREDICATE_NORMAL;
2954 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2955 inst->predicate = BRW_PREDICATE_NORMAL;
2956 }
2957
2958 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2959 } else if (brw->gen < 6) {
2960 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2961 } else {
2962 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2963 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2964 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2965 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2966 }
2967 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2968 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2969 src_reg(output_reg[VARYING_SLOT_LAYER])));
2970 }
2971 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2972 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2973 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2974 }
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2980 {
2981 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2982 *
2983 * "If a linked set of shaders forming the vertex stage contains no
2984 * static write to gl_ClipVertex or gl_ClipDistance, but the
2985 * application has requested clipping against user clip planes through
2986 * the API, then the coordinate written to gl_Position is used for
2987 * comparison against the user clip planes."
2988 *
2989 * This function is only called if the shader didn't write to
2990 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2991 * if the user wrote to it; otherwise we use gl_Position.
2992 */
2993 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2994 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2995 clip_vertex = VARYING_SLOT_POS;
2996 }
2997
2998 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2999 ++i) {
3000 reg.writemask = 1 << i;
3001 emit(DP4(reg,
3002 src_reg(output_reg[clip_vertex]),
3003 src_reg(this->userplane[i + offset])));
3004 }
3005 }
3006
3007 void
3008 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3009 {
3010 assert (varying < VARYING_SLOT_MAX);
3011 reg.type = output_reg[varying].type;
3012 current_annotation = output_reg_annotation[varying];
3013 /* Copy the register, saturating if necessary */
3014 vec4_instruction *inst = emit(MOV(reg,
3015 src_reg(output_reg[varying])));
3016 if ((varying == VARYING_SLOT_COL0 ||
3017 varying == VARYING_SLOT_COL1 ||
3018 varying == VARYING_SLOT_BFC0 ||
3019 varying == VARYING_SLOT_BFC1) &&
3020 key->clamp_vertex_color) {
3021 inst->saturate = true;
3022 }
3023 }
3024
3025 void
3026 vec4_visitor::emit_urb_slot(int mrf, int varying)
3027 {
3028 struct brw_reg hw_reg = brw_message_reg(mrf);
3029 dst_reg reg = dst_reg(MRF, mrf);
3030 reg.type = BRW_REGISTER_TYPE_F;
3031
3032 switch (varying) {
3033 case VARYING_SLOT_PSIZ:
3034 /* PSIZ is always in slot 0, and is coupled with other flags. */
3035 current_annotation = "indices, point width, clip flags";
3036 emit_psiz_and_flags(hw_reg);
3037 break;
3038 case BRW_VARYING_SLOT_NDC:
3039 current_annotation = "NDC";
3040 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3041 break;
3042 case VARYING_SLOT_POS:
3043 current_annotation = "gl_Position";
3044 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3045 break;
3046 case VARYING_SLOT_EDGE:
3047 /* This is present when doing unfilled polygons. We're supposed to copy
3048 * the edge flag from the user-provided vertex array
3049 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3050 * of that attribute (starts as 1.0f). This is then used in clipping to
3051 * determine which edges should be drawn as wireframe.
3052 */
3053 current_annotation = "edge flag";
3054 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3055 glsl_type::float_type, WRITEMASK_XYZW))));
3056 break;
3057 case BRW_VARYING_SLOT_PAD:
3058 /* No need to write to this slot */
3059 break;
3060 default:
3061 emit_generic_urb_slot(reg, varying);
3062 break;
3063 }
3064 }
3065
3066 static int
3067 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3068 {
3069 if (brw->gen >= 6) {
3070 /* URB data written (does not include the message header reg) must
3071 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3072 * section 5.4.3.2.2: URB_INTERLEAVED.
3073 *
3074 * URB entries are allocated on a multiple of 1024 bits, so an
3075 * extra 128 bits written here to make the end align to 256 is
3076 * no problem.
3077 */
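/* mlen counts the message header as well, so the data length is mlen - 1;
* bumping an even mlen to odd (e.g. 4 -> 5) pads the data portion out to
* an even number of registers.
*/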
3078 if ((mlen % 2) != 1)
3079 mlen++;
3080 }
3081
3082 return mlen;
3083 }
3084
3085
3086 /**
3087 * Generates the VUE payload plus the necessary URB write instructions to
3088 * output it.
3089 *
3090 * The VUE layout is documented in Volume 2a.
3091 */
3092 void
3093 vec4_visitor::emit_vertex()
3094 {
3095 /* MRF 0 is reserved for the debugger, so start with message header
3096 * in MRF 1.
3097 */
3098 int base_mrf = 1;
3099 int mrf = base_mrf;
3100 /* In the process of generating our URB write message contents, we
3101 * may need to unspill a register or load from an array. Those
3102 * reads would use MRFs 14-15.
3103 */
3104 int max_usable_mrf = 13;
3105
3106 /* The following assertion verifies that max_usable_mrf causes an
3107 * even-numbered amount of URB write data, which will meet gen6's
3108 * requirements for length alignment.
3109 */
3110 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3111
3112 /* First mrf is the g0-based message header containing URB handles and
3113 * such.
3114 */
3115 emit_urb_write_header(mrf++);
3116
3117 if (brw->gen < 6) {
3118 emit_ndc_computation();
3119 }
3120
3121 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3122 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3123 current_annotation = "user clip distances";
3124
3125 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3126 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3127
3128 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3129 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3130 }
3131
3132 /* We may need to split this up into several URB writes, so do them in a
3133 * loop.
3134 */
3135 int slot = 0;
3136 bool complete = false;
3137 do {
3138 /* URB offset is in URB row increments, and each of our MRFs is half of
3139 * one of those, since we're doing interleaved writes.
3140 */
3141 int offset = slot / 2;
3142
3143 mrf = base_mrf + 1;
3144 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3145 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3146
3147 /* If this was max_usable_mrf, we can't fit anything more into this
3148 * URB WRITE.
3149 */
3150 if (mrf > max_usable_mrf) {
3151 slot++;
3152 break;
3153 }
3154 }
3155
3156 complete = slot >= prog_data->vue_map.num_slots;
3157 current_annotation = "URB write";
3158 vec4_instruction *inst = emit_urb_write_opcode(complete);
3159 inst->base_mrf = base_mrf;
3160 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3161 inst->offset += offset;
3162 } while (!complete);
3163 }
3164
3165
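/* Build the message offset for a scratch access at vec4 index reg_offset,
* optionally plus a relative index. Scratch storage is interleaved like
* vertex data, so the vec4 index is scaled by 2; pre-Gen6 the header takes
* byte offsets, so the total scale becomes 32.
*/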
3166 src_reg
3167 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3168 src_reg *reladdr, int reg_offset)
3169 {
3170 /* Because we store the values to scratch interleaved like our
3171 * vertex data, we need to scale the vec4 index by 2.
3172 */
3173 int message_header_scale = 2;
3174
3175 /* Pre-gen6, the message header uses byte offsets instead of vec4
3176 * (16-byte) offset units.
3177 */
3178 if (brw->gen < 6)
3179 message_header_scale *= 16;
3180
3181 if (reladdr) {
3182 src_reg index = src_reg(this, glsl_type::int_type);
3183
3184 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3185 emit_before(inst, MUL(dst_reg(index),
3186 index, src_reg(message_header_scale)));
3187
3188 return index;
3189 } else {
3190 return src_reg(reg_offset * message_header_scale);
3191 }
3192 }
3193
3194 src_reg
3195 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3196 src_reg *reladdr, int reg_offset)
3197 {
3198 if (reladdr) {
3199 src_reg index = src_reg(this, glsl_type::int_type);
3200
3201 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3202
3203 /* Pre-gen6, the message header uses byte offsets instead of vec4
3204 * (16-byte) offset units.
3205 */
3206 if (brw->gen < 6) {
3207 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3208 }
3209
3210 return index;
3211 } else if (brw->gen >= 8) {
3212 /* Store the offset in a GRF so we can send-from-GRF. */
3213 src_reg offset = src_reg(this, glsl_type::int_type);
3214 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3215 return offset;
3216 } else {
3217 int message_header_scale = brw->gen < 6 ? 16 : 1;
3218 return src_reg(reg_offset * message_header_scale);
3219 }
3220 }
3221
3222 /**
3223 * Emits an instruction before @inst to load the value named by @orig_src
3224 * from scratch space at @base_offset to @temp.
3225 *
3226 * @base_offset is measured in 32-byte units (the size of a register).
3227 */
3228 void
3229 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3230 dst_reg temp, src_reg orig_src,
3231 int base_offset)
3232 {
3233 int reg_offset = base_offset + orig_src.reg_offset;
3234 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3235
3236 emit_before(inst, SCRATCH_READ(temp, index));
3237 }
3238
3239 /**
3240 * Emits an instruction after @inst to store the value to be written
3241 * to @orig_dst to scratch space at @base_offset, from @temp.
3242 *
3243 * @base_offset is measured in 32-byte units (the size of a register).
3244 */
3245 void
3246 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3247 {
3248 int reg_offset = base_offset + inst->dst.reg_offset;
3249 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3250
3251 /* Create a temporary register to store *inst's result in.
3252 *
3253 * We have to be careful in MOVing from our temporary result register in
3254 * the scratch write. If we swizzle from channels of the temporary that
3255 * weren't initialized, it will confuse live interval analysis, which will
3256 * make spilling fail to make progress.
3257 */
3258 src_reg temp = src_reg(this, glsl_type::vec4_type);
3259 temp.type = inst->dst.type;
3260 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3261 int swizzles[4];
3262 for (int i = 0; i < 4; i++)
3263 if (inst->dst.writemask & (1 << i))
3264 swizzles[i] = i;
3265 else
3266 swizzles[i] = first_writemask_chan;
3267 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3268 swizzles[2], swizzles[3]);
3269
3270 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3271 inst->dst.writemask));
3272 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3273 write->predicate = inst->predicate;
3274 write->ir = inst->ir;
3275 write->annotation = inst->annotation;
3276 inst->insert_after(write);
3277
3278 inst->dst.file = temp.file;
3279 inst->dst.reg = temp.reg;
3280 inst->dst.reg_offset = temp.reg_offset;
3281 inst->dst.reladdr = NULL;
3282 }
3283
3284 /**
3285 * We can't generally support array access in GRF space, because a
3286 * single instruction's destination can only span 2 contiguous
3287 * registers. So, we send all GRF arrays that get variable index
3288 * access to scratch space.
3289 */
3290 void
3291 vec4_visitor::move_grf_array_access_to_scratch()
3292 {
3293 int scratch_loc[this->virtual_grf_count];
3294
3295 for (int i = 0; i < this->virtual_grf_count; i++) {
3296 scratch_loc[i] = -1;
3297 }
3298
3299 /* First, calculate the set of virtual GRFs that need to be punted
3300 * to scratch due to having any array access on them, and where in
3301 * scratch.
3302 */
3303 foreach_in_list(vec4_instruction, inst, &instructions) {
3304 if (inst->dst.file == GRF && inst->dst.reladdr &&
3305 scratch_loc[inst->dst.reg] == -1) {
3306 scratch_loc[inst->dst.reg] = c->last_scratch;
3307 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3308 }
3309
3310 for (int i = 0 ; i < 3; i++) {
3311 src_reg *src = &inst->src[i];
3312
3313 if (src->file == GRF && src->reladdr &&
3314 scratch_loc[src->reg] == -1) {
3315 scratch_loc[src->reg] = c->last_scratch;
3316 c->last_scratch += this->virtual_grf_sizes[src->reg];
3317 }
3318 }
3319 }
3320
3321 /* Now, for anything that will be accessed through scratch, rewrite
3322 * it to load/store. Note that this is a _safe list walk, because
3323 * we may generate a new scratch_write instruction after the one
3324 * we're processing.
3325 */
3326 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3327 /* Set up the annotation tracking for newly generated instructions. */
3328 base_ir = inst->ir;
3329 current_annotation = inst->annotation;
3330
3331 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3332 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3333 }
3334
3335 for (int i = 0 ; i < 3; i++) {
3336 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3337 continue;
3338
3339 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3340
3341 emit_scratch_read(inst, temp, inst->src[i],
3342 scratch_loc[inst->src[i].reg]);
3343
3344 inst->src[i].file = temp.file;
3345 inst->src[i].reg = temp.reg;
3346 inst->src[i].reg_offset = temp.reg_offset;
3347 inst->src[i].reladdr = NULL;
3348 }
3349 }
3350 }
3351
3352 /**
3353 * Emits an instruction before @inst to load the value named by @orig_src
3354 * from the pull constant buffer (surface) at @base_offset to @temp.
3355 */
3356 void
3357 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3358 dst_reg temp, src_reg orig_src,
3359 int base_offset)
3360 {
3361 int reg_offset = base_offset + orig_src.reg_offset;
3362 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3363 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3364 vec4_instruction *load;
3365
3366 if (brw->gen >= 7) {
3367 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3368 grf_offset.type = offset.type;
3369 emit_before(inst, MOV(grf_offset, offset));
3370
3371 load = new(mem_ctx) vec4_instruction(this,
3372 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3373 temp, index, src_reg(grf_offset));
3374 } else {
3375 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3376 temp, index, offset);
3377 load->base_mrf = 14;
3378 load->mlen = 1;
3379 }
3380 emit_before(inst, load);
3381 }
3382
3383 /**
3384 * Implements array access of uniforms by inserting a
3385 * PULL_CONSTANT_LOAD instruction.
3386 *
3387 * Unlike temporary GRF array access (where we don't support it due to
3388 * the difficulty of doing relative addressing on instruction
3389 * destinations), we could potentially do array access of uniforms
3390 * that were loaded in GRF space as push constants. In real-world
3391 * usage we've seen, though, the arrays being used are always larger
3392 * than we could load as push constants, so just always move all
3393 * uniform array access out to a pull constant buffer.
3394 */
3395 void
3396 vec4_visitor::move_uniform_array_access_to_pull_constants()
3397 {
3398 int pull_constant_loc[this->uniforms];
3399
3400 for (int i = 0; i < this->uniforms; i++) {
3401 pull_constant_loc[i] = -1;
3402 }
3403
3404 /* Walk through and find array access of uniforms. Put a copy of that
3405 * uniform in the pull constant buffer.
3406 *
3407 * Note that we don't move constant-indexed accesses to arrays. No
3408 * testing has been done of the performance impact of this choice.
3409 */
3410 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3411 for (int i = 0 ; i < 3; i++) {
3412 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3413 continue;
3414
3415 int uniform = inst->src[i].reg;
3416
3417 /* If this array isn't already present in the pull constant buffer,
3418 * add it.
3419 */
3420 if (pull_constant_loc[uniform] == -1) {
3421 const gl_constant_value **values =
3422 &stage_prog_data->param[uniform * 4];
3423
3424 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3425
3426 assert(uniform < uniform_array_size);
3427 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3428 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3429 = values[j];
3430 }
3431 }
3432
3433 /* Set up the annotation tracking for newly generated instructions. */
3434 base_ir = inst->ir;
3435 current_annotation = inst->annotation;
3436
3437 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3438
3439 emit_pull_constant_load(inst, temp, inst->src[i],
3440 pull_constant_loc[uniform]);
3441
3442 inst->src[i].file = temp.file;
3443 inst->src[i].reg = temp.reg;
3444 inst->src[i].reg_offset = temp.reg_offset;
3445 inst->src[i].reladdr = NULL;
3446 }
3447 }
3448
3449 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3450 * no need to track them as larger-than-vec4 objects. This will be
3451 * relied on in cutting out unused uniform vectors from push
3452 * constants.
3453 */
3454 split_uniform_registers();
3455 }
3456
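/* A negate modifier on a UD source is resolved by MOVing the negated
* value into a uvec4 temporary, so later uses see a plain source without
* the modifier.
*/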
3457 void
3458 vec4_visitor::resolve_ud_negate(src_reg *reg)
3459 {
3460 if (reg->type != BRW_REGISTER_TYPE_UD ||
3461 !reg->negate)
3462 return;
3463
3464 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3465 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3466 *reg = temp;
3467 }
3468
3469 vec4_visitor::vec4_visitor(struct brw_context *brw,
3470 struct brw_vec4_compile *c,
3471 struct gl_program *prog,
3472 const struct brw_vec4_prog_key *key,
3473 struct brw_vec4_prog_data *prog_data,
3474 struct gl_shader_program *shader_prog,
3475 gl_shader_stage stage,
3476 void *mem_ctx,
3477 bool debug_flag,
3478 bool no_spills,
3479 shader_time_shader_type st_base,
3480 shader_time_shader_type st_written,
3481 shader_time_shader_type st_reset)
3482 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3483 c(c),
3484 key(key),
3485 prog_data(prog_data),
3486 sanity_param_count(0),
3487 fail_msg(NULL),
3488 first_non_payload_grf(0),
3489 need_all_constants_in_pull_buffer(false),
3490 debug_flag(debug_flag),
3491 no_spills(no_spills),
3492 st_base(st_base),
3493 st_written(st_written),
3494 st_reset(st_reset)
3495 {
3496 this->mem_ctx = mem_ctx;
3497 this->failed = false;
3498
3499 this->base_ir = NULL;
3500 this->current_annotation = NULL;
3501 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3502
3503 this->variable_ht = hash_table_ctor(0,
3504 hash_table_pointer_hash,
3505 hash_table_pointer_compare);
3506
3507 this->virtual_grf_start = NULL;
3508 this->virtual_grf_end = NULL;
3509 this->virtual_grf_sizes = NULL;
3510 this->virtual_grf_count = 0;
3511 this->virtual_grf_reg_map = NULL;
3512 this->virtual_grf_reg_count = 0;
3513 this->virtual_grf_array_size = 0;
3514 this->live_intervals_valid = false;
3515
3516 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3517
3518 this->uniforms = 0;
3519
3520 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3521 * at least one. See setup_uniforms() in brw_vec4.cpp.
3522 */
3523 this->uniform_array_size = 1;
3524 if (prog_data) {
3525 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3526 }
3527
3528 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3529 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3530 }
3531
3532 vec4_visitor::~vec4_visitor()
3533 {
3534 hash_table_dtor(this->variable_ht);
3535 }
3536
3537
3538 void
3539 vec4_visitor::fail(const char *format, ...)
3540 {
3541 va_list va;
3542 char *msg;
3543
3544 if (failed)
3545 return;
3546
3547 failed = true;
3548
3549 va_start(va, format);
3550 msg = ralloc_vasprintf(mem_ctx, format, va);
3551 va_end(va);
3552 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3553
3554 this->fail_msg = msg;
3555
3556 if (debug_flag) {
3557 fprintf(stderr, "%s", msg);
3558 }
3559 }
3560
3561 } /* namespace brw */