i965/vec4: Add support for nonconst sampler indexing in VS visitor
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
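/* For illustration, ALU2(ADD) above expands to roughly the following
 * factory method (a sketch of the macro expansion, not extra code):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; callers still
 * wrap them in emit() to append them to the instruction stream.
 */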
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
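/* Helpers for scratch (spill) access: build gen4-style scratch block
 * read/write messages. base_mrf and mlen reserve the MRF range used for
 * the message payload; as with the ALU helpers above, the caller is
 * expected to emit() the returned instruction.
 */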
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
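/* Emit a dot product of the given width. elements must be 2, 3 or 4, so
 * e.g. a dot of two vec3s maps to DP3 and a dot of two vec4s to DP4.
 */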
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
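/* At the GLSL level, the sequence below implements something like:
 *
 *    uint u = packHalf2x16(vec2(x, y));
 *    // u == (f32to16(y) << 16) | f32to16(x)
 *
 * i.e. the Y half lands in the high 16 bits and the X half in the low 16
 * bits. This is only an illustration; the instructions below operate on
 * all channels of the vec4 execution at once.
 */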
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
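/* Illustrative GLSL-level equivalent of the sequence below:
 *
 *    vec2 v = unpackHalf2x16(u);
 *    // v.x == f16to32(u & 0xffff), v.y == f16to32(u >> 16)
 *
 * i.e. the low 16 bits produce the X component and the high 16 bits the
 * Y component.
 */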
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
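/* Return the size of a glsl_type in vec4 slots. For example: float, vec2
 * and vec4 all take 1 slot; mat3 takes 3 (one per column); vec4[8] takes
 * 8; struct { vec3 a; float b[2]; } takes 3.
 */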
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
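/* Evaluate a boolean rvalue and leave its result in the flag register so
 * that a following predicated instruction can consume it. *predicate is
 * set to the predicate mode the caller should use: BRW_PREDICATE_NORMAL
 * in the common case, or an ALIGN16 ANY4H/ALL4H reduction for the vector
 * comparisons (ir_binop_any_nequal, ir_binop_all_equal, ir_unop_any).
 */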
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
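/* Try to fuse an add of a multiply into a single MAD. For example,
 * x * y + z (or z + x * y) becomes MAD(dst, z, x, y), which computes
 * x * y + z -- the hardware argument order puts the addend first.
 */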
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 ir_expression *const cmp = ir->operands[0]->as_expression();
1139
1140 if (cmp == NULL)
1141 return false;
1142
1143 switch (cmp->operation) {
1144 case ir_binop_less:
1145 case ir_binop_greater:
1146 case ir_binop_lequal:
1147 case ir_binop_gequal:
1148 case ir_binop_equal:
1149 case ir_binop_nequal:
1150 break;
1151
1152 default:
1153 return false;
1154 }
1155
1156 cmp->operands[0]->accept(this);
1157 const src_reg cmp_src0 = this->result;
1158
1159 cmp->operands[1]->accept(this);
1160 const src_reg cmp_src1 = this->result;
1161
1162 this->result = src_reg(this, ir->type);
1163
1164 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1165 brw_conditional_for_comparison(cmp->operation)));
1166
1167 /* If the comparison is false, this->result will just happen to be zero.
1168 */
1169 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1170 this->result, src_reg(1.0f));
1171 inst->predicate = BRW_PREDICATE_NORMAL;
1172 inst->predicate_inverse = true;
1173
1174 return true;
1175 }
1176
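/* Emit a MIN or MAX. On gen6+ a single SEL with a conditional mod (L for
 * min, G for max) is enough; older hardware needs a CMP followed by a
 * predicated SEL.
 */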
1177 void
1178 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1179 src_reg src0, src_reg src1)
1180 {
1181 vec4_instruction *inst;
1182
1183 if (brw->gen >= 6) {
1184 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1185 inst->conditional_mod = conditionalmod;
1186 } else {
1187 emit(CMP(dst, src0, src1, conditionalmod));
1188
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 }
1192 }
1193
1194 void
1195 vec4_visitor::emit_lrp(const dst_reg &dst,
1196 const src_reg &x, const src_reg &y, const src_reg &a)
1197 {
1198 if (brw->gen >= 6) {
1199 /* Note that the instruction's argument order is reversed from GLSL
1200 * and the IR.
1201 */
1202 emit(LRP(dst,
1203 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1204 } else {
1205 /* Earlier generations don't support three source operations, so we
1206 * need to emit x*(1-a) + y*a.
1207 */
1208 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 y_times_a.writemask = dst.writemask;
1212 one_minus_a.writemask = dst.writemask;
1213 x_times_one_minus_a.writemask = dst.writemask;
1214
1215 emit(MUL(y_times_a, y, a));
1216 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1217 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1218 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1219 }
1220 }
1221
1222 void
1223 vec4_visitor::visit(ir_expression *ir)
1224 {
1225 unsigned int operand;
1226 src_reg op[Elements(ir->operands)];
1227 src_reg result_src;
1228 dst_reg result_dst;
1229 vec4_instruction *inst;
1230
1231 if (try_emit_sat(ir))
1232 return;
1233
1234 if (ir->operation == ir_binop_add) {
1235 if (try_emit_mad(ir))
1236 return;
1237 }
1238
1239 if (ir->operation == ir_unop_b2f) {
1240 if (try_emit_b2f_of_compare(ir))
1241 return;
1242 }
1243
1244 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1245 this->result.file = BAD_FILE;
1246 ir->operands[operand]->accept(this);
1247 if (this->result.file == BAD_FILE) {
1248 fprintf(stderr, "Failed to get tree for expression operand:\n");
1249 ir->operands[operand]->fprint(stderr);
1250 exit(1);
1251 }
1252 op[operand] = this->result;
1253
1254 /* Matrix expression operands should have been broken down to vector
1255 * operations already.
1256 */
1257 assert(!ir->operands[operand]->type->is_matrix());
1258 }
1259
1260 int vector_elements = ir->operands[0]->type->vector_elements;
1261 if (ir->operands[1]) {
1262 vector_elements = MAX2(vector_elements,
1263 ir->operands[1]->type->vector_elements);
1264 }
1265
1266 this->result.file = BAD_FILE;
1267
1268 /* Storage for our result. Ideally for an assignment we'd be using
1269 * the actual storage for the result here, instead.
1270 */
1271 result_src = src_reg(this, ir->type);
1272 /* convenience for the emit functions below. */
1273 result_dst = dst_reg(result_src);
1274 /* If nothing special happens, this is the result. */
1275 this->result = result_src;
1276 /* Limit writes to the channels that will be used by result_src later.
1277 * This does limit this temp's use as a temporary for multi-instruction
1278 * sequences.
1279 */
1280 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1281
1282 switch (ir->operation) {
1283 case ir_unop_logic_not:
1284 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1285 * the ones' complement of the whole register, not just bit 0.
1286 */
1287 emit(XOR(result_dst, op[0], src_reg(1)));
1288 break;
1289 case ir_unop_neg:
1290 op[0].negate = !op[0].negate;
1291 emit(MOV(result_dst, op[0]));
1292 break;
1293 case ir_unop_abs:
1294 op[0].abs = true;
1295 op[0].negate = false;
1296 emit(MOV(result_dst, op[0]));
1297 break;
1298
1299 case ir_unop_sign:
1300 if (ir->type->is_float()) {
1301 /* AND(val, 0x80000000) gives the sign bit.
1302 *
1303 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1304 * zero.
1305 */
1306 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1307
1308 op[0].type = BRW_REGISTER_TYPE_UD;
1309 result_dst.type = BRW_REGISTER_TYPE_UD;
1310 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314
1315 this->result.type = BRW_REGISTER_TYPE_F;
1316 } else {
1317 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1318 * -> non-negative val generates 0x00000000.
1319 * Predicated OR sets 1 if val is positive.
1320 */
1321 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1322
1323 emit(ASR(result_dst, op[0], src_reg(31)));
1324
1325 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 }
1328 break;
1329
1330 case ir_unop_rcp:
1331 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_exp2:
1335 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1336 break;
1337 case ir_unop_log2:
1338 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1339 break;
1340 case ir_unop_exp:
1341 case ir_unop_log:
1342 unreachable("not reached: should be handled by ir_explog_to_explog2");
1343 case ir_unop_sin:
1344 case ir_unop_sin_reduced:
1345 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1346 break;
1347 case ir_unop_cos:
1348 case ir_unop_cos_reduced:
1349 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_dFdx:
1353 case ir_unop_dFdx_coarse:
1354 case ir_unop_dFdx_fine:
1355 case ir_unop_dFdy:
1356 case ir_unop_dFdy_coarse:
1357 case ir_unop_dFdy_fine:
1358 unreachable("derivatives not valid in vertex shader");
1359
1360 case ir_unop_bitfield_reverse:
1361 emit(BFREV(result_dst, op[0]));
1362 break;
1363 case ir_unop_bit_count:
1364 emit(CBIT(result_dst, op[0]));
1365 break;
1366 case ir_unop_find_msb: {
1367 src_reg temp = src_reg(this, glsl_type::uint_type);
1368
1369 inst = emit(FBH(dst_reg(temp), op[0]));
1370 inst->dst.writemask = WRITEMASK_XYZW;
1371
1372 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1373 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1374 * subtract the result from 31 to convert the MSB count into an LSB count.
1375 */
1376
1377 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1378 temp.swizzle = BRW_SWIZZLE_NOOP;
1379 emit(MOV(result_dst, temp));
1380
1381 src_reg src_tmp = src_reg(result_dst);
1382 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1383
1384 src_tmp.negate = true;
1385 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 break;
1388 }
1389 case ir_unop_find_lsb:
1390 emit(FBL(result_dst, op[0]));
1391 break;
1392
1393 case ir_unop_noise:
1394 unreachable("not reached: should be handled by lower_noise");
1395
1396 case ir_binop_add:
1397 emit(ADD(result_dst, op[0], op[1]));
1398 break;
1399 case ir_binop_sub:
1400 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1401
1402 case ir_binop_mul:
1403 if (brw->gen < 8 && ir->type->is_integer()) {
1404 /* For integer multiplication, the MUL uses the low 16 bits of one of
1405 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1406 * accumulates the contribution of the upper 16 bits of that
1407 * operand. If we can determine that one of the args is in the low
1408 * 16 bits, though, we can just emit a single MUL.
1409 */
1410 if (ir->operands[0]->is_uint16_constant()) {
1411 if (brw->gen < 7)
1412 emit(MUL(result_dst, op[0], op[1]));
1413 else
1414 emit(MUL(result_dst, op[1], op[0]));
1415 } else if (ir->operands[1]->is_uint16_constant()) {
1416 if (brw->gen < 7)
1417 emit(MUL(result_dst, op[1], op[0]));
1418 else
1419 emit(MUL(result_dst, op[0], op[1]));
1420 } else {
1421 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1422
1423 emit(MUL(acc, op[0], op[1]));
1424 emit(MACH(dst_null_d(), op[0], op[1]));
1425 emit(MOV(result_dst, src_reg(acc)));
1426 }
1427 } else {
1428 emit(MUL(result_dst, op[0], op[1]));
1429 }
1430 break;
1431 case ir_binop_imul_high: {
1432 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1433
1434 emit(MUL(acc, op[0], op[1]));
1435 emit(MACH(result_dst, op[0], op[1]));
1436 break;
1437 }
1438 case ir_binop_div:
1439 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1440 assert(ir->type->is_integer());
1441 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1442 break;
1443 case ir_binop_carry: {
1444 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1445
1446 emit(ADDC(dst_null_ud(), op[0], op[1]));
1447 emit(MOV(result_dst, src_reg(acc)));
1448 break;
1449 }
1450 case ir_binop_borrow: {
1451 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1452
1453 emit(SUBB(dst_null_ud(), op[0], op[1]));
1454 emit(MOV(result_dst, src_reg(acc)));
1455 break;
1456 }
1457 case ir_binop_mod:
1458 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1459 assert(ir->type->is_integer());
1460 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1461 break;
1462
1463 case ir_binop_less:
1464 case ir_binop_greater:
1465 case ir_binop_lequal:
1466 case ir_binop_gequal:
1467 case ir_binop_equal:
1468 case ir_binop_nequal: {
1469 emit(CMP(result_dst, op[0], op[1],
1470 brw_conditional_for_comparison(ir->operation)));
1471 emit(AND(result_dst, result_src, src_reg(0x1)));
1472 break;
1473 }
1474
1475 case ir_binop_all_equal:
1476 /* "==" operator producing a scalar boolean. */
1477 if (ir->operands[0]->type->is_vector() ||
1478 ir->operands[1]->type->is_vector()) {
1479 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1480 emit(MOV(result_dst, src_reg(0)));
1481 inst = emit(MOV(result_dst, src_reg(1)));
1482 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1483 } else {
1484 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1485 emit(AND(result_dst, result_src, src_reg(0x1)));
1486 }
1487 break;
1488 case ir_binop_any_nequal:
1489 /* "!=" operator producing a scalar boolean. */
1490 if (ir->operands[0]->type->is_vector() ||
1491 ir->operands[1]->type->is_vector()) {
1492 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1493
1494 emit(MOV(result_dst, src_reg(0)));
1495 inst = emit(MOV(result_dst, src_reg(1)));
1496 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1497 } else {
1498 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1499 emit(AND(result_dst, result_src, src_reg(0x1)));
1500 }
1501 break;
1502
1503 case ir_unop_any:
1504 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1505 emit(MOV(result_dst, src_reg(0)));
1506
1507 inst = emit(MOV(result_dst, src_reg(1)));
1508 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1509 break;
1510
1511 case ir_binop_logic_xor:
1512 emit(XOR(result_dst, op[0], op[1]));
1513 break;
1514
1515 case ir_binop_logic_or:
1516 emit(OR(result_dst, op[0], op[1]));
1517 break;
1518
1519 case ir_binop_logic_and:
1520 emit(AND(result_dst, op[0], op[1]));
1521 break;
1522
1523 case ir_binop_dot:
1524 assert(ir->operands[0]->type->is_vector());
1525 assert(ir->operands[0]->type == ir->operands[1]->type);
1526 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1527 break;
1528
1529 case ir_unop_sqrt:
1530 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1531 break;
1532 case ir_unop_rsq:
1533 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1534 break;
1535
1536 case ir_unop_bitcast_i2f:
1537 case ir_unop_bitcast_u2f:
1538 this->result = op[0];
1539 this->result.type = BRW_REGISTER_TYPE_F;
1540 break;
1541
1542 case ir_unop_bitcast_f2i:
1543 this->result = op[0];
1544 this->result.type = BRW_REGISTER_TYPE_D;
1545 break;
1546
1547 case ir_unop_bitcast_f2u:
1548 this->result = op[0];
1549 this->result.type = BRW_REGISTER_TYPE_UD;
1550 break;
1551
1552 case ir_unop_i2f:
1553 case ir_unop_i2u:
1554 case ir_unop_u2i:
1555 case ir_unop_u2f:
1556 case ir_unop_b2f:
1557 case ir_unop_b2i:
1558 case ir_unop_f2i:
1559 case ir_unop_f2u:
1560 emit(MOV(result_dst, op[0]));
1561 break;
1562 case ir_unop_f2b:
1563 case ir_unop_i2b: {
1564 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1565 emit(AND(result_dst, result_src, src_reg(1)));
1566 break;
1567 }
1568
1569 case ir_unop_trunc:
1570 emit(RNDZ(result_dst, op[0]));
1571 break;
1572 case ir_unop_ceil:
1573 op[0].negate = !op[0].negate;
1574 inst = emit(RNDD(result_dst, op[0]));
1575 this->result.negate = true;
1576 break;
1577 case ir_unop_floor:
1578 inst = emit(RNDD(result_dst, op[0]));
1579 break;
1580 case ir_unop_fract:
1581 inst = emit(FRC(result_dst, op[0]));
1582 break;
1583 case ir_unop_round_even:
1584 emit(RNDE(result_dst, op[0]));
1585 break;
1586
1587 case ir_binop_min:
1588 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1589 break;
1590 case ir_binop_max:
1591 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1592 break;
1593
1594 case ir_binop_pow:
1595 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1596 break;
1597
1598 case ir_unop_bit_not:
1599 inst = emit(NOT(result_dst, op[0]));
1600 break;
1601 case ir_binop_bit_and:
1602 inst = emit(AND(result_dst, op[0], op[1]));
1603 break;
1604 case ir_binop_bit_xor:
1605 inst = emit(XOR(result_dst, op[0], op[1]));
1606 break;
1607 case ir_binop_bit_or:
1608 inst = emit(OR(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_lshift:
1612 inst = emit(SHL(result_dst, op[0], op[1]));
1613 break;
1614
1615 case ir_binop_rshift:
1616 if (ir->type->base_type == GLSL_TYPE_INT)
1617 inst = emit(ASR(result_dst, op[0], op[1]));
1618 else
1619 inst = emit(SHR(result_dst, op[0], op[1]));
1620 break;
1621
1622 case ir_binop_bfm:
1623 emit(BFI1(result_dst, op[0], op[1]));
1624 break;
1625
1626 case ir_binop_ubo_load: {
1627 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1628 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1629 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1630 src_reg offset;
1631
1632 /* Now, load the vector from that offset. */
1633 assert(ir->type->is_vector() || ir->type->is_scalar());
1634
1635 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1636 packed_consts.type = result.type;
1637 src_reg surf_index;
1638
1639 if (const_uniform_block) {
1640 /* The block index is a constant, so just emit the binding table entry
1641 * as an immediate.
1642 */
1643 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1644 const_uniform_block->value.u[0]);
1645 } else {
1646 /* The block index is not a constant. Evaluate the index expression
1647 * per-channel and add the base UBO index; the generator will select
1648 * a value from any live channel.
1649 */
1650 surf_index = src_reg(this, glsl_type::uint_type);
1651 emit(ADD(dst_reg(surf_index), op[0],
1652 src_reg(prog_data->base.binding_table.ubo_start)));
1653
1654 /* Assume this may touch any UBO. It would be nice to provide
1655 * a tighter bound, but the array information is already lowered away.
1656 */
1657 brw_mark_surface_used(&prog_data->base,
1658 prog_data->base.binding_table.ubo_start +
1659 shader_prog->NumUniformBlocks - 1);
1660 }
1661
1662 if (const_offset_ir) {
1663 if (brw->gen >= 8) {
1664 /* Store the offset in a GRF so we can send-from-GRF. */
1665 offset = src_reg(this, glsl_type::int_type);
1666 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1667 } else {
1668 /* Immediates are fine on older generations since they'll be moved
1669 * to a (potentially fake) MRF at the generator level.
1670 */
1671 offset = src_reg(const_offset / 16);
1672 }
1673 } else {
1674 offset = src_reg(this, glsl_type::uint_type);
1675 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1676 }
1677
1678 if (brw->gen >= 7) {
1679 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1680 grf_offset.type = offset.type;
1681
1682 emit(MOV(grf_offset, offset));
1683
1684 emit(new(mem_ctx) vec4_instruction(this,
1685 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1686 dst_reg(packed_consts),
1687 surf_index,
1688 src_reg(grf_offset)));
1689 } else {
1690 vec4_instruction *pull =
1691 emit(new(mem_ctx) vec4_instruction(this,
1692 VS_OPCODE_PULL_CONSTANT_LOAD,
1693 dst_reg(packed_consts),
1694 surf_index,
1695 offset));
1696 pull->base_mrf = 14;
1697 pull->mlen = 1;
1698 }
1699
1700 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1701 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1702 const_offset % 16 / 4,
1703 const_offset % 16 / 4,
1704 const_offset % 16 / 4);
1705
1706 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1707 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1708 emit(CMP(result_dst, packed_consts, src_reg(0u),
1709 BRW_CONDITIONAL_NZ));
1710 emit(AND(result_dst, result, src_reg(0x1)));
1711 } else {
1712 emit(MOV(result_dst, packed_consts));
1713 }
1714 break;
1715 }
1716
1717 case ir_binop_vector_extract:
1718 unreachable("should have been lowered by vec_index_to_cond_assign");
1719
1720 case ir_triop_fma:
1721 op[0] = fix_3src_operand(op[0]);
1722 op[1] = fix_3src_operand(op[1]);
1723 op[2] = fix_3src_operand(op[2]);
1724 /* Note that the instruction's argument order is reversed from GLSL
1725 * and the IR.
1726 */
1727 emit(MAD(result_dst, op[2], op[1], op[0]));
1728 break;
1729
1730 case ir_triop_lrp:
1731 emit_lrp(result_dst, op[0], op[1], op[2]);
1732 break;
1733
1734 case ir_triop_csel:
1735 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1736 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1737 inst->predicate = BRW_PREDICATE_NORMAL;
1738 break;
1739
1740 case ir_triop_bfi:
1741 op[0] = fix_3src_operand(op[0]);
1742 op[1] = fix_3src_operand(op[1]);
1743 op[2] = fix_3src_operand(op[2]);
1744 emit(BFI2(result_dst, op[0], op[1], op[2]));
1745 break;
1746
1747 case ir_triop_bitfield_extract:
1748 op[0] = fix_3src_operand(op[0]);
1749 op[1] = fix_3src_operand(op[1]);
1750 op[2] = fix_3src_operand(op[2]);
1751 /* Note that the instruction's argument order is reversed from GLSL
1752 * and the IR.
1753 */
1754 emit(BFE(result_dst, op[2], op[1], op[0]));
1755 break;
1756
1757 case ir_triop_vector_insert:
1758 unreachable("should have been lowered by lower_vector_insert");
1759
1760 case ir_quadop_bitfield_insert:
1761 unreachable("not reached: should be handled by "
1762 "bitfield_insert_to_bfm_bfi\n");
1763
1764 case ir_quadop_vector:
1765 unreachable("not reached: should be handled by lower_quadop_vector");
1766
1767 case ir_unop_pack_half_2x16:
1768 emit_pack_half_2x16(result_dst, op[0]);
1769 break;
1770 case ir_unop_unpack_half_2x16:
1771 emit_unpack_half_2x16(result_dst, op[0]);
1772 break;
1773 case ir_unop_pack_snorm_2x16:
1774 case ir_unop_pack_snorm_4x8:
1775 case ir_unop_pack_unorm_2x16:
1776 case ir_unop_pack_unorm_4x8:
1777 case ir_unop_unpack_snorm_2x16:
1778 case ir_unop_unpack_snorm_4x8:
1779 case ir_unop_unpack_unorm_2x16:
1780 case ir_unop_unpack_unorm_4x8:
1781 unreachable("not reached: should be handled by lower_packing_builtins");
1782 case ir_unop_unpack_half_2x16_split_x:
1783 case ir_unop_unpack_half_2x16_split_y:
1784 case ir_binop_pack_half_2x16_split:
1785 case ir_unop_interpolate_at_centroid:
1786 case ir_binop_interpolate_at_sample:
1787 case ir_binop_interpolate_at_offset:
1788 unreachable("not reached: should not occur in vertex shader");
1789 case ir_binop_ldexp:
1790 unreachable("not reached: should be handled by ldexp_to_arith()");
1791 }
1792 }
1793
1794
1795 void
1796 vec4_visitor::visit(ir_swizzle *ir)
1797 {
1798 src_reg src;
1799 int i = 0;
1800 int swizzle[4];
1801
1802 /* Note that this is only swizzles in expressions, not those on the left
1803 * hand side of an assignment, which do write masking. See ir_assignment
1804 * for that.
1805 */
1806
1807 ir->val->accept(this);
1808 src = this->result;
1809 assert(src.file != BAD_FILE);
1810
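/* The new swizzle is composed with whatever swizzle src already carries.
 * For example, if src already reads .wzyx and this ir is a .zy swizzle,
 * the picked channels are (w z y x)[2] = y and (w z y x)[1] = z, and the
 * last one is replicated, giving a final swizzle of .yzzz.
 */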
1811 for (i = 0; i < ir->type->vector_elements; i++) {
1812 switch (i) {
1813 case 0:
1814 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1815 break;
1816 case 1:
1817 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1818 break;
1819 case 2:
1820 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1821 break;
1822 case 3:
1823 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1824 break;
1825 }
1826 }
1827 for (; i < 4; i++) {
1828 /* Replicate the last channel out. */
1829 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1830 }
1831
1832 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1833
1834 this->result = src;
1835 }
1836
1837 void
1838 vec4_visitor::visit(ir_dereference_variable *ir)
1839 {
1840 const struct glsl_type *type = ir->type;
1841 dst_reg *reg = variable_storage(ir->var);
1842
1843 if (!reg) {
1844 fail("Failed to find variable storage for %s\n", ir->var->name);
1845 this->result = src_reg(brw_null_reg());
1846 return;
1847 }
1848
1849 this->result = src_reg(*reg);
1850
1851 /* System values get their swizzle from the dst_reg writemask */
1852 if (ir->var->data.mode == ir_var_system_value)
1853 return;
1854
1855 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1856 this->result.swizzle = swizzle_for_size(type->vector_elements);
1857 }
1858
1859
1860 int
1861 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1862 {
1863 /* Under normal circumstances array elements are stored consecutively, so
1864 * the stride is equal to the size of the array element.
1865 */
1866 return type_size(ir->type);
1867 }
1868
1869
1870 void
1871 vec4_visitor::visit(ir_dereference_array *ir)
1872 {
1873 ir_constant *constant_index;
1874 src_reg src;
1875 int array_stride = compute_array_stride(ir);
1876
1877 constant_index = ir->array_index->constant_expression_value();
1878
1879 ir->array->accept(this);
1880 src = this->result;
1881
1882 if (constant_index) {
1883 src.reg_offset += constant_index->value.i[0] * array_stride;
1884 } else {
1885 /* Variable index array dereference. It eats the "vec4" of the
1886 * base of the array and an index that offsets the Mesa register
1887 * index.
1888 */
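/* For example, for arr[i] where each element occupies two vec4 slots
 * (array_stride == 2), index_reg ends up holding i * 2 and is attached to
 * src as a reladdr, so later passes can lower the access (e.g. to scratch
 * or pull-constant reads).
 */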
1889 ir->array_index->accept(this);
1890
1891 src_reg index_reg;
1892
1893 if (array_stride == 1) {
1894 index_reg = this->result;
1895 } else {
1896 index_reg = src_reg(this, glsl_type::int_type);
1897
1898 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1899 }
1900
1901 if (src.reladdr) {
1902 src_reg temp = src_reg(this, glsl_type::int_type);
1903
1904 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1905
1906 index_reg = temp;
1907 }
1908
1909 src.reladdr = ralloc(mem_ctx, src_reg);
1910 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1911 }
1912
1913 /* If the type is smaller than a vec4, replicate the last channel out. */
1914 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1915 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1916 else
1917 src.swizzle = BRW_SWIZZLE_NOOP;
1918 src.type = brw_type_for_base_type(ir->type);
1919
1920 this->result = src;
1921 }
1922
1923 void
1924 vec4_visitor::visit(ir_dereference_record *ir)
1925 {
1926 unsigned int i;
1927 const glsl_type *struct_type = ir->record->type;
1928 int offset = 0;
1929
1930 ir->record->accept(this);
1931
1932 for (i = 0; i < struct_type->length; i++) {
1933 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1934 break;
1935 offset += type_size(struct_type->fields.structure[i].type);
1936 }
1937
1938 /* If the type is smaller than a vec4, replicate the last channel out. */
1939 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1940 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1941 else
1942 this->result.swizzle = BRW_SWIZZLE_NOOP;
1943 this->result.type = brw_type_for_base_type(ir->type);
1944
1945 this->result.reg_offset += offset;
1946 }
1947
1948 /**
1949 * We want to be careful in assignment setup to hit the actual storage
1950 * instead of potentially using a temporary like we might with the
1951 * ir_dereference handler.
1952 */
1953 static dst_reg
1954 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1955 {
1956 /* The LHS must be a dereference. If the LHS is a variable indexed array
1957 * access of a vector, it must be separated into a series of conditional moves
1958 * before reaching this point (see ir_vec_index_to_cond_assign).
1959 */
1960 assert(ir->as_dereference());
1961 ir_dereference_array *deref_array = ir->as_dereference_array();
1962 if (deref_array) {
1963 assert(!deref_array->array->type->is_vector());
1964 }
1965
1966 /* Use the rvalue deref handler for the most part. We'll ignore
1967 * swizzles in it and write swizzles using writemask, though.
1968 */
1969 ir->accept(v);
1970 return dst_reg(v->result);
1971 }
1972
1973 void
1974 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1975 const struct glsl_type *type,
1976 enum brw_predicate predicate)
1977 {
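/* Descriptive summary: recursively decompose structs, arrays and matrices
 * into per-vec4 MOVs, advancing dst and src by one register per
 * scalar/vector copied; `predicate` is applied to every MOV emitted.
 */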
1978 if (type->base_type == GLSL_TYPE_STRUCT) {
1979 for (unsigned int i = 0; i < type->length; i++) {
1980 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1981 }
1982 return;
1983 }
1984
1985 if (type->is_array()) {
1986 for (unsigned int i = 0; i < type->length; i++) {
1987 emit_block_move(dst, src, type->fields.array, predicate);
1988 }
1989 return;
1990 }
1991
1992 if (type->is_matrix()) {
1993 const struct glsl_type *vec_type;
1994
1995 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1996 type->vector_elements, 1);
1997
1998 for (int i = 0; i < type->matrix_columns; i++) {
1999 emit_block_move(dst, src, vec_type, predicate);
2000 }
2001 return;
2002 }
2003
2004 assert(type->is_scalar() || type->is_vector());
2005
2006 dst->type = brw_type_for_base_type(type);
2007 src->type = dst->type;
2008
2009 dst->writemask = (1 << type->vector_elements) - 1;
2010
2011 src->swizzle = swizzle_for_size(type->vector_elements);
2012
2013 vec4_instruction *inst = emit(MOV(*dst, *src));
2014 inst->predicate = predicate;
2015
2016 dst->reg_offset++;
2017 src->reg_offset++;
2018 }
2019
2020
2021 /* If the RHS processing resulted in an instruction generating a
2022 * temporary value, and it would be easy to rewrite the instruction to
2023 * generate its result right into the LHS instead, do so. This ends
2024 * up reliably removing instructions where it can be tricky to do so
2025 * later without real UD chain information.
2026 */
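/* Illustrative case (hypothetical names): for `v.xyz = a + b;`, the ADD that
 * produced the RHS temporary can have its destination retargeted to v and its
 * writemask narrowed to .xyz, so the trailing MOV never has to be emitted.
 */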
2027 bool
2028 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2029 dst_reg dst,
2030 src_reg src,
2031 vec4_instruction *pre_rhs_inst,
2032 vec4_instruction *last_rhs_inst)
2033 {
2034 /* This could be supported, but it would take more smarts. */
2035 if (ir->condition)
2036 return false;
2037
2038 if (pre_rhs_inst == last_rhs_inst)
2039 return false; /* No instructions generated to work with. */
2040
2041 /* Make sure the last instruction generated our source reg. */
2042 if (src.file != GRF ||
2043 src.file != last_rhs_inst->dst.file ||
2044 src.reg != last_rhs_inst->dst.reg ||
2045 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2046 src.reladdr ||
2047 src.abs ||
2048 src.negate ||
2049 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2050 return false;
2051
2052 /* Check that the last instruction fully initialized the channels
2053 * we want to use, in the order we want to use them. We could
2054 * potentially reswizzle the operands of many instructions so that
2055 * we could handle out of order channels, but don't yet.
2056 */
2057
2058 for (unsigned i = 0; i < 4; i++) {
2059 if (dst.writemask & (1 << i)) {
2060 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2061 return false;
2062
2063 if (BRW_GET_SWZ(src.swizzle, i) != i)
2064 return false;
2065 }
2066 }
2067
2068 /* Success! Rewrite the instruction. */
2069 last_rhs_inst->dst.file = dst.file;
2070 last_rhs_inst->dst.reg = dst.reg;
2071 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2072 last_rhs_inst->dst.reladdr = dst.reladdr;
2073 last_rhs_inst->dst.writemask &= dst.writemask;
2074
2075 return true;
2076 }
2077
2078 void
2079 vec4_visitor::visit(ir_assignment *ir)
2080 {
2081 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2082 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2083
2084 if (!ir->lhs->type->is_scalar() &&
2085 !ir->lhs->type->is_vector()) {
2086 ir->rhs->accept(this);
2087 src_reg src = this->result;
2088
2089 if (ir->condition) {
2090 emit_bool_to_cond_code(ir->condition, &predicate);
2091 }
2092
2093 /* emit_block_move doesn't account for swizzles in the source register.
2094 * This should be ok, since the source register is a structure or an
2095 * array, and those can't be swizzled. But double-check to be sure.
2096 */
2097 assert(src.swizzle ==
2098 (ir->rhs->type->is_matrix()
2099 ? swizzle_for_size(ir->rhs->type->vector_elements)
2100 : BRW_SWIZZLE_NOOP));
2101
2102 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2103 return;
2104 }
2105
2106 /* Now we're down to just a scalar/vector with writemasks. */
2107 int i;
2108
2109 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2110 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2111
2112 ir->rhs->accept(this);
2113
2114 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2115
2116 src_reg src = this->result;
2117
2118 int swizzles[4];
2119 int first_enabled_chan = 0;
2120 int src_chan = 0;
2121
2122 assert(ir->lhs->type->is_vector() ||
2123 ir->lhs->type->is_scalar());
2124 dst.writemask = ir->write_mask;
2125
2126 for (int i = 0; i < 4; i++) {
2127 if (dst.writemask & (1 << i)) {
2128 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2129 break;
2130 }
2131 }
2132
2133 /* Swizzle a small RHS vector into the channels being written.
2134 *
2135 * GLSL IR treats write_mask as dictating how many channels are
2136 * present on the RHS, while in our instructions we need to make
2137 * those channels appear in the slots of the vec4 they're written to.
2138 */
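/* Illustrative: `v.yz = some_vec2;` ends up reading the RHS with roughly a
 * .yxyy swizzle, so the RHS's x lands in the y slot and its y in the z slot;
 * the unwritten slots are padded with the first enabled channel.
 */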
2139 for (int i = 0; i < 4; i++) {
2140 if (dst.writemask & (1 << i))
2141 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2142 else
2143 swizzles[i] = first_enabled_chan;
2144 }
2145 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2146 swizzles[2], swizzles[3]);
2147
2148 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2149 return;
2150 }
2151
2152 if (ir->condition) {
2153 emit_bool_to_cond_code(ir->condition, &predicate);
2154 }
2155
2156 for (i = 0; i < type_size(ir->lhs->type); i++) {
2157 vec4_instruction *inst = emit(MOV(dst, src));
2158 inst->predicate = predicate;
2159
2160 dst.reg_offset++;
2161 src.reg_offset++;
2162 }
2163 }
2164
2165 void
2166 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2167 {
2168 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2169 foreach_in_list(ir_constant, field_value, &ir->components) {
2170 emit_constant_values(dst, field_value);
2171 }
2172 return;
2173 }
2174
2175 if (ir->type->is_array()) {
2176 for (unsigned int i = 0; i < ir->type->length; i++) {
2177 emit_constant_values(dst, ir->array_elements[i]);
2178 }
2179 return;
2180 }
2181
2182 if (ir->type->is_matrix()) {
2183 for (int i = 0; i < ir->type->matrix_columns; i++) {
2184 float *vec = &ir->value.f[i * ir->type->vector_elements];
2185
2186 for (int j = 0; j < ir->type->vector_elements; j++) {
2187 dst->writemask = 1 << j;
2188 dst->type = BRW_REGISTER_TYPE_F;
2189
2190 emit(MOV(*dst, src_reg(vec[j])));
2191 }
2192 dst->reg_offset++;
2193 }
2194 return;
2195 }
2196
2197 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2198
2199 for (int i = 0; i < ir->type->vector_elements; i++) {
2200 if (!(remaining_writemask & (1 << i)))
2201 continue;
2202
2203 dst->writemask = 1 << i;
2204 dst->type = brw_type_for_base_type(ir->type);
2205
2206 /* Find other components that match the one we're about to
2207 * write. Emits fewer instructions for things like vec4(0.5,
2208 * 1.5, 1.5, 1.5).
2209 */
2210 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2211 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2212 if (ir->value.b[i] == ir->value.b[j])
2213 dst->writemask |= (1 << j);
2214 } else {
2215 /* u, i, and f storage all line up, so no need for a
2216 * switch case for comparing each type.
2217 */
2218 if (ir->value.u[i] == ir->value.u[j])
2219 dst->writemask |= (1 << j);
2220 }
2221 }
2222
2223 switch (ir->type->base_type) {
2224 case GLSL_TYPE_FLOAT:
2225 emit(MOV(*dst, src_reg(ir->value.f[i])));
2226 break;
2227 case GLSL_TYPE_INT:
2228 emit(MOV(*dst, src_reg(ir->value.i[i])));
2229 break;
2230 case GLSL_TYPE_UINT:
2231 emit(MOV(*dst, src_reg(ir->value.u[i])));
2232 break;
2233 case GLSL_TYPE_BOOL:
2234 emit(MOV(*dst, src_reg(ir->value.b[i])));
2235 break;
2236 default:
2237 unreachable("Non-float/uint/int/bool constant");
2238 }
2239
2240 remaining_writemask &= ~dst->writemask;
2241 }
2242 dst->reg_offset++;
2243 }
2244
2245 void
2246 vec4_visitor::visit(ir_constant *ir)
2247 {
2248 dst_reg dst = dst_reg(this, ir->type);
2249 this->result = src_reg(dst);
2250
2251 emit_constant_values(&dst, ir);
2252 }
2253
2254 void
2255 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2256 {
2257 ir_dereference *deref = static_cast<ir_dereference *>(
2258 ir->actual_parameters.get_head());
2259 ir_variable *location = deref->variable_referenced();
2260 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2261 location->data.atomic.buffer_index);
2262
2263 /* Calculate the surface offset */
2264 src_reg offset(this, glsl_type::uint_type);
2265 ir_dereference_array *deref_array = deref->as_dereference_array();
2266 if (deref_array) {
2267 deref_array->array_index->accept(this);
2268
2269 src_reg tmp(this, glsl_type::uint_type);
2270 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2271 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2272 } else {
2273 offset = location->data.atomic.offset;
2274 }
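/* Illustrative (hypothetical declaration): for
 * `layout(binding = 0, offset = 4) uniform atomic_uint c[4];`, accessing c[i]
 * yields offset = i * ATOMIC_COUNTER_SIZE + 4 bytes into the buffer.
 */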
2275
2276 /* Emit the appropriate machine instruction */
2277 const char *callee = ir->callee->function_name();
2278 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2279
2280 if (!strcmp("__intrinsic_atomic_read", callee)) {
2281 emit_untyped_surface_read(surf_index, dst, offset);
2282
2283 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2284 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286
2287 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2288 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2289 src_reg(), src_reg());
2290 }
2291 }
2292
2293 void
2294 vec4_visitor::visit(ir_call *ir)
2295 {
2296 const char *callee = ir->callee->function_name();
2297
2298 if (!strcmp("__intrinsic_atomic_read", callee) ||
2299 !strcmp("__intrinsic_atomic_increment", callee) ||
2300 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2301 visit_atomic_counter_intrinsic(ir);
2302 } else {
2303 unreachable("Unsupported intrinsic.");
2304 }
2305 }
2306
2307 src_reg
2308 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2309 {
2310 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2311 inst->base_mrf = 2;
2312 inst->mlen = 1;
2313 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2314 inst->dst.writemask = WRITEMASK_XYZW;
2315
2316 inst->src[1] = sampler;
2317
2318 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2319 int param_base = inst->base_mrf;
2320 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2321 int zero_mask = 0xf & ~coord_mask;
2322
2323 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2324 coordinate));
2325
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2327 src_reg(0)));
2328
2329 emit(inst);
2330 return src_reg(inst->dst);
2331 }
2332
2333 static bool
2334 is_high_sampler(struct brw_context *brw, src_reg sampler)
2335 {
2336 if (brw->gen < 8 && !brw->is_haswell)
2337 return false;
2338
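/* Descriptive note (my reading of the message descriptor layout): a sampler
 * index of 16 or more, or one not known at compile time, cannot be encoded
 * in the 4-bit sampler field of the SEND descriptor, so the generator has to
 * supply it via the message header instead; visit(ir_texture) uses this to
 * force header_present.
 */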
2339 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2340 }
2341
2342 void
2343 vec4_visitor::visit(ir_texture *ir)
2344 {
2345 uint32_t sampler =
2346 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2347
2348 ir_rvalue *nonconst_sampler_index =
2349 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2350
2351 /* Handle non-constant sampler array indexing */
2352 src_reg sampler_reg;
2353 if (nonconst_sampler_index) {
2354 /* The highest sampler which may be used by this operation is
2355 * the last element of the array. Mark it here, because the generator
2356 * doesn't have enough information to determine the bound.
2357 */
2358 uint32_t array_size = ir->sampler->as_dereference_array()
2359 ->array->type->array_size();
2360
2361 uint32_t max_used = sampler + array_size - 1;
2362 if (ir->op == ir_tg4 && brw->gen < 8) {
2363 max_used += prog_data->base.binding_table.gather_texture_start;
2364 } else {
2365 max_used += prog_data->base.binding_table.texture_start;
2366 }
2367
2368 brw_mark_surface_used(&prog_data->base, max_used);
2369
2370 /* Emit code to evaluate the actual indexing expression */
2371 nonconst_sampler_index->accept(this);
2372 dst_reg temp(this, glsl_type::uint_type);
2373 emit(ADD(temp, this->result, src_reg(sampler)))
2374 ->force_writemask_all = true;
2375 sampler_reg = src_reg(temp);
2376 } else {
2377 /* Single sampler, or constant array index; the indexing expression
2378 * is just an immediate.
2379 */
2380 sampler_reg = src_reg(sampler);
2381 }
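/* Illustrative GLSL (hypothetical names): `texture(samplers[idx], tc)` in a
 * vertex shader evaluates `idx` above, ADDs it to the base sampler number,
 * and passes the resulting register as the sampler source of the sample
 * message; brw_mark_surface_used() conservatively marks every surface the
 * array could reach.
 */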
2382
2383 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2384 * emitting anything other than setting up the constant result.
2385 */
2386 if (ir->op == ir_tg4) {
2387 ir_constant *chan = ir->lod_info.component->as_constant();
2388 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2389 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2390 dst_reg result(this, ir->type);
2391 this->result = src_reg(result);
2392 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2393 return;
2394 }
2395 }
2396
2397 /* Should be lowered by do_lower_texture_projection */
2398 assert(!ir->projector);
2399
2400 /* Should be lowered */
2401 assert(!ir->offset || !ir->offset->type->is_array());
2402
2403 /* Generate code to compute all the subexpression trees. This has to be
2404 * done before loading any values into MRFs for the sampler message since
2405 * generating these values may involve SEND messages that need the MRFs.
2406 */
2407 src_reg coordinate;
2408 if (ir->coordinate) {
2409 ir->coordinate->accept(this);
2410 coordinate = this->result;
2411 }
2412
2413 src_reg shadow_comparitor;
2414 if (ir->shadow_comparitor) {
2415 ir->shadow_comparitor->accept(this);
2416 shadow_comparitor = this->result;
2417 }
2418
2419 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2420 src_reg offset_value;
2421 if (has_nonconstant_offset) {
2422 ir->offset->accept(this);
2423 offset_value = src_reg(this->result);
2424 }
2425
2426 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2427 src_reg lod, dPdx, dPdy, sample_index, mcs;
2428 switch (ir->op) {
2429 case ir_tex:
2430 lod = src_reg(0.0f);
2431 lod_type = glsl_type::float_type;
2432 break;
2433 case ir_txf:
2434 case ir_txl:
2435 case ir_txs:
2436 ir->lod_info.lod->accept(this);
2437 lod = this->result;
2438 lod_type = ir->lod_info.lod->type;
2439 break;
2440 case ir_query_levels:
2441 lod = src_reg(0);
2442 lod_type = glsl_type::int_type;
2443 break;
2444 case ir_txf_ms:
2445 ir->lod_info.sample_index->accept(this);
2446 sample_index = this->result;
2447 sample_index_type = ir->lod_info.sample_index->type;
2448
2449 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2450 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2451 else
2452 mcs = src_reg(0u);
2453 break;
2454 case ir_txd:
2455 ir->lod_info.grad.dPdx->accept(this);
2456 dPdx = this->result;
2457
2458 ir->lod_info.grad.dPdy->accept(this);
2459 dPdy = this->result;
2460
2461 lod_type = ir->lod_info.grad.dPdx->type;
2462 break;
2463 case ir_txb:
2464 case ir_lod:
2465 case ir_tg4:
2466 break;
2467 }
2468
2469 enum opcode opcode;
2470 switch (ir->op) {
2471 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2472 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2473 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2474 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2475 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2476 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2477 case ir_tg4: opcode = has_nonconstant_offset
2478 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2479 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2480 case ir_txb:
2481 unreachable("TXB is not valid for vertex shaders.");
2482 case ir_lod:
2483 unreachable("LOD is not valid for vertex shaders.");
2484 default:
2485 unreachable("Unrecognized tex op");
2486 }
2487
2488 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2489
2490 if (ir->offset != NULL && ir->op != ir_txf)
2491 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2492
2493 /* Stuff the channel select bits in the top of the texture offset */
2494 if (ir->op == ir_tg4)
2495 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2496
2497 /* The message header is necessary for:
2498 * - Gen4 (always)
2499 * - Texel offsets
2500 * - Gather channel selection
2501 * - Sampler indices too large to fit in a 4-bit value.
2502 */
2503 inst->header_present =
2504 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2505 is_high_sampler(brw, sampler_reg);
2506 inst->base_mrf = 2;
2507 inst->mlen = inst->header_present + 1; /* always at least one */
2508 inst->dst = dst_reg(this, ir->type);
2509 inst->dst.writemask = WRITEMASK_XYZW;
2510 inst->shadow_compare = ir->shadow_comparitor != NULL;
2511
2512 inst->src[1] = sampler_reg;
2513
2514 /* MRF for the first parameter */
2515 int param_base = inst->base_mrf + inst->header_present;
2516
2517 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2518 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2519 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2520 } else {
2521 /* Load the coordinate */
2522 /* FINISHME: gl_clamp_mask and saturate */
2523 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2524 int zero_mask = 0xf & ~coord_mask;
2525
2526 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2527 coordinate));
2528
2529 if (zero_mask != 0) {
2530 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2531 src_reg(0)));
2532 }
2533 /* Load the shadow comparitor */
2534 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2535 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2536 WRITEMASK_X),
2537 shadow_comparitor));
2538 inst->mlen++;
2539 }
2540
2541 /* Load the LOD info */
2542 if (ir->op == ir_tex || ir->op == ir_txl) {
2543 int mrf, writemask;
2544 if (brw->gen >= 5) {
2545 mrf = param_base + 1;
2546 if (ir->shadow_comparitor) {
2547 writemask = WRITEMASK_Y;
2548 /* mlen already incremented */
2549 } else {
2550 writemask = WRITEMASK_X;
2551 inst->mlen++;
2552 }
2553 } else /* brw->gen == 4 */ {
2554 mrf = param_base;
2555 writemask = WRITEMASK_W;
2556 }
2557 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2558 } else if (ir->op == ir_txf) {
2559 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2560 } else if (ir->op == ir_txf_ms) {
2561 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2562 sample_index));
2563 if (brw->gen >= 7) {
2564 /* MCS data is in the first channel of `mcs`, but we need to get it into
2565 * the .y channel of the second vec4 of params, so replicate .x across
2566 * the whole vec4 and then mask off everything except .y.
2567 */
2568 mcs.swizzle = BRW_SWIZZLE_XXXX;
2569 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2570 mcs));
}
2571 inst->mlen++;
2572 } else if (ir->op == ir_txd) {
2573 const glsl_type *type = lod_type;
2574
2575 if (brw->gen >= 5) {
2576 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2577 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2578 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2579 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2580 inst->mlen++;
2581
2582 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2583 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2584 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2585 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2586 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2587 inst->mlen++;
2588
2589 if (ir->shadow_comparitor) {
2590 emit(MOV(dst_reg(MRF, param_base + 2,
2591 ir->shadow_comparitor->type, WRITEMASK_Z),
2592 shadow_comparitor));
2593 }
2594 }
2595 } else /* brw->gen == 4 */ {
2596 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2597 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2598 inst->mlen += 2;
2599 }
2600 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2601 if (ir->shadow_comparitor) {
2602 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2603 shadow_comparitor));
2604 }
2605
2606 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2607 offset_value));
2608 inst->mlen++;
2609 }
2610 }
2611
2612 emit(inst);
2613
2614 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2615 * faces * layers, but the spec requires just layers.
2616 */
2617 if (ir->op == ir_txs) {
2618 glsl_type const *type = ir->sampler->type;
2619 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2620 type->sampler_array) {
2621 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2622 writemask(inst->dst, WRITEMASK_Z),
2623 src_reg(inst->dst), src_reg(6));
2624 }
2625 }
2626
2627 if (brw->gen == 6 && ir->op == ir_tg4) {
2628 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2629 }
2630
2631 swizzle_result(ir, src_reg(inst->dst), sampler);
2632 }
2633
2634 /**
2635 * Apply workarounds for Gen6 gather with UINT/SINT
2636 */
2637 void
2638 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2639 {
2640 if (!wa)
2641 return;
2642
2643 int width = (wa & WA_8BIT) ? 8 : 16;
2644 dst_reg dst_f = dst;
2645 dst_f.type = BRW_REGISTER_TYPE_F;
2646
2647 /* Convert from UNORM to UINT */
2648 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2649 emit(MOV(dst, src_reg(dst_f)));
2650
2651 if (wa & WA_SIGN) {
2652 /* Reinterpret the UINT value as a signed INT value by
2653 * shifting the sign bit into place, then shifting back
2654 * preserving sign.
2655 */
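/* E.g. for an 8-bit format this is a shift left by 24 followed by an
 * arithmetic shift right by 24, sign-extending bits 7:0 (illustrative).
 */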
2656 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2657 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2658 }
2659 }
2660
2661 /**
2662 * Set up the gather channel based on the swizzle, for gather4.
2663 */
2664 uint32_t
2665 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2666 {
2667 ir_constant *chan = ir->lod_info.component->as_constant();
2668 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2669 switch (swiz) {
2670 case SWIZZLE_X: return 0;
2671 case SWIZZLE_Y:
2672 /* gather4 sampler is broken for green channel on RG32F --
2673 * we must ask for blue instead.
2674 */
2675 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2676 return 2;
2677 return 1;
2678 case SWIZZLE_Z: return 2;
2679 case SWIZZLE_W: return 3;
2680 default:
2681 unreachable("Not reached"); /* zero, one swizzles handled already */
2682 }
2683 }
2684
2685 void
2686 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2687 {
2688 int s = key->tex.swizzles[sampler];
2689
2690 this->result = src_reg(this, ir->type);
2691 dst_reg swizzled_result(this->result);
2692
2693 if (ir->op == ir_query_levels) {
2694 /* # levels is in .w */
2695 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2696 emit(MOV(swizzled_result, orig_val));
2697 return;
2698 }
2699
2700 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2701 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2702 emit(MOV(swizzled_result, orig_val));
2703 return;
2704 }
2705
2706
2707 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2708 int swizzle[4] = {0};
2709
2710 for (int i = 0; i < 4; i++) {
2711 switch (GET_SWZ(s, i)) {
2712 case SWIZZLE_ZERO:
2713 zero_mask |= (1 << i);
2714 break;
2715 case SWIZZLE_ONE:
2716 one_mask |= (1 << i);
2717 break;
2718 default:
2719 copy_mask |= (1 << i);
2720 swizzle[i] = GET_SWZ(s, i);
2721 break;
2722 }
2723 }
2724
2725 if (copy_mask) {
2726 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2727 swizzled_result.writemask = copy_mask;
2728 emit(MOV(swizzled_result, orig_val));
2729 }
2730
2731 if (zero_mask) {
2732 swizzled_result.writemask = zero_mask;
2733 emit(MOV(swizzled_result, src_reg(0.0f)));
2734 }
2735
2736 if (one_mask) {
2737 swizzled_result.writemask = one_mask;
2738 emit(MOV(swizzled_result, src_reg(1.0f)));
2739 }
2740 }
2741
2742 void
2743 vec4_visitor::visit(ir_return *)
2744 {
2745 unreachable("not reached");
2746 }
2747
2748 void
2749 vec4_visitor::visit(ir_discard *)
2750 {
2751 unreachable("not reached");
2752 }
2753
2754 void
2755 vec4_visitor::visit(ir_if *ir)
2756 {
2757 /* Don't point the annotation at the if statement, because then it plus
2758 * the then and else blocks get printed.
2759 */
2760 this->base_ir = ir->condition;
2761
2762 if (brw->gen == 6) {
2763 emit_if_gen6(ir);
2764 } else {
2765 enum brw_predicate predicate;
2766 emit_bool_to_cond_code(ir->condition, &predicate);
2767 emit(IF(predicate));
2768 }
2769
2770 visit_instructions(&ir->then_instructions);
2771
2772 if (!ir->else_instructions.is_empty()) {
2773 this->base_ir = ir->condition;
2774 emit(BRW_OPCODE_ELSE);
2775
2776 visit_instructions(&ir->else_instructions);
2777 }
2778
2779 this->base_ir = ir->condition;
2780 emit(BRW_OPCODE_ENDIF);
2781 }
2782
2783 void
2784 vec4_visitor::visit(ir_emit_vertex *)
2785 {
2786 unreachable("not reached");
2787 }
2788
2789 void
2790 vec4_visitor::visit(ir_end_primitive *)
2791 {
2792 unreachable("not reached");
2793 }
2794
2795 void
2796 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2797 dst_reg dst, src_reg offset,
2798 src_reg src0, src_reg src1)
2799 {
2800 unsigned mlen = 0;
2801
2802 /* Set the atomic operation offset. */
2803 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2804 mlen++;
2805
2806 /* Set the atomic operation arguments. */
2807 if (src0.file != BAD_FILE) {
2808 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2809 mlen++;
2810 }
2811
2812 if (src1.file != BAD_FILE) {
2813 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2814 mlen++;
2815 }
2816
2817 /* Emit the instruction. Note that this maps to the normal SIMD8
2818 * untyped atomic message on Ivy Bridge, but that's OK because
2819 * unused channels will be masked out.
2820 */
2821 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2822 src_reg(atomic_op), src_reg(surf_index));
2823 inst->base_mrf = 0;
2824 inst->mlen = mlen;
2825 }
2826
2827 void
2828 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2829 src_reg offset)
2830 {
2831 /* Set the surface read offset. */
2832 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2833
2834 /* Emit the instruction. Note that this maps to the normal SIMD8
2835 * untyped surface read message, but that's OK because unused
2836 * channels will be masked out.
2837 */
2838 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2839 dst, src_reg(surf_index));
2840 inst->base_mrf = 0;
2841 inst->mlen = 1;
2842 }
2843
2844 void
2845 vec4_visitor::emit_ndc_computation()
2846 {
2847 /* Get the position */
2848 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2849
2850 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2851 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2852 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2853
2854 current_annotation = "NDC";
2855 dst_reg ndc_w = ndc;
2856 ndc_w.writemask = WRITEMASK_W;
2857 src_reg pos_w = pos;
2858 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2859 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2860
2861 dst_reg ndc_xyz = ndc;
2862 ndc_xyz.writemask = WRITEMASK_XYZ;
2863
2864 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2865 }
2866
2867 void
2868 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2869 {
2870 if (brw->gen < 6 &&
2871 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2872 key->userclip_active || brw->has_negative_rhw_bug)) {
2873 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2874 dst_reg header1_w = header1;
2875 header1_w.writemask = WRITEMASK_W;
2876
2877 emit(MOV(header1, 0u));
2878
2879 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2880 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2881
2882 current_annotation = "Point size";
2883 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2884 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2885 }
2886
2887 if (key->userclip_active) {
2888 current_annotation = "Clipping flags";
2889 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2890 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2891
2892 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2893 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2894 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2895
2896 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2897 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2898 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2899 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2900 }
2901
2902 /* i965 clipping workaround:
2903 * 1) Test for -ve rhw
2904 * 2) If set,
2905 * set ndc = (0,0,0,0)
2906 * set ucp[6] = 1
2907 *
2908 * Later, clipping will detect ucp[6] and ensure the primitive is
2909 * clipped against all fixed planes.
2910 */
2911 if (brw->has_negative_rhw_bug) {
2912 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2913 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2914 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2915 vec4_instruction *inst;
2916 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2917 inst->predicate = BRW_PREDICATE_NORMAL;
2918 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2919 inst->predicate = BRW_PREDICATE_NORMAL;
2920 }
2921
2922 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2923 } else if (brw->gen < 6) {
2924 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2925 } else {
2926 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2927 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2928 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2929 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2930 }
2931 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2932 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2933 src_reg(output_reg[VARYING_SLOT_LAYER])));
2934 }
2935 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2936 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2937 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2938 }
2939 }
2940 }
2941
2942 void
2943 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2944 {
2945 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2946 *
2947 * "If a linked set of shaders forming the vertex stage contains no
2948 * static write to gl_ClipVertex or gl_ClipDistance, but the
2949 * application has requested clipping against user clip planes through
2950 * the API, then the coordinate written to gl_Position is used for
2951 * comparison against the user clip planes."
2952 *
2953 * This function is only called if the shader didn't write to
2954 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2955 * if the user wrote to it; otherwise we use gl_Position.
2956 */
2957 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2958 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2959 clip_vertex = VARYING_SLOT_POS;
2960 }
2961
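/* Descriptive note: each iteration below computes one clip distance as a DP4
 * of gl_ClipVertex (or gl_Position) against one user clip plane, written to
 * a single channel of `reg`.
 */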
2962 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2963 ++i) {
2964 reg.writemask = 1 << i;
2965 emit(DP4(reg,
2966 src_reg(output_reg[clip_vertex]),
2967 src_reg(this->userplane[i + offset])));
2968 }
2969 }
2970
2971 void
2972 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2973 {
2974 assert (varying < VARYING_SLOT_MAX);
2975 reg.type = output_reg[varying].type;
2976 current_annotation = output_reg_annotation[varying];
2977 /* Copy the register, saturating if necessary */
2978 vec4_instruction *inst = emit(MOV(reg,
2979 src_reg(output_reg[varying])));
2980 if ((varying == VARYING_SLOT_COL0 ||
2981 varying == VARYING_SLOT_COL1 ||
2982 varying == VARYING_SLOT_BFC0 ||
2983 varying == VARYING_SLOT_BFC1) &&
2984 key->clamp_vertex_color) {
2985 inst->saturate = true;
2986 }
2987 }
2988
2989 void
2990 vec4_visitor::emit_urb_slot(int mrf, int varying)
2991 {
2992 struct brw_reg hw_reg = brw_message_reg(mrf);
2993 dst_reg reg = dst_reg(MRF, mrf);
2994 reg.type = BRW_REGISTER_TYPE_F;
2995
2996 switch (varying) {
2997 case VARYING_SLOT_PSIZ:
2998 /* PSIZ is always in slot 0, and is coupled with other flags. */
2999 current_annotation = "indices, point width, clip flags";
3000 emit_psiz_and_flags(hw_reg);
3001 break;
3002 case BRW_VARYING_SLOT_NDC:
3003 current_annotation = "NDC";
3004 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3005 break;
3006 case VARYING_SLOT_POS:
3007 current_annotation = "gl_Position";
3008 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3009 break;
3010 case VARYING_SLOT_EDGE:
3011 /* This is present when doing unfilled polygons. We're supposed to copy
3012 * the edge flag from the user-provided vertex array
3013 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3014 * of that attribute (starts as 1.0f). This is then used in clipping to
3015 * determine which edges should be drawn as wireframe.
3016 */
3017 current_annotation = "edge flag";
3018 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3019 glsl_type::float_type, WRITEMASK_XYZW))));
3020 break;
3021 case BRW_VARYING_SLOT_PAD:
3022 /* No need to write to this slot */
3023 break;
3024 default:
3025 emit_generic_urb_slot(reg, varying);
3026 break;
3027 }
3028 }
3029
3030 static int
3031 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3032 {
3033 if (brw->gen >= 6) {
3034 /* URB data written (does not include the message header reg) must
3035 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3036 * section 5.4.3.2.2: URB_INTERLEAVED.
3037 *
3038 * URB entries are allocated on a multiple of 1024 bits, so an
3039 * extra 128 bits written here to make the end align to 256 is
3040 * no problem.
3041 */
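/* Illustrative: a header plus three data registers (mlen == 4) is padded to
 * mlen == 5 so the four data registers form two full interleaved URB rows.
 */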
3042 if ((mlen % 2) != 1)
3043 mlen++;
3044 }
3045
3046 return mlen;
3047 }
3048
3049
3050 /**
3051 * Generates the VUE payload plus the necessary URB write instructions to
3052 * output it.
3053 *
3054 * The VUE layout is documented in Volume 2a.
3055 */
3056 void
3057 vec4_visitor::emit_vertex()
3058 {
3059 /* MRF 0 is reserved for the debugger, so start with message header
3060 * in MRF 1.
3061 */
3062 int base_mrf = 1;
3063 int mrf = base_mrf;
3064 /* In the process of generating our URB write message contents, we
3065 * may need to unspill a register or load from an array. Those
3066 * reads would use MRFs 14-15.
3067 */
3068 int max_usable_mrf = 13;
3069
3070 /* The following assertion verifies that max_usable_mrf causes an
3071 * even-numbered amount of URB write data, which will meet gen6's
3072 * requirements for length alignment.
3073 */
3074 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3075
3076 /* First mrf is the g0-based message header containing URB handles and
3077 * such.
3078 */
3079 emit_urb_write_header(mrf++);
3080
3081 if (brw->gen < 6) {
3082 emit_ndc_computation();
3083 }
3084
3085 /* Lower legacy ff and ClipVertex clipping to clip distances */
3086 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3087 current_annotation = "user clip distances";
3088
3089 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3090 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3091
3092 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3093 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3094 }
3095
3096 /* We may need to split this up into several URB writes, so do them in a
3097 * loop.
3098 */
3099 int slot = 0;
3100 bool complete = false;
3101 do {
3102 /* URB offset is in URB row increments, and each of our MRFs is half of
3103 * one of those, since we're doing interleaved writes.
3104 */
3105 int offset = slot / 2;
3106
3107 mrf = base_mrf + 1;
3108 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3109 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3110
3111 /* If this was max_usable_mrf, we can't fit anything more into this
3112 * URB WRITE.
3113 */
3114 if (mrf > max_usable_mrf) {
3115 slot++;
3116 break;
3117 }
3118 }
3119
3120 complete = slot >= prog_data->vue_map.num_slots;
3121 current_annotation = "URB write";
3122 vec4_instruction *inst = emit_urb_write_opcode(complete);
3123 inst->base_mrf = base_mrf;
3124 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3125 inst->offset += offset;
3126 } while(!complete);
3127 }
3128
3129
3130 src_reg
3131 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3132 src_reg *reladdr, int reg_offset)
3133 {
3134 /* Because we store the values to scratch interleaved like our
3135 * vertex data, we need to scale the vec4 index by 2.
3136 */
3137 int message_header_scale = 2;
3138
3139 /* Pre-gen6, the message header uses byte offsets instead of vec4
3140 * (16-byte) offset units.
3141 */
3142 if (brw->gen < 6)
3143 message_header_scale *= 16;
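/* Illustrative: reg_offset 3 becomes a message offset of 6 on gen6+
 * (interleaved vec4 rows) or 96 on older gens (bytes).
 */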
3144
3145 if (reladdr) {
3146 src_reg index = src_reg(this, glsl_type::int_type);
3147
3148 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3149 emit_before(inst, MUL(dst_reg(index),
3150 index, src_reg(message_header_scale)));
3151
3152 return index;
3153 } else {
3154 return src_reg(reg_offset * message_header_scale);
3155 }
3156 }
3157
3158 src_reg
3159 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3160 src_reg *reladdr, int reg_offset)
3161 {
3162 if (reladdr) {
3163 src_reg index = src_reg(this, glsl_type::int_type);
3164
3165 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3166
3167 /* Pre-gen6, the message header uses byte offsets instead of vec4
3168 * (16-byte) offset units.
3169 */
3170 if (brw->gen < 6) {
3171 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3172 }
3173
3174 return index;
3175 } else if (brw->gen >= 8) {
3176 /* Store the offset in a GRF so we can send-from-GRF. */
3177 src_reg offset = src_reg(this, glsl_type::int_type);
3178 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3179 return offset;
3180 } else {
3181 int message_header_scale = brw->gen < 6 ? 16 : 1;
3182 return src_reg(reg_offset * message_header_scale);
3183 }
3184 }
3185
3186 /**
3187 * Emits an instruction before @inst to load the value named by @orig_src
3188 * from scratch space at @base_offset to @temp.
3189 *
3190 * @base_offset is measured in 32-byte units (the size of a register).
3191 */
3192 void
3193 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3194 dst_reg temp, src_reg orig_src,
3195 int base_offset)
3196 {
3197 int reg_offset = base_offset + orig_src.reg_offset;
3198 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3199
3200 emit_before(inst, SCRATCH_READ(temp, index));
3201 }
3202
3203 /**
3204 * Emits an instruction after @inst to store the value to be written
3205 * to @orig_dst to scratch space at @base_offset, from @temp.
3206 *
3207 * @base_offset is measured in 32-byte units (the size of a register).
3208 */
3209 void
3210 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3211 {
3212 int reg_offset = base_offset + inst->dst.reg_offset;
3213 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3214
3215 /* Create a temporary register to store *inst's result in.
3216 *
3217 * We have to be careful in MOVing from our temporary result register in
3218 * the scratch write. If we swizzle from channels of the temporary that
3219 * weren't initialized, it will confuse live interval analysis, which will
3220 * make spilling fail to make progress.
3221 */
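/* Illustrative: if only .y of the destination is written, temp is read back
 * as .yyyy so no uninitialized channel is ever sourced.
 */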
3222 src_reg temp = src_reg(this, glsl_type::vec4_type);
3223 temp.type = inst->dst.type;
3224 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3225 int swizzles[4];
3226 for (int i = 0; i < 4; i++)
3227 if (inst->dst.writemask & (1 << i))
3228 swizzles[i] = i;
3229 else
3230 swizzles[i] = first_writemask_chan;
3231 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3232 swizzles[2], swizzles[3]);
3233
3234 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3235 inst->dst.writemask));
3236 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3237 write->predicate = inst->predicate;
3238 write->ir = inst->ir;
3239 write->annotation = inst->annotation;
3240 inst->insert_after(write);
3241
3242 inst->dst.file = temp.file;
3243 inst->dst.reg = temp.reg;
3244 inst->dst.reg_offset = temp.reg_offset;
3245 inst->dst.reladdr = NULL;
3246 }
3247
3248 /**
3249 * We can't generally support array access in GRF space, because a
3250 * single instruction's destination can only span 2 contiguous
3251 * registers. So, we send all GRF arrays that get variable index
3252 * access to scratch space.
3253 */
3254 void
3255 vec4_visitor::move_grf_array_access_to_scratch()
3256 {
3257 int scratch_loc[this->virtual_grf_count];
3258
3259 for (int i = 0; i < this->virtual_grf_count; i++) {
3260 scratch_loc[i] = -1;
3261 }
3262
3263 /* First, calculate the set of virtual GRFs that need to be punted
3264 * to scratch due to having any array access on them, and where in
3265 * scratch.
3266 */
3267 foreach_in_list(vec4_instruction, inst, &instructions) {
3268 if (inst->dst.file == GRF && inst->dst.reladdr &&
3269 scratch_loc[inst->dst.reg] == -1) {
3270 scratch_loc[inst->dst.reg] = c->last_scratch;
3271 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3272 }
3273
3274 for (int i = 0 ; i < 3; i++) {
3275 src_reg *src = &inst->src[i];
3276
3277 if (src->file == GRF && src->reladdr &&
3278 scratch_loc[src->reg] == -1) {
3279 scratch_loc[src->reg] = c->last_scratch;
3280 c->last_scratch += this->virtual_grf_sizes[src->reg];
3281 }
3282 }
3283 }
3284
3285 /* Now, for anything that will be accessed through scratch, rewrite
3286 * it to load/store. Note that this is a _safe list walk, because
3287 * we may generate a new scratch_write instruction after the one
3288 * we're processing.
3289 */
3290 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3291 /* Set up the annotation tracking for newly generated instructions. */
3292 base_ir = inst->ir;
3293 current_annotation = inst->annotation;
3294
3295 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3296 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3297 }
3298
3299 for (int i = 0 ; i < 3; i++) {
3300 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3301 continue;
3302
3303 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3304
3305 emit_scratch_read(inst, temp, inst->src[i],
3306 scratch_loc[inst->src[i].reg]);
3307
3308 inst->src[i].file = temp.file;
3309 inst->src[i].reg = temp.reg;
3310 inst->src[i].reg_offset = temp.reg_offset;
3311 inst->src[i].reladdr = NULL;
3312 }
3313 }
3314 }
3315
3316 /**
3317 * Emits an instruction before @inst to load the value named by @orig_src
3318 * from the pull constant buffer (surface) at @base_offset to @temp.
3319 */
3320 void
3321 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3322 dst_reg temp, src_reg orig_src,
3323 int base_offset)
3324 {
3325 int reg_offset = base_offset + orig_src.reg_offset;
3326 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3327 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3328 vec4_instruction *load;
3329
3330 if (brw->gen >= 7) {
3331 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3332 grf_offset.type = offset.type;
3333 emit_before(inst, MOV(grf_offset, offset));
3334
3335 load = new(mem_ctx) vec4_instruction(this,
3336 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3337 temp, index, src_reg(grf_offset));
3338 } else {
3339 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3340 temp, index, offset);
3341 load->base_mrf = 14;
3342 load->mlen = 1;
3343 }
3344 emit_before(inst, load);
3345 }
3346
3347 /**
3348 * Implements array access of uniforms by inserting a
3349 * PULL_CONSTANT_LOAD instruction.
3350 *
3351 * Unlike temporary GRF array access (where we don't support it due to
3352 * the difficulty of doing relative addressing on instruction
3353 * destinations), we could potentially do array access of uniforms
3354 * that were loaded in GRF space as push constants. In real-world
3355 * usage we've seen, though, the arrays being used are always larger
3356 * than we could load as push constants, so just always move all
3357 * uniform array access out to a pull constant buffer.
3358 */
3359 void
3360 vec4_visitor::move_uniform_array_access_to_pull_constants()
3361 {
3362 int pull_constant_loc[this->uniforms];
3363
3364 for (int i = 0; i < this->uniforms; i++) {
3365 pull_constant_loc[i] = -1;
3366 }
3367
3368 /* Walk through and find array access of uniforms. Put a copy of that
3369 * uniform in the pull constant buffer.
3370 *
3371 * Note that we don't move constant-indexed accesses to arrays. No
3372 * testing has been done of the performance impact of this choice.
3373 */
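/* Illustrative (hypothetical names): for an indirectly indexed uniform array
 * access `u[i]`, every element of u is appended to pull_param[] and the
 * access is rewritten to read a fresh temporary filled by a pull constant
 * load.
 */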
3374 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3375 for (int i = 0 ; i < 3; i++) {
3376 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3377 continue;
3378
3379 int uniform = inst->src[i].reg;
3380
3381 /* If this array isn't already present in the pull constant buffer,
3382 * add it.
3383 */
3384 if (pull_constant_loc[uniform] == -1) {
3385 const gl_constant_value **values =
3386 &stage_prog_data->param[uniform * 4];
3387
3388 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3389
3390 assert(uniform < uniform_array_size);
3391 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3392 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3393 = values[j];
3394 }
3395 }
3396
3397 /* Set up the annotation tracking for newly generated instructions. */
3398 base_ir = inst->ir;
3399 current_annotation = inst->annotation;
3400
3401 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3402
3403 emit_pull_constant_load(inst, temp, inst->src[i],
3404 pull_constant_loc[uniform]);
3405
3406 inst->src[i].file = temp.file;
3407 inst->src[i].reg = temp.reg;
3408 inst->src[i].reg_offset = temp.reg_offset;
3409 inst->src[i].reladdr = NULL;
3410 }
3411 }
3412
3413 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3414 * no need to track them as larger-than-vec4 objects. This will be
3415 * relied on in cutting out unused uniform vectors from push
3416 * constants.
3417 */
3418 split_uniform_registers();
3419 }
3420
3421 void
3422 vec4_visitor::resolve_ud_negate(src_reg *reg)
3423 {
3424 if (reg->type != BRW_REGISTER_TYPE_UD ||
3425 !reg->negate)
3426 return;
3427
3428 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3429 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3430 *reg = temp;
3431 }
3432
3433 vec4_visitor::vec4_visitor(struct brw_context *brw,
3434 struct brw_vec4_compile *c,
3435 struct gl_program *prog,
3436 const struct brw_vec4_prog_key *key,
3437 struct brw_vec4_prog_data *prog_data,
3438 struct gl_shader_program *shader_prog,
3439 gl_shader_stage stage,
3440 void *mem_ctx,
3441 bool debug_flag,
3442 bool no_spills,
3443 shader_time_shader_type st_base,
3444 shader_time_shader_type st_written,
3445 shader_time_shader_type st_reset)
3446 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3447 c(c),
3448 key(key),
3449 prog_data(prog_data),
3450 sanity_param_count(0),
3451 fail_msg(NULL),
3452 first_non_payload_grf(0),
3453 need_all_constants_in_pull_buffer(false),
3454 debug_flag(debug_flag),
3455 no_spills(no_spills),
3456 st_base(st_base),
3457 st_written(st_written),
3458 st_reset(st_reset)
3459 {
3460 this->mem_ctx = mem_ctx;
3461 this->failed = false;
3462
3463 this->base_ir = NULL;
3464 this->current_annotation = NULL;
3465 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3466
3467 this->variable_ht = hash_table_ctor(0,
3468 hash_table_pointer_hash,
3469 hash_table_pointer_compare);
3470
3471 this->virtual_grf_start = NULL;
3472 this->virtual_grf_end = NULL;
3473 this->virtual_grf_sizes = NULL;
3474 this->virtual_grf_count = 0;
3475 this->virtual_grf_reg_map = NULL;
3476 this->virtual_grf_reg_count = 0;
3477 this->virtual_grf_array_size = 0;
3478 this->live_intervals_valid = false;
3479
3480 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3481
3482 this->uniforms = 0;
3483
3484 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3485 * at least one. See setup_uniforms() in brw_vec4.cpp.
3486 */
3487 this->uniform_array_size = 1;
3488 if (prog_data) {
3489 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3490 }
3491
3492 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3493 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3494 }
3495
3496 vec4_visitor::~vec4_visitor()
3497 {
3498 hash_table_dtor(this->variable_ht);
3499 }
3500
3501
3502 void
3503 vec4_visitor::fail(const char *format, ...)
3504 {
3505 va_list va;
3506 char *msg;
3507
3508 if (failed)
3509 return;
3510
3511 failed = true;
3512
3513 va_start(va, format);
3514 msg = ralloc_vasprintf(mem_ctx, format, va);
3515 va_end(va);
3516 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3517
3518 this->fail_msg = msg;
3519
3520 if (debug_flag) {
3521 fprintf(stderr, "%s", msg);
3522 }
3523 }
3524
3525 } /* namespace brw */