i965: Get rid of backend_instruction::sampler
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
 225 /* The original Gen4 hardware does type conversion to the destination
 226 * type before comparison, producing garbage results for floating-point
 227 * comparisons.
 228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
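/* Emit a DP2, DP3, or DP4 according to the number of components in the
 * dot product (2, 3, or 4).
 */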
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
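/* As a concrete example of the packing below: packHalf2x16(vec2(1.0, -2.0))
 * is 0xc0003c00 -- 0x3c00 (1.0h) lands in the low word from the x channel,
 * and 0xc000 (-2.0h) in the high word from the y channel.
 */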
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
 491 * The upper word of each write-channel must be 0 for the
 492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
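/* For example, unpackHalf2x16(0xc0003c00u) yields vec2(1.0, -2.0): the low
 * word 0x3c00 becomes the x channel and the high word 0xc000 becomes y.
 */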
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
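/* Size of a variable of the given type, measured in vec4 slots (one
 * 128-bit register per slot).
 */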
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
 574 /* Regardless of the size of the vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
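/* Allocate a new virtual GRF that is "size" vec4s long, growing the
 * size and register-map tracking arrays as needed, and return its index.
 */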
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = &components->f;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static float zero = 0;
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
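/* Set up one vec4 uniform for each enabled user clip plane, with its
 * channels pointing at the clip plane values selected from GL state.
 */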
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
719 }
720 ++this->uniforms;
721 }
722 }
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
728 void
729 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 735 /* This state reference has already been set up by ir_to_mesa,
736 * but we'll get the same index back here. We can reference
737 * ParameterValues directly, since unlike brw_fs.cpp, we never
738 * add new state references during compile.
739 */
740 int index = _mesa_add_state_reference(this->prog->Parameters,
741 (gl_state_index *)slots[i].tokens);
742 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
743
744 assert(this->uniforms < uniform_array_size);
745 this->uniform_vector_size[this->uniforms] = 0;
746 /* Add each of the unique swizzled channels of the element.
747 * This will end up matching the size of the glsl_type of this field.
748 */
749 int last_swiz = -1;
750 for (unsigned int j = 0; j < 4; j++) {
751 int swiz = GET_SWZ(slots[i].swizzle, j);
752 last_swiz = swiz;
753
754 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
755 assert(this->uniforms < uniform_array_size);
756 if (swiz <= last_swiz)
757 this->uniform_vector_size[this->uniforms]++;
758 }
759 this->uniforms++;
760 }
761 }
762
763 dst_reg *
764 vec4_visitor::variable_storage(ir_variable *var)
765 {
766 return (dst_reg *)hash_table_find(this->variable_ht, var);
767 }
768
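/* Emit instructions that set the flag register from a boolean rvalue, and
 * report in *predicate which predicate later instructions should use to
 * test it.
 */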
769 void
770 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
771 enum brw_predicate *predicate)
772 {
773 ir_expression *expr = ir->as_expression();
774
775 *predicate = BRW_PREDICATE_NORMAL;
776
777 if (expr) {
778 src_reg op[2];
779 vec4_instruction *inst;
780
781 assert(expr->get_num_operands() <= 2);
782 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
783 expr->operands[i]->accept(this);
784 op[i] = this->result;
785
786 resolve_ud_negate(&op[i]);
787 }
788
789 switch (expr->operation) {
790 case ir_unop_logic_not:
791 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
792 inst->conditional_mod = BRW_CONDITIONAL_Z;
793 break;
794
795 case ir_binop_logic_xor:
796 inst = emit(XOR(dst_null_d(), op[0], op[1]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 break;
799
800 case ir_binop_logic_or:
801 inst = emit(OR(dst_null_d(), op[0], op[1]));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 break;
804
805 case ir_binop_logic_and:
806 inst = emit(AND(dst_null_d(), op[0], op[1]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 break;
809
810 case ir_unop_f2b:
811 if (brw->gen >= 6) {
812 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
813 } else {
814 inst = emit(MOV(dst_null_f(), op[0]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 }
817 break;
818
819 case ir_unop_i2b:
820 if (brw->gen >= 6) {
821 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
822 } else {
823 inst = emit(MOV(dst_null_d(), op[0]));
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 }
826 break;
827
828 case ir_binop_all_equal:
829 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
830 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
831 break;
832
833 case ir_binop_any_nequal:
834 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
835 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
836 break;
837
838 case ir_unop_any:
839 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
841 break;
842
843 case ir_binop_greater:
844 case ir_binop_gequal:
845 case ir_binop_less:
846 case ir_binop_lequal:
847 case ir_binop_equal:
848 case ir_binop_nequal:
849 emit(CMP(dst_null_d(), op[0], op[1],
850 brw_conditional_for_comparison(expr->operation)));
851 break;
852
853 default:
854 unreachable("not reached");
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr) {
883 src_reg op[2];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 2);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 default:
947 unreachable("not reached");
948 }
949 return;
950 }
951
952 ir->condition->accept(this);
953
954 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
955 }
956
957 void
958 vec4_visitor::visit(ir_variable *ir)
959 {
960 dst_reg *reg = NULL;
961
962 if (variable_storage(ir))
963 return;
964
965 switch (ir->data.mode) {
966 case ir_var_shader_in:
967 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
968 break;
969
970 case ir_var_shader_out:
971 reg = new(mem_ctx) dst_reg(this, ir->type);
972
973 for (int i = 0; i < type_size(ir->type); i++) {
974 output_reg[ir->data.location + i] = *reg;
975 output_reg[ir->data.location + i].reg_offset = i;
976 output_reg[ir->data.location + i].type =
977 brw_type_for_base_type(ir->type->get_scalar_type());
978 output_reg_annotation[ir->data.location + i] = ir->name;
979 }
980 break;
981
982 case ir_var_auto:
983 case ir_var_temporary:
984 reg = new(mem_ctx) dst_reg(this, ir->type);
985 break;
986
987 case ir_var_uniform:
988 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
989
990 /* Thanks to the lower_ubo_reference pass, we will see only
991 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
992 * variables, so no need for them to be in variable_ht.
993 *
994 * Atomic counters take no uniform storage, no need to do
995 * anything here.
996 */
997 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
998 return;
999
1000 /* Track how big the whole uniform variable is, in case we need to put a
1001 * copy of its data into pull constants for array access.
1002 */
1003 assert(this->uniforms < uniform_array_size);
1004 this->uniform_size[this->uniforms] = type_size(ir->type);
1005
1006 if (!strncmp(ir->name, "gl_", 3)) {
1007 setup_builtin_uniform_values(ir);
1008 } else {
1009 setup_uniform_values(ir);
1010 }
1011 break;
1012
1013 case ir_var_system_value:
1014 reg = make_reg_for_system_value(ir);
1015 break;
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020
1021 reg->type = brw_type_for_base_type(ir->type);
1022 hash_table_insert(this->variable_ht, reg, ir);
1023 }
1024
1025 void
1026 vec4_visitor::visit(ir_loop *ir)
1027 {
1028 /* We don't want debugging output to print the whole body of the
1029 * loop as the annotation.
1030 */
1031 this->base_ir = NULL;
1032
1033 emit(BRW_OPCODE_DO);
1034
1035 visit_instructions(&ir->body_instructions);
1036
1037 emit(BRW_OPCODE_WHILE);
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_loop_jump *ir)
1042 {
1043 switch (ir->mode) {
1044 case ir_loop_jump::jump_break:
1045 emit(BRW_OPCODE_BREAK);
1046 break;
1047 case ir_loop_jump::jump_continue:
1048 emit(BRW_OPCODE_CONTINUE);
1049 break;
1050 }
1051 }
1052
1053
1054 void
1055 vec4_visitor::visit(ir_function_signature *)
1056 {
1057 unreachable("not reached");
1058 }
1059
1060 void
1061 vec4_visitor::visit(ir_function *ir)
1062 {
1063 /* Ignore function bodies other than main() -- we shouldn't see calls to
1064 * them since they should all be inlined.
1065 */
1066 if (strcmp(ir->name, "main") == 0) {
1067 const ir_function_signature *sig;
1068 exec_list empty;
1069
1070 sig = ir->matching_signature(NULL, &empty, false);
1071
1072 assert(sig);
1073
1074 visit_instructions(&sig->body);
1075 }
1076 }
1077
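/* If this expression is expressible as a saturate of some rvalue, emit it
 * as a single saturating MOV of that rvalue and return true.
 */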
1078 bool
1079 vec4_visitor::try_emit_sat(ir_expression *ir)
1080 {
1081 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1082 if (!sat_src)
1083 return false;
1084
1085 sat_src->accept(this);
1086 src_reg src = this->result;
1087
1088 this->result = src_reg(this, ir->type);
1089 vec4_instruction *inst;
1090 inst = emit(MOV(dst_reg(this->result), src));
1091 inst->saturate = true;
1092
1093 return true;
1094 }
1095
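/* Try to fold a multiply-add expression (with the multiply in either
 * operand of the add) into a single MAD instruction; returns false if the
 * pattern doesn't match or the hardware can't do it.
 */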
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
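/* Try to emit b2f(comparison) directly as a CMP plus a predicated SEL of
 * 1.0f, skipping the intermediate 0/1 boolean; returns false if operand 0
 * isn't a comparison.
 */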
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 ir_expression *const cmp = ir->operands[0]->as_expression();
1137
1138 if (cmp == NULL)
1139 return false;
1140
1141 switch (cmp->operation) {
1142 case ir_binop_less:
1143 case ir_binop_greater:
1144 case ir_binop_lequal:
1145 case ir_binop_gequal:
1146 case ir_binop_equal:
1147 case ir_binop_nequal:
1148 break;
1149
1150 default:
1151 return false;
1152 }
1153
1154 cmp->operands[0]->accept(this);
1155 const src_reg cmp_src0 = this->result;
1156
1157 cmp->operands[1]->accept(this);
1158 const src_reg cmp_src1 = this->result;
1159
1160 this->result = src_reg(this, ir->type);
1161
1162 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1163 brw_conditional_for_comparison(cmp->operation)));
1164
1165 /* If the comparison is false, this->result will just happen to be zero.
1166 */
1167 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1168 this->result, src_reg(1.0f));
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = true;
1171
1172 return true;
1173 }
1174
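/* Emit a MIN/MAX operation: a single SEL with a conditional mod on Gen6+,
 * or a CMP followed by a predicated SEL on older generations.
 */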
1175 void
1176 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1177 src_reg src0, src_reg src1)
1178 {
1179 vec4_instruction *inst;
1180
1181 if (brw->gen >= 6) {
1182 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1183 inst->conditional_mod = conditionalmod;
1184 } else {
1185 emit(CMP(dst, src0, src1, conditionalmod));
1186
1187 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189 }
1190 }
1191
1192 void
1193 vec4_visitor::emit_lrp(const dst_reg &dst,
1194 const src_reg &x, const src_reg &y, const src_reg &a)
1195 {
1196 if (brw->gen >= 6) {
1197 /* Note that the instruction's argument order is reversed from GLSL
1198 * and the IR.
1199 */
1200 emit(LRP(dst,
1201 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1202 } else {
1203 /* Earlier generations don't support three source operations, so we
1204 * need to emit x*(1-a) + y*a.
1205 */
1206 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1207 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1208 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1209 y_times_a.writemask = dst.writemask;
1210 one_minus_a.writemask = dst.writemask;
1211 x_times_one_minus_a.writemask = dst.writemask;
1212
1213 emit(MUL(y_times_a, y, a));
1214 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1215 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1216 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1217 }
1218 }
1219
1220 void
1221 vec4_visitor::visit(ir_expression *ir)
1222 {
1223 unsigned int operand;
1224 src_reg op[Elements(ir->operands)];
1225 src_reg result_src;
1226 dst_reg result_dst;
1227 vec4_instruction *inst;
1228
1229 if (try_emit_sat(ir))
1230 return;
1231
1232 if (ir->operation == ir_binop_add) {
1233 if (try_emit_mad(ir))
1234 return;
1235 }
1236
1237 if (ir->operation == ir_unop_b2f) {
1238 if (try_emit_b2f_of_compare(ir))
1239 return;
1240 }
1241
1242 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1243 this->result.file = BAD_FILE;
1244 ir->operands[operand]->accept(this);
1245 if (this->result.file == BAD_FILE) {
1246 fprintf(stderr, "Failed to get tree for expression operand:\n");
1247 ir->operands[operand]->fprint(stderr);
1248 exit(1);
1249 }
1250 op[operand] = this->result;
1251
1252 /* Matrix expression operands should have been broken down to vector
1253 * operations already.
1254 */
1255 assert(!ir->operands[operand]->type->is_matrix());
1256 }
1257
1258 int vector_elements = ir->operands[0]->type->vector_elements;
1259 if (ir->operands[1]) {
1260 vector_elements = MAX2(vector_elements,
1261 ir->operands[1]->type->vector_elements);
1262 }
1263
1264 this->result.file = BAD_FILE;
1265
1266 /* Storage for our result. Ideally for an assignment we'd be using
1267 * the actual storage for the result here, instead.
1268 */
1269 result_src = src_reg(this, ir->type);
1270 /* convenience for the emit functions below. */
1271 result_dst = dst_reg(result_src);
1272 /* If nothing special happens, this is the result. */
1273 this->result = result_src;
1274 /* Limit writes to the channels that will be used by result_src later.
1275 * This does limit this temp's use as a temporary for multi-instruction
1276 * sequences.
1277 */
1278 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1279
1280 switch (ir->operation) {
1281 case ir_unop_logic_not:
 1282 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
 1283 * the one's complement of the whole register, not just bit 0.
1284 */
1285 emit(XOR(result_dst, op[0], src_reg(1)));
1286 break;
1287 case ir_unop_neg:
1288 op[0].negate = !op[0].negate;
1289 emit(MOV(result_dst, op[0]));
1290 break;
1291 case ir_unop_abs:
1292 op[0].abs = true;
1293 op[0].negate = false;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296
1297 case ir_unop_sign:
1298 if (ir->type->is_float()) {
1299 /* AND(val, 0x80000000) gives the sign bit.
1300 *
1301 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1302 * zero.
1303 */
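/* For example, with val == -2.5f (0xc0200000) the AND keeps 0x80000000,
 * and the predicated OR produces 0xbf800000, i.e. -1.0f.
 */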
1304 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1305
1306 op[0].type = BRW_REGISTER_TYPE_UD;
1307 result_dst.type = BRW_REGISTER_TYPE_UD;
1308 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1309
1310 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 this->result.type = BRW_REGISTER_TYPE_F;
1314 } else {
1315 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1316 * -> non-negative val generates 0x00000000.
1317 * Predicated OR sets 1 if val is positive.
1318 */
1319 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1320
1321 emit(ASR(result_dst, op[0], src_reg(31)));
1322
1323 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 }
1326 break;
1327
1328 case ir_unop_rcp:
1329 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1330 break;
1331
1332 case ir_unop_exp2:
1333 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1334 break;
1335 case ir_unop_log2:
1336 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1337 break;
1338 case ir_unop_exp:
1339 case ir_unop_log:
1340 unreachable("not reached: should be handled by ir_explog_to_explog2");
1341 case ir_unop_sin:
1342 case ir_unop_sin_reduced:
1343 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1344 break;
1345 case ir_unop_cos:
1346 case ir_unop_cos_reduced:
1347 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1348 break;
1349
1350 case ir_unop_dFdx:
1351 case ir_unop_dFdy:
1352 unreachable("derivatives not valid in vertex shader");
1353
1354 case ir_unop_bitfield_reverse:
1355 emit(BFREV(result_dst, op[0]));
1356 break;
1357 case ir_unop_bit_count:
1358 emit(CBIT(result_dst, op[0]));
1359 break;
1360 case ir_unop_find_msb: {
1361 src_reg temp = src_reg(this, glsl_type::uint_type);
1362
1363 inst = emit(FBH(dst_reg(temp), op[0]));
1364 inst->dst.writemask = WRITEMASK_XYZW;
1365
1366 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1367 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1368 * subtract the result from 31 to convert the MSB count into an LSB count.
1369 */
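/* For example, for an input of 2 FBH returns 30 (counting down from bit
 * 31), and 31 - 30 = 1 is the index findMSB() expects. For an input of 0,
 * FBH returns 0xffffffff, the CMP below fails, and the result stays at -1.
 */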
1370
1371 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1372 temp.swizzle = BRW_SWIZZLE_NOOP;
1373 emit(MOV(result_dst, temp));
1374
1375 src_reg src_tmp = src_reg(result_dst);
1376 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1377
1378 src_tmp.negate = true;
1379 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1380 inst->predicate = BRW_PREDICATE_NORMAL;
1381 break;
1382 }
1383 case ir_unop_find_lsb:
1384 emit(FBL(result_dst, op[0]));
1385 break;
1386
1387 case ir_unop_noise:
1388 unreachable("not reached: should be handled by lower_noise");
1389
1390 case ir_binop_add:
1391 emit(ADD(result_dst, op[0], op[1]));
1392 break;
1393 case ir_binop_sub:
1394 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1395
1396 case ir_binop_mul:
1397 if (brw->gen < 8 && ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits of one of
1399 * the operands (src0 through SNB, src1 on IVB and later). The MACH
 1400 * accumulates the contribution of the upper 16 bits of that
1401 * operand. If we can determine that one of the args is in the low
1402 * 16 bits, though, we can just emit a single MUL.
1403 */
1404 if (ir->operands[0]->is_uint16_constant()) {
1405 if (brw->gen < 7)
1406 emit(MUL(result_dst, op[0], op[1]));
1407 else
1408 emit(MUL(result_dst, op[1], op[0]));
1409 } else if (ir->operands[1]->is_uint16_constant()) {
1410 if (brw->gen < 7)
1411 emit(MUL(result_dst, op[1], op[0]));
1412 else
1413 emit(MUL(result_dst, op[0], op[1]));
1414 } else {
1415 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1416
1417 emit(MUL(acc, op[0], op[1]));
1418 emit(MACH(dst_null_d(), op[0], op[1]));
1419 emit(MOV(result_dst, src_reg(acc)));
1420 }
1421 } else {
1422 emit(MUL(result_dst, op[0], op[1]));
1423 }
1424 break;
1425 case ir_binop_imul_high: {
1426 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1427
1428 emit(MUL(acc, op[0], op[1]));
1429 emit(MACH(result_dst, op[0], op[1]));
1430 break;
1431 }
1432 case ir_binop_div:
1433 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1434 assert(ir->type->is_integer());
1435 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1436 break;
1437 case ir_binop_carry: {
1438 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1439
1440 emit(ADDC(dst_null_ud(), op[0], op[1]));
1441 emit(MOV(result_dst, src_reg(acc)));
1442 break;
1443 }
1444 case ir_binop_borrow: {
1445 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1446
1447 emit(SUBB(dst_null_ud(), op[0], op[1]));
1448 emit(MOV(result_dst, src_reg(acc)));
1449 break;
1450 }
1451 case ir_binop_mod:
1452 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1453 assert(ir->type->is_integer());
1454 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1455 break;
1456
1457 case ir_binop_less:
1458 case ir_binop_greater:
1459 case ir_binop_lequal:
1460 case ir_binop_gequal:
1461 case ir_binop_equal:
1462 case ir_binop_nequal: {
1463 emit(CMP(result_dst, op[0], op[1],
1464 brw_conditional_for_comparison(ir->operation)));
1465 emit(AND(result_dst, result_src, src_reg(0x1)));
1466 break;
1467 }
1468
1469 case ir_binop_all_equal:
1470 /* "==" operator producing a scalar boolean. */
1471 if (ir->operands[0]->type->is_vector() ||
1472 ir->operands[1]->type->is_vector()) {
1473 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1474 emit(MOV(result_dst, src_reg(0)));
1475 inst = emit(MOV(result_dst, src_reg(1)));
1476 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1477 } else {
1478 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1479 emit(AND(result_dst, result_src, src_reg(0x1)));
1480 }
1481 break;
1482 case ir_binop_any_nequal:
1483 /* "!=" operator producing a scalar boolean. */
1484 if (ir->operands[0]->type->is_vector() ||
1485 ir->operands[1]->type->is_vector()) {
1486 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1487
1488 emit(MOV(result_dst, src_reg(0)));
1489 inst = emit(MOV(result_dst, src_reg(1)));
1490 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1491 } else {
1492 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1493 emit(AND(result_dst, result_src, src_reg(0x1)));
1494 }
1495 break;
1496
1497 case ir_unop_any:
1498 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1499 emit(MOV(result_dst, src_reg(0)));
1500
1501 inst = emit(MOV(result_dst, src_reg(1)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 break;
1504
1505 case ir_binop_logic_xor:
1506 emit(XOR(result_dst, op[0], op[1]));
1507 break;
1508
1509 case ir_binop_logic_or:
1510 emit(OR(result_dst, op[0], op[1]));
1511 break;
1512
1513 case ir_binop_logic_and:
1514 emit(AND(result_dst, op[0], op[1]));
1515 break;
1516
1517 case ir_binop_dot:
1518 assert(ir->operands[0]->type->is_vector());
1519 assert(ir->operands[0]->type == ir->operands[1]->type);
1520 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1521 break;
1522
1523 case ir_unop_sqrt:
1524 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1525 break;
1526 case ir_unop_rsq:
1527 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1528 break;
1529
1530 case ir_unop_bitcast_i2f:
1531 case ir_unop_bitcast_u2f:
1532 this->result = op[0];
1533 this->result.type = BRW_REGISTER_TYPE_F;
1534 break;
1535
1536 case ir_unop_bitcast_f2i:
1537 this->result = op[0];
1538 this->result.type = BRW_REGISTER_TYPE_D;
1539 break;
1540
1541 case ir_unop_bitcast_f2u:
1542 this->result = op[0];
1543 this->result.type = BRW_REGISTER_TYPE_UD;
1544 break;
1545
1546 case ir_unop_i2f:
1547 case ir_unop_i2u:
1548 case ir_unop_u2i:
1549 case ir_unop_u2f:
1550 case ir_unop_b2f:
1551 case ir_unop_b2i:
1552 case ir_unop_f2i:
1553 case ir_unop_f2u:
1554 emit(MOV(result_dst, op[0]));
1555 break;
1556 case ir_unop_f2b:
1557 case ir_unop_i2b: {
1558 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1559 emit(AND(result_dst, result_src, src_reg(1)));
1560 break;
1561 }
1562
1563 case ir_unop_trunc:
1564 emit(RNDZ(result_dst, op[0]));
1565 break;
1566 case ir_unop_ceil:
1567 op[0].negate = !op[0].negate;
1568 inst = emit(RNDD(result_dst, op[0]));
1569 this->result.negate = true;
1570 break;
1571 case ir_unop_floor:
1572 inst = emit(RNDD(result_dst, op[0]));
1573 break;
1574 case ir_unop_fract:
1575 inst = emit(FRC(result_dst, op[0]));
1576 break;
1577 case ir_unop_round_even:
1578 emit(RNDE(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_min:
1582 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1583 break;
1584 case ir_binop_max:
1585 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1586 break;
1587
1588 case ir_binop_pow:
1589 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1590 break;
1591
1592 case ir_unop_bit_not:
1593 inst = emit(NOT(result_dst, op[0]));
1594 break;
1595 case ir_binop_bit_and:
1596 inst = emit(AND(result_dst, op[0], op[1]));
1597 break;
1598 case ir_binop_bit_xor:
1599 inst = emit(XOR(result_dst, op[0], op[1]));
1600 break;
1601 case ir_binop_bit_or:
1602 inst = emit(OR(result_dst, op[0], op[1]));
1603 break;
1604
1605 case ir_binop_lshift:
1606 inst = emit(SHL(result_dst, op[0], op[1]));
1607 break;
1608
1609 case ir_binop_rshift:
1610 if (ir->type->base_type == GLSL_TYPE_INT)
1611 inst = emit(ASR(result_dst, op[0], op[1]));
1612 else
1613 inst = emit(SHR(result_dst, op[0], op[1]));
1614 break;
1615
1616 case ir_binop_bfm:
1617 emit(BFI1(result_dst, op[0], op[1]));
1618 break;
1619
1620 case ir_binop_ubo_load: {
1621 ir_constant *uniform_block = ir->operands[0]->as_constant();
1622 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1623 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1624 src_reg offset;
1625
1626 /* Now, load the vector from that offset. */
1627 assert(ir->type->is_vector() || ir->type->is_scalar());
1628
1629 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1630 packed_consts.type = result.type;
1631 src_reg surf_index =
1632 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1633 if (const_offset_ir) {
1634 if (brw->gen >= 8) {
1635 /* Store the offset in a GRF so we can send-from-GRF. */
1636 offset = src_reg(this, glsl_type::int_type);
1637 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1638 } else {
1639 /* Immediates are fine on older generations since they'll be moved
1640 * to a (potentially fake) MRF at the generator level.
1641 */
1642 offset = src_reg(const_offset / 16);
1643 }
1644 } else {
1645 offset = src_reg(this, glsl_type::uint_type);
1646 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1647 }
1648
1649 if (brw->gen >= 7) {
1650 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1651 grf_offset.type = offset.type;
1652
1653 emit(MOV(grf_offset, offset));
1654
1655 emit(new(mem_ctx) vec4_instruction(this,
1656 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1657 dst_reg(packed_consts),
1658 surf_index,
1659 src_reg(grf_offset)));
1660 } else {
1661 vec4_instruction *pull =
1662 emit(new(mem_ctx) vec4_instruction(this,
1663 VS_OPCODE_PULL_CONSTANT_LOAD,
1664 dst_reg(packed_consts),
1665 surf_index,
1666 offset));
1667 pull->base_mrf = 14;
1668 pull->mlen = 1;
1669 }
1670
1671 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1672 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1673 const_offset % 16 / 4,
1674 const_offset % 16 / 4,
1675 const_offset % 16 / 4);
1676
1677 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1678 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1679 emit(CMP(result_dst, packed_consts, src_reg(0u),
1680 BRW_CONDITIONAL_NZ));
1681 emit(AND(result_dst, result, src_reg(0x1)));
1682 } else {
1683 emit(MOV(result_dst, packed_consts));
1684 }
1685 break;
1686 }
1687
1688 case ir_binop_vector_extract:
1689 unreachable("should have been lowered by vec_index_to_cond_assign");
1690
1691 case ir_triop_fma:
1692 op[0] = fix_3src_operand(op[0]);
1693 op[1] = fix_3src_operand(op[1]);
1694 op[2] = fix_3src_operand(op[2]);
1695 /* Note that the instruction's argument order is reversed from GLSL
1696 * and the IR.
1697 */
1698 emit(MAD(result_dst, op[2], op[1], op[0]));
1699 break;
1700
1701 case ir_triop_lrp:
1702 emit_lrp(result_dst, op[0], op[1], op[2]);
1703 break;
1704
1705 case ir_triop_csel:
1706 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1707 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1708 inst->predicate = BRW_PREDICATE_NORMAL;
1709 break;
1710
1711 case ir_triop_bfi:
1712 op[0] = fix_3src_operand(op[0]);
1713 op[1] = fix_3src_operand(op[1]);
1714 op[2] = fix_3src_operand(op[2]);
1715 emit(BFI2(result_dst, op[0], op[1], op[2]));
1716 break;
1717
1718 case ir_triop_bitfield_extract:
1719 op[0] = fix_3src_operand(op[0]);
1720 op[1] = fix_3src_operand(op[1]);
1721 op[2] = fix_3src_operand(op[2]);
1722 /* Note that the instruction's argument order is reversed from GLSL
1723 * and the IR.
1724 */
1725 emit(BFE(result_dst, op[2], op[1], op[0]));
1726 break;
1727
1728 case ir_triop_vector_insert:
1729 unreachable("should have been lowered by lower_vector_insert");
1730
1731 case ir_quadop_bitfield_insert:
1732 unreachable("not reached: should be handled by "
1733 "bitfield_insert_to_bfm_bfi\n");
1734
1735 case ir_quadop_vector:
1736 unreachable("not reached: should be handled by lower_quadop_vector");
1737
1738 case ir_unop_pack_half_2x16:
1739 emit_pack_half_2x16(result_dst, op[0]);
1740 break;
1741 case ir_unop_unpack_half_2x16:
1742 emit_unpack_half_2x16(result_dst, op[0]);
1743 break;
1744 case ir_unop_pack_snorm_2x16:
1745 case ir_unop_pack_snorm_4x8:
1746 case ir_unop_pack_unorm_2x16:
1747 case ir_unop_pack_unorm_4x8:
1748 case ir_unop_unpack_snorm_2x16:
1749 case ir_unop_unpack_snorm_4x8:
1750 case ir_unop_unpack_unorm_2x16:
1751 case ir_unop_unpack_unorm_4x8:
1752 unreachable("not reached: should be handled by lower_packing_builtins");
1753 case ir_unop_unpack_half_2x16_split_x:
1754 case ir_unop_unpack_half_2x16_split_y:
1755 case ir_binop_pack_half_2x16_split:
1756 case ir_unop_interpolate_at_centroid:
1757 case ir_binop_interpolate_at_sample:
1758 case ir_binop_interpolate_at_offset:
1759 unreachable("not reached: should not occur in vertex shader");
1760 case ir_binop_ldexp:
1761 unreachable("not reached: should be handled by ldexp_to_arith()");
1762 }
1763 }
1764
1765
1766 void
1767 vec4_visitor::visit(ir_swizzle *ir)
1768 {
1769 src_reg src;
1770 int i = 0;
1771 int swizzle[4];
1772
 1773 /* Note that this handles only swizzles in expressions, not those on the left
1774 * hand side of an assignment, which do write masking. See ir_assignment
1775 * for that.
1776 */
1777
1778 ir->val->accept(this);
1779 src = this->result;
1780 assert(src.file != BAD_FILE);
1781
1782 for (i = 0; i < ir->type->vector_elements; i++) {
1783 switch (i) {
1784 case 0:
1785 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1786 break;
1787 case 1:
1788 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1789 break;
1790 case 2:
1791 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1792 break;
1793 case 3:
1794 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1795 break;
1796 }
1797 }
1798 for (; i < 4; i++) {
1799 /* Replicate the last channel out. */
1800 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1801 }
1802
1803 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1804
1805 this->result = src;
1806 }
1807
1808 void
1809 vec4_visitor::visit(ir_dereference_variable *ir)
1810 {
1811 const struct glsl_type *type = ir->type;
1812 dst_reg *reg = variable_storage(ir->var);
1813
1814 if (!reg) {
1815 fail("Failed to find variable storage for %s\n", ir->var->name);
1816 this->result = src_reg(brw_null_reg());
1817 return;
1818 }
1819
1820 this->result = src_reg(*reg);
1821
1822 /* System values get their swizzle from the dst_reg writemask */
1823 if (ir->var->data.mode == ir_var_system_value)
1824 return;
1825
1826 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1827 this->result.swizzle = swizzle_for_size(type->vector_elements);
1828 }
1829
1830
1831 int
1832 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1833 {
1834 /* Under normal circumstances array elements are stored consecutively, so
1835 * the stride is equal to the size of the array element.
1836 */
1837 return type_size(ir->type);
1838 }
1839
1840
1841 void
1842 vec4_visitor::visit(ir_dereference_array *ir)
1843 {
1844 ir_constant *constant_index;
1845 src_reg src;
1846 int array_stride = compute_array_stride(ir);
1847
1848 constant_index = ir->array_index->constant_expression_value();
1849
1850 ir->array->accept(this);
1851 src = this->result;
1852
1853 if (constant_index) {
1854 src.reg_offset += constant_index->value.i[0] * array_stride;
1855 } else {
1856 /* Variable index array dereference. It eats the "vec4" of the
1857 * base of the array and an index that offsets the Mesa register
1858 * index.
1859 */
1860 ir->array_index->accept(this);
1861
1862 src_reg index_reg;
1863
1864 if (array_stride == 1) {
1865 index_reg = this->result;
1866 } else {
1867 index_reg = src_reg(this, glsl_type::int_type);
1868
1869 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1870 }
1871
1872 if (src.reladdr) {
1873 src_reg temp = src_reg(this, glsl_type::int_type);
1874
1875 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1876
1877 index_reg = temp;
1878 }
1879
1880 src.reladdr = ralloc(mem_ctx, src_reg);
1881 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1882 }
1883
1884 /* If the type is smaller than a vec4, replicate the last channel out. */
1885 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1886 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1887 else
1888 src.swizzle = BRW_SWIZZLE_NOOP;
1889 src.type = brw_type_for_base_type(ir->type);
1890
1891 this->result = src;
1892 }
1893
1894 void
1895 vec4_visitor::visit(ir_dereference_record *ir)
1896 {
1897 unsigned int i;
1898 const glsl_type *struct_type = ir->record->type;
1899 int offset = 0;
1900
1901 ir->record->accept(this);
1902
1903 for (i = 0; i < struct_type->length; i++) {
1904 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1905 break;
1906 offset += type_size(struct_type->fields.structure[i].type);
1907 }
1908
1909 /* If the type is smaller than a vec4, replicate the last channel out. */
1910 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1911 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1912 else
1913 this->result.swizzle = BRW_SWIZZLE_NOOP;
1914 this->result.type = brw_type_for_base_type(ir->type);
1915
1916 this->result.reg_offset += offset;
1917 }
1918
1919 /**
1920 * We want to be careful in assignment setup to hit the actual storage
1921 * instead of potentially using a temporary like we might with the
1922 * ir_dereference handler.
1923 */
1924 static dst_reg
1925 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1926 {
1927 /* The LHS must be a dereference. If the LHS is a variable indexed array
 1928 * access of a vector, it must be separated into a series of conditional moves
1929 * before reaching this point (see ir_vec_index_to_cond_assign).
1930 */
1931 assert(ir->as_dereference());
1932 ir_dereference_array *deref_array = ir->as_dereference_array();
1933 if (deref_array) {
1934 assert(!deref_array->array->type->is_vector());
1935 }
1936
1937 /* Use the rvalue deref handler for the most part. We'll ignore
1938 * swizzles in it and write swizzles using writemask, though.
1939 */
1940 ir->accept(v);
1941 return dst_reg(v->result);
1942 }
1943
1944 void
1945 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1946 const struct glsl_type *type,
1947 enum brw_predicate predicate)
1948 {
1949 if (type->base_type == GLSL_TYPE_STRUCT) {
1950 for (unsigned int i = 0; i < type->length; i++) {
1951 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1952 }
1953 return;
1954 }
1955
1956 if (type->is_array()) {
1957 for (unsigned int i = 0; i < type->length; i++) {
1958 emit_block_move(dst, src, type->fields.array, predicate);
1959 }
1960 return;
1961 }
1962
1963 if (type->is_matrix()) {
1964 const struct glsl_type *vec_type;
1965
1966 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1967 type->vector_elements, 1);
1968
1969 for (int i = 0; i < type->matrix_columns; i++) {
1970 emit_block_move(dst, src, vec_type, predicate);
1971 }
1972 return;
1973 }
1974
1975 assert(type->is_scalar() || type->is_vector());
1976
1977 dst->type = brw_type_for_base_type(type);
1978 src->type = dst->type;
1979
1980 dst->writemask = (1 << type->vector_elements) - 1;
1981
1982 src->swizzle = swizzle_for_size(type->vector_elements);
1983
1984 vec4_instruction *inst = emit(MOV(*dst, *src));
1985 inst->predicate = predicate;
1986
1987 dst->reg_offset++;
1988 src->reg_offset++;
1989 }
1990
1991
1992 /* If the RHS processing resulted in an instruction generating a
1993 * temporary value, and it would be easy to rewrite the instruction to
1994 * generate its result right into the LHS instead, do so. This ends
1995 * up reliably removing instructions where it can be tricky to do so
1996 * later without real UD chain information.
1997 */
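/* A minimal sketch of the rewrite this enables (illustrative register
 * names only): given
 *
 *    ADD tmp.xyzw, a, b
 *    MOV dst.xyzw, tmp
 *
 * we patch the ADD to write dst directly, and the caller then skips
 * emitting the MOV when this function returns true.
 */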
1998 bool
1999 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2000 dst_reg dst,
2001 src_reg src,
2002 vec4_instruction *pre_rhs_inst,
2003 vec4_instruction *last_rhs_inst)
2004 {
2005 /* This could be supported, but it would take more smarts. */
2006 if (ir->condition)
2007 return false;
2008
2009 if (pre_rhs_inst == last_rhs_inst)
2010 return false; /* No instructions generated to work with. */
2011
2012 /* Make sure the last instruction generated our source reg. */
2013 if (src.file != GRF ||
2014 src.file != last_rhs_inst->dst.file ||
2015 src.reg != last_rhs_inst->dst.reg ||
2016 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2017 src.reladdr ||
2018 src.abs ||
2019 src.negate ||
2020 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2021 return false;
2022
2023 /* Check that the last instruction fully initialized the channels
2024 * we want to use, in the order we want to use them. We could
2025 * potentially reswizzle the operands of many instructions so that
2026 * we could handle out-of-order channels, but don't yet.
2027 */
2028
2029 for (unsigned i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2032 return false;
2033
2034 if (BRW_GET_SWZ(src.swizzle, i) != i)
2035 return false;
2036 }
2037 }
2038
2039 /* Success! Rewrite the instruction. */
2040 last_rhs_inst->dst.file = dst.file;
2041 last_rhs_inst->dst.reg = dst.reg;
2042 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2043 last_rhs_inst->dst.reladdr = dst.reladdr;
2044 last_rhs_inst->dst.writemask &= dst.writemask;
2045
2046 return true;
2047 }
2048
2049 void
2050 vec4_visitor::visit(ir_assignment *ir)
2051 {
2052 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2053 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2054
2055 if (!ir->lhs->type->is_scalar() &&
2056 !ir->lhs->type->is_vector()) {
2057 ir->rhs->accept(this);
2058 src_reg src = this->result;
2059
2060 if (ir->condition) {
2061 emit_bool_to_cond_code(ir->condition, &predicate);
2062 }
2063
2064 /* emit_block_move doesn't account for swizzles in the source register.
2065 * This should be ok, since the source register is a structure or an
2066 * array, and those can't be swizzled. But double-check to be sure.
2067 */
2068 assert(src.swizzle ==
2069 (ir->rhs->type->is_matrix()
2070 ? swizzle_for_size(ir->rhs->type->vector_elements)
2071 : BRW_SWIZZLE_NOOP));
2072
2073 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2074 return;
2075 }
2076
2077 /* Now we're down to just a scalar/vector with writemasks. */
2078 int i;
2079
2080 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2081 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2082
2083 ir->rhs->accept(this);
2084
2085 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2086
2087 src_reg src = this->result;
2088
2089 int swizzles[4];
2090 int first_enabled_chan = 0;
2091 int src_chan = 0;
2092
2093 assert(ir->lhs->type->is_vector() ||
2094 ir->lhs->type->is_scalar());
2095 dst.writemask = ir->write_mask;
2096
2097 for (int i = 0; i < 4; i++) {
2098 if (dst.writemask & (1 << i)) {
2099 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2100 break;
2101 }
2102 }
2103
2104 /* Swizzle a small RHS vector into the channels being written.
2105 *
2106 * GLSL IR treats write_mask as dictating how many channels are
2107 * present on the RHS, while in our instructions we need to make
2108 * those channels appear in the slots of the vec4 they're written to.
2109 */
2110 for (int i = 0; i < 4; i++) {
2111 if (dst.writemask & (1 << i))
2112 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2113 else
2114 swizzles[i] = first_enabled_chan;
2115 }
2116 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2117 swizzles[2], swizzles[3]);
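/* A rough worked example (hypothetical shader snippet): for
 * "v.zw = foo" with a vec2 foo, write_mask is ZW and the RHS arrives
 * with swizzle xyyy, so the first loop picks first_enabled_chan = y
 * and the second produces swizzles[] = { y, y, x, y }; the final
 * writemasked MOV then reads foo.x into v.z and foo.y into v.w while
 * the unwritten .xy slots just replicate a valid channel.
 */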
2118
2119 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2120 return;
2121 }
2122
2123 if (ir->condition) {
2124 emit_bool_to_cond_code(ir->condition, &predicate);
2125 }
2126
2127 for (i = 0; i < type_size(ir->lhs->type); i++) {
2128 vec4_instruction *inst = emit(MOV(dst, src));
2129 inst->predicate = predicate;
2130
2131 dst.reg_offset++;
2132 src.reg_offset++;
2133 }
2134 }
2135
2136 void
2137 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2138 {
2139 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2140 foreach_in_list(ir_constant, field_value, &ir->components) {
2141 emit_constant_values(dst, field_value);
2142 }
2143 return;
2144 }
2145
2146 if (ir->type->is_array()) {
2147 for (unsigned int i = 0; i < ir->type->length; i++) {
2148 emit_constant_values(dst, ir->array_elements[i]);
2149 }
2150 return;
2151 }
2152
2153 if (ir->type->is_matrix()) {
2154 for (int i = 0; i < ir->type->matrix_columns; i++) {
2155 float *vec = &ir->value.f[i * ir->type->vector_elements];
2156
2157 for (int j = 0; j < ir->type->vector_elements; j++) {
2158 dst->writemask = 1 << j;
2159 dst->type = BRW_REGISTER_TYPE_F;
2160
2161 emit(MOV(*dst, src_reg(vec[j])));
2162 }
2163 dst->reg_offset++;
2164 }
2165 return;
2166 }
2167
2168 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2169
2170 for (int i = 0; i < ir->type->vector_elements; i++) {
2171 if (!(remaining_writemask & (1 << i)))
2172 continue;
2173
2174 dst->writemask = 1 << i;
2175 dst->type = brw_type_for_base_type(ir->type);
2176
2177 /* Find other components that match the one we're about to
2178 * write. Emits fewer instructions for things like vec4(0.5,
2179 * 1.5, 1.5, 1.5).
2180 */
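/* For that example (purely illustrative): the i = 0 pass emits
 * MOV .x, 0.5f; the i = 1 pass sees that .z and .w hold the same
 * value as .y and widens the writemask, emitting a single
 * MOV .yzw, 1.5f, so two MOVs cover all four channels.
 */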
2181 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2182 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2183 if (ir->value.b[i] == ir->value.b[j])
2184 dst->writemask |= (1 << j);
2185 } else {
2186 /* u, i, and f storage all line up, so no need for a
2187 * switch case for comparing each type.
2188 */
2189 if (ir->value.u[i] == ir->value.u[j])
2190 dst->writemask |= (1 << j);
2191 }
2192 }
2193
2194 switch (ir->type->base_type) {
2195 case GLSL_TYPE_FLOAT:
2196 emit(MOV(*dst, src_reg(ir->value.f[i])));
2197 break;
2198 case GLSL_TYPE_INT:
2199 emit(MOV(*dst, src_reg(ir->value.i[i])));
2200 break;
2201 case GLSL_TYPE_UINT:
2202 emit(MOV(*dst, src_reg(ir->value.u[i])));
2203 break;
2204 case GLSL_TYPE_BOOL:
2205 emit(MOV(*dst, src_reg(ir->value.b[i])));
2206 break;
2207 default:
2208 unreachable("Non-float/uint/int/bool constant");
2209 }
2210
2211 remaining_writemask &= ~dst->writemask;
2212 }
2213 dst->reg_offset++;
2214 }
2215
2216 void
2217 vec4_visitor::visit(ir_constant *ir)
2218 {
2219 dst_reg dst = dst_reg(this, ir->type);
2220 this->result = src_reg(dst);
2221
2222 emit_constant_values(&dst, ir);
2223 }
2224
2225 void
2226 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2227 {
2228 ir_dereference *deref = static_cast<ir_dereference *>(
2229 ir->actual_parameters.get_head());
2230 ir_variable *location = deref->variable_referenced();
2231 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2232 location->data.atomic.buffer_index);
2233
2234 /* Calculate the surface offset */
2235 src_reg offset(this, glsl_type::uint_type);
2236 ir_dereference_array *deref_array = deref->as_dereference_array();
2237 if (deref_array) {
2238 deref_array->array_index->accept(this);
2239
2240 src_reg tmp(this, glsl_type::uint_type);
2241 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2242 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2243 } else {
2244 offset = location->data.atomic.offset;
2245 }
2246
2247 /* Emit the appropriate machine instruction */
2248 const char *callee = ir->callee->function_name();
2249 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2250
2251 if (!strcmp("__intrinsic_atomic_read", callee)) {
2252 emit_untyped_surface_read(surf_index, dst, offset);
2253
2254 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2255 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2256 src_reg(), src_reg());
2257
2258 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2259 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2260 src_reg(), src_reg());
2261 }
2262 }
2263
2264 void
2265 vec4_visitor::visit(ir_call *ir)
2266 {
2267 const char *callee = ir->callee->function_name();
2268
2269 if (!strcmp("__intrinsic_atomic_read", callee) ||
2270 !strcmp("__intrinsic_atomic_increment", callee) ||
2271 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2272 visit_atomic_counter_intrinsic(ir);
2273 } else {
2274 unreachable("Unsupported intrinsic.");
2275 }
2276 }
2277
2278 src_reg
2279 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, uint32_t sampler)
2280 {
2281 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2282 inst->base_mrf = 2;
2283 inst->mlen = 1;
2284 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2285 inst->dst.writemask = WRITEMASK_XYZW;
2286
2287 inst->src[1] = src_reg(sampler);
2288
2289 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2290 int param_base = inst->base_mrf;
2291 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2292 int zero_mask = 0xf & ~coord_mask;
2293
2294 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2295 coordinate));
2296
2297 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2298 src_reg(0)));
2299
2300 emit(inst);
2301 return src_reg(inst->dst);
2302 }
2303
2304 void
2305 vec4_visitor::visit(ir_texture *ir)
2306 {
2307 uint32_t sampler =
2308 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2309
2310 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2311 * emitting anything other than setting up the constant result.
2312 */
2313 if (ir->op == ir_tg4) {
2314 ir_constant *chan = ir->lod_info.component->as_constant();
2315 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2316 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2317 dst_reg result(this, ir->type);
2318 this->result = src_reg(result);
2319 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2320 return;
2321 }
2322 }
2323
2324 /* Should be lowered by do_lower_texture_projection */
2325 assert(!ir->projector);
2326
2327 /* Should be lowered */
2328 assert(!ir->offset || !ir->offset->type->is_array());
2329
2330 /* Generate code to compute all the subexpression trees. This has to be
2331 * done before loading any values into MRFs for the sampler message since
2332 * generating these values may involve SEND messages that need the MRFs.
2333 */
2334 src_reg coordinate;
2335 if (ir->coordinate) {
2336 ir->coordinate->accept(this);
2337 coordinate = this->result;
2338 }
2339
2340 src_reg shadow_comparitor;
2341 if (ir->shadow_comparitor) {
2342 ir->shadow_comparitor->accept(this);
2343 shadow_comparitor = this->result;
2344 }
2345
2346 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2347 src_reg offset_value;
2348 if (has_nonconstant_offset) {
2349 ir->offset->accept(this);
2350 offset_value = src_reg(this->result);
2351 }
2352
2353 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2354 src_reg lod, dPdx, dPdy, sample_index, mcs;
2355 switch (ir->op) {
2356 case ir_tex:
2357 lod = src_reg(0.0f);
2358 lod_type = glsl_type::float_type;
2359 break;
2360 case ir_txf:
2361 case ir_txl:
2362 case ir_txs:
2363 ir->lod_info.lod->accept(this);
2364 lod = this->result;
2365 lod_type = ir->lod_info.lod->type;
2366 break;
2367 case ir_query_levels:
2368 lod = src_reg(0);
2369 lod_type = glsl_type::int_type;
2370 break;
2371 case ir_txf_ms:
2372 ir->lod_info.sample_index->accept(this);
2373 sample_index = this->result;
2374 sample_index_type = ir->lod_info.sample_index->type;
2375
2376 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2377 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2378 else
2379 mcs = src_reg(0u);
2380 break;
2381 case ir_txd:
2382 ir->lod_info.grad.dPdx->accept(this);
2383 dPdx = this->result;
2384
2385 ir->lod_info.grad.dPdy->accept(this);
2386 dPdy = this->result;
2387
2388 lod_type = ir->lod_info.grad.dPdx->type;
2389 break;
2390 case ir_txb:
2391 case ir_lod:
2392 case ir_tg4:
2393 break;
2394 }
2395
2396 enum opcode opcode;
2397 switch (ir->op) {
2398 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2399 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2400 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2401 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2402 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2403 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2404 case ir_tg4: opcode = has_nonconstant_offset
2405 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2406 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2407 case ir_txb:
2408 unreachable("TXB is not valid for vertex shaders.");
2409 case ir_lod:
2410 unreachable("LOD is not valid for vertex shaders.");
2411 default:
2412 unreachable("Unrecognized tex op");
2413 }
2414
2415 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2416
2417 if (ir->offset != NULL && ir->op != ir_txf)
2418 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2419
2420 /* Stuff the channel select bits in the top of the texture offset */
2421 if (ir->op == ir_tg4)
2422 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2423
2424 /* The message header is necessary for:
2425 * - Gen4 (always)
2426 * - Texel offsets
2427 * - Gather channel selection
2428 * - Sampler indices too large to fit in a 4-bit value.
2429 */
2430 inst->header_present =
2431 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2432 sampler >= 16;
2433 inst->base_mrf = 2;
2434 inst->mlen = inst->header_present + 1; /* always at least one */
2435 inst->dst = dst_reg(this, ir->type);
2436 inst->dst.writemask = WRITEMASK_XYZW;
2437 inst->shadow_compare = ir->shadow_comparitor != NULL;
2438
2439 inst->src[1] = src_reg(sampler);
2440
2441 /* MRF for the first parameter */
2442 int param_base = inst->base_mrf + inst->header_present;
2443
2444 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2445 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2446 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2447 } else {
2448 /* Load the coordinate */
2449 /* FINISHME: gl_clamp_mask and saturate */
2450 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2451 int zero_mask = 0xf & ~coord_mask;
2452
2453 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2454 coordinate));
2455
2456 if (zero_mask != 0) {
2457 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2458 src_reg(0)));
2459 }
2460 /* Load the shadow comparitor */
2461 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2462 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2463 WRITEMASK_X),
2464 shadow_comparitor));
2465 inst->mlen++;
2466 }
2467
2468 /* Load the LOD info */
2469 if (ir->op == ir_tex || ir->op == ir_txl) {
2470 int mrf, writemask;
2471 if (brw->gen >= 5) {
2472 mrf = param_base + 1;
2473 if (ir->shadow_comparitor) {
2474 writemask = WRITEMASK_Y;
2475 /* mlen already incremented */
2476 } else {
2477 writemask = WRITEMASK_X;
2478 inst->mlen++;
2479 }
2480 } else /* brw->gen == 4 */ {
2481 mrf = param_base;
2482 writemask = WRITEMASK_W;
2483 }
2484 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2485 } else if (ir->op == ir_txf) {
2486 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2487 } else if (ir->op == ir_txf_ms) {
2488 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2489 sample_index));
2490 if (brw->gen >= 7) {
2491 /* MCS data is in the first channel of `mcs`, but we need to get it into
2492 * the .y channel of the second vec4 of params, so replicate .x across
2493 * the whole vec4 and then mask off everything except .y
2494 */
2495 mcs.swizzle = BRW_SWIZZLE_XXXX;
2496 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2497 mcs));
}
2498 inst->mlen++;
2499 } else if (ir->op == ir_txd) {
2500 const glsl_type *type = lod_type;
2501
2502 if (brw->gen >= 5) {
2503 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2504 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2506 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2507 inst->mlen++;
2508
2509 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2510 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2511 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2512 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2513 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2514 inst->mlen++;
2515
2516 if (ir->shadow_comparitor) {
2517 emit(MOV(dst_reg(MRF, param_base + 2,
2518 ir->shadow_comparitor->type, WRITEMASK_Z),
2519 shadow_comparitor));
2520 }
2521 }
2522 } else /* brw->gen == 4 */ {
2523 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2524 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2525 inst->mlen += 2;
2526 }
2527 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2528 if (ir->shadow_comparitor) {
2529 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2530 shadow_comparitor));
2531 }
2532
2533 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2534 offset_value));
2535 inst->mlen++;
2536 }
2537 }
2538
2539 emit(inst);
2540
2541 /* Fix up the number of layers (Z component) for cube arrays: the
2542 * hardware returns faces * layers; the spec requires layers.
2543 */
2544 if (ir->op == ir_txs) {
2545 glsl_type const *type = ir->sampler->type;
2546 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2547 type->sampler_array) {
2548 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2549 writemask(inst->dst, WRITEMASK_Z),
2550 src_reg(inst->dst), src_reg(6));
2551 }
2552 }
2553
2554 if (brw->gen == 6 && ir->op == ir_tg4) {
2555 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2556 }
2557
2558 swizzle_result(ir, src_reg(inst->dst), sampler);
2559 }
2560
2561 /**
2562 * Apply workarounds for Gen6 gather with UINT/SINT
2563 */
2564 void
2565 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2566 {
2567 if (!wa)
2568 return;
2569
2570 int width = (wa & WA_8BIT) ? 8 : 16;
2571 dst_reg dst_f = dst;
2572 dst_f.type = BRW_REGISTER_TYPE_F;
2573
2574 /* Convert from UNORM to UINT */
2575 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2576 emit(MOV(dst, src_reg(dst_f)));
2577
2578 if (wa & WA_SIGN) {
2579 /* Reinterpret the UINT value as a signed INT value by
2580 * shifting the sign bit into place, then shifting back
2581 * preserving sign.
2582 */
2583 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2584 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
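/* Rough illustration, assuming the 8-bit case (wa & WA_8BIT): a texel
 * whose stored value is 0xff comes back from the sampler as UNORM 1.0;
 * the MUL/MOV above turn that into the integer 255, and SHL/ASR by
 * 32 - 8 = 24 sign-extend it to 0xffffffff, i.e. the SINT value -1.
 */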
2585 }
2586 }
2587
2588 /**
2589 * Set up the gather channel based on the swizzle, for gather4.
2590 */
2591 uint32_t
2592 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2593 {
2594 ir_constant *chan = ir->lod_info.component->as_constant();
2595 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2596 switch (swiz) {
2597 case SWIZZLE_X: return 0;
2598 case SWIZZLE_Y:
2599 /* gather4 sampler is broken for green channel on RG32F --
2600 * we must ask for blue instead.
2601 */
2602 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2603 return 2;
2604 return 1;
2605 case SWIZZLE_Z: return 2;
2606 case SWIZZLE_W: return 3;
2607 default:
2608 unreachable("Not reached"); /* zero, one swizzles handled already */
2609 }
2610 }
2611
2612 void
2613 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2614 {
2615 int s = key->tex.swizzles[sampler];
2616
2617 this->result = src_reg(this, ir->type);
2618 dst_reg swizzled_result(this->result);
2619
2620 if (ir->op == ir_query_levels) {
2621 /* # levels is in .w */
2622 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2623 emit(MOV(swizzled_result, orig_val));
2624 return;
2625 }
2626
2627 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2628 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2629 emit(MOV(swizzled_result, orig_val));
2630 return;
2631 }
2632
2633
2634 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2635 int swizzle[4] = {0};
2636
2637 for (int i = 0; i < 4; i++) {
2638 switch (GET_SWZ(s, i)) {
2639 case SWIZZLE_ZERO:
2640 zero_mask |= (1 << i);
2641 break;
2642 case SWIZZLE_ONE:
2643 one_mask |= (1 << i);
2644 break;
2645 default:
2646 copy_mask |= (1 << i);
2647 swizzle[i] = GET_SWZ(s, i);
2648 break;
2649 }
2650 }
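/* Rough illustration with a hypothetical swizzle of (R, B, ZERO, ONE):
 * the loop above yields copy_mask = .xy with swizzle[] = { x, z, x, x },
 * zero_mask = .z and one_mask = .w, so the three writemasked MOVs below
 * assemble the final result.
 */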
2651
2652 if (copy_mask) {
2653 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2654 swizzled_result.writemask = copy_mask;
2655 emit(MOV(swizzled_result, orig_val));
2656 }
2657
2658 if (zero_mask) {
2659 swizzled_result.writemask = zero_mask;
2660 emit(MOV(swizzled_result, src_reg(0.0f)));
2661 }
2662
2663 if (one_mask) {
2664 swizzled_result.writemask = one_mask;
2665 emit(MOV(swizzled_result, src_reg(1.0f)));
2666 }
2667 }
2668
2669 void
2670 vec4_visitor::visit(ir_return *)
2671 {
2672 unreachable("not reached");
2673 }
2674
2675 void
2676 vec4_visitor::visit(ir_discard *)
2677 {
2678 unreachable("not reached");
2679 }
2680
2681 void
2682 vec4_visitor::visit(ir_if *ir)
2683 {
2684 /* Don't point the annotation at the if statement, because then it plus
2685 * the then and else blocks get printed.
2686 */
2687 this->base_ir = ir->condition;
2688
2689 if (brw->gen == 6) {
2690 emit_if_gen6(ir);
2691 } else {
2692 enum brw_predicate predicate;
2693 emit_bool_to_cond_code(ir->condition, &predicate);
2694 emit(IF(predicate));
2695 }
2696
2697 visit_instructions(&ir->then_instructions);
2698
2699 if (!ir->else_instructions.is_empty()) {
2700 this->base_ir = ir->condition;
2701 emit(BRW_OPCODE_ELSE);
2702
2703 visit_instructions(&ir->else_instructions);
2704 }
2705
2706 this->base_ir = ir->condition;
2707 emit(BRW_OPCODE_ENDIF);
2708 }
2709
2710 void
2711 vec4_visitor::visit(ir_emit_vertex *)
2712 {
2713 unreachable("not reached");
2714 }
2715
2716 void
2717 vec4_visitor::visit(ir_end_primitive *)
2718 {
2719 unreachable("not reached");
2720 }
2721
2722 void
2723 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2724 dst_reg dst, src_reg offset,
2725 src_reg src0, src_reg src1)
2726 {
2727 unsigned mlen = 0;
2728
2729 /* Set the atomic operation offset. */
2730 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2731 mlen++;
2732
2733 /* Set the atomic operation arguments. */
2734 if (src0.file != BAD_FILE) {
2735 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2736 mlen++;
2737 }
2738
2739 if (src1.file != BAD_FILE) {
2740 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2741 mlen++;
2742 }
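/* Illustration of the resulting payload (based on the callers above):
 * an atomic increment passes BAD_FILE for both sources, so only the
 * offset lands in the message and mlen stays 1; a hypothetical
 * two-source operation such as compare/exchange would end up with
 * mlen = 3.
 */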
2743
2744 /* Emit the instruction. Note that this maps to the normal SIMD8
2745 * untyped atomic message on Ivy Bridge, but that's OK because
2746 * unused channels will be masked out.
2747 */
2748 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2749 src_reg(atomic_op), src_reg(surf_index));
2750 inst->base_mrf = 0;
2751 inst->mlen = mlen;
2752 }
2753
2754 void
2755 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2756 src_reg offset)
2757 {
2758 /* Set the surface read offset. */
2759 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2760
2761 /* Emit the instruction. Note that this maps to the normal SIMD8
2762 * untyped surface read message, but that's OK because unused
2763 * channels will be masked out.
2764 */
2765 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2766 dst, src_reg(surf_index));
2767 inst->base_mrf = 0;
2768 inst->mlen = 1;
2769 }
2770
2771 void
2772 vec4_visitor::emit_ndc_computation()
2773 {
2774 /* Get the position */
2775 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2776
2777 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2778 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2779 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2780
2781 current_annotation = "NDC";
2782 dst_reg ndc_w = ndc;
2783 ndc_w.writemask = WRITEMASK_W;
2784 src_reg pos_w = pos;
2785 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2786 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2787
2788 dst_reg ndc_xyz = ndc;
2789 ndc_xyz.writemask = WRITEMASK_XYZ;
2790
2791 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2792 }
2793
2794 void
2795 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2796 {
2797 if (brw->gen < 6 &&
2798 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2799 key->userclip_active || brw->has_negative_rhw_bug)) {
2800 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2801 dst_reg header1_w = header1;
2802 header1_w.writemask = WRITEMASK_W;
2803
2804 emit(MOV(header1, 0u));
2805
2806 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2807 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2808
2809 current_annotation = "Point size";
2810 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2811 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
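/* Rough sanity check of the packing above (hypothetical size): a point
 * size of 2.0f scaled by 1 << 11 gives 4096 = 0x1000, and masking with
 * 0x7ff << 8 leaves 0x1000, so the header field appears to hold the
 * size in 1/8-pixel units (16 here), limited to the 11 bits of the
 * field.
 */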
2812 }
2813
2814 if (key->userclip_active) {
2815 current_annotation = "Clipping flags";
2816 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2817 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2818
2819 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2820 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2821 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2822
2823 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2824 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2825 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2826 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2827 }
2828
2829 /* i965 clipping workaround:
2830 * 1) Test for -ve rhw
2831 * 2) If set,
2832 * set ndc = (0,0,0,0)
2833 * set ucp[6] = 1
2834 *
2835 * Later, clipping will detect ucp[6] and ensure the primitive is
2836 * clipped against all fixed planes.
2837 */
2838 if (brw->has_negative_rhw_bug) {
2839 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2840 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2841 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2842 vec4_instruction *inst;
2843 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2844 inst->predicate = BRW_PREDICATE_NORMAL;
2845 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2846 inst->predicate = BRW_PREDICATE_NORMAL;
2847 }
2848
2849 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2850 } else if (brw->gen < 6) {
2851 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2852 } else {
2853 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2854 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2855 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2856 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2857 }
2858 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2859 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2860 src_reg(output_reg[VARYING_SLOT_LAYER])));
2861 }
2862 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2863 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2864 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2865 }
2866 }
2867 }
2868
2869 void
2870 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2871 {
2872 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2873 *
2874 * "If a linked set of shaders forming the vertex stage contains no
2875 * static write to gl_ClipVertex or gl_ClipDistance, but the
2876 * application has requested clipping against user clip planes through
2877 * the API, then the coordinate written to gl_Position is used for
2878 * comparison against the user clip planes."
2879 *
2880 * This function is only called if the shader didn't write to
2881 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2882 * if the user wrote to it; otherwise we use gl_Position.
2883 */
2884 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2885 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2886 clip_vertex = VARYING_SLOT_POS;
2887 }
2888
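/* Illustration (hypothetical plane count): with 6 user clip planes
 * enabled, the offset = 4 call writes only .x and .y of CLIP_DIST1,
 * each a DP4 of the clip vertex against planes 4 and 5.
 */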
2889 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2890 ++i) {
2891 reg.writemask = 1 << i;
2892 emit(DP4(reg,
2893 src_reg(output_reg[clip_vertex]),
2894 src_reg(this->userplane[i + offset])));
2895 }
2896 }
2897
2898 void
2899 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2900 {
2901 assert (varying < VARYING_SLOT_MAX);
2902 reg.type = output_reg[varying].type;
2903 current_annotation = output_reg_annotation[varying];
2904 /* Copy the register, saturating if necessary */
2905 vec4_instruction *inst = emit(MOV(reg,
2906 src_reg(output_reg[varying])));
2907 if ((varying == VARYING_SLOT_COL0 ||
2908 varying == VARYING_SLOT_COL1 ||
2909 varying == VARYING_SLOT_BFC0 ||
2910 varying == VARYING_SLOT_BFC1) &&
2911 key->clamp_vertex_color) {
2912 inst->saturate = true;
2913 }
2914 }
2915
2916 void
2917 vec4_visitor::emit_urb_slot(int mrf, int varying)
2918 {
2919 struct brw_reg hw_reg = brw_message_reg(mrf);
2920 dst_reg reg = dst_reg(MRF, mrf);
2921 reg.type = BRW_REGISTER_TYPE_F;
2922
2923 switch (varying) {
2924 case VARYING_SLOT_PSIZ:
2925 /* PSIZ is always in slot 0, and is coupled with other flags. */
2926 current_annotation = "indices, point width, clip flags";
2927 emit_psiz_and_flags(hw_reg);
2928 break;
2929 case BRW_VARYING_SLOT_NDC:
2930 current_annotation = "NDC";
2931 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2932 break;
2933 case VARYING_SLOT_POS:
2934 current_annotation = "gl_Position";
2935 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2936 break;
2937 case VARYING_SLOT_EDGE:
2938 /* This is present when doing unfilled polygons. We're supposed to copy
2939 * the edge flag from the user-provided vertex array
2940 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2941 * of that attribute (starts as 1.0f). This is then used in clipping to
2942 * determine which edges should be drawn as wireframe.
2943 */
2944 current_annotation = "edge flag";
2945 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2946 glsl_type::float_type, WRITEMASK_XYZW))));
2947 break;
2948 case BRW_VARYING_SLOT_PAD:
2949 /* No need to write to this slot */
2950 break;
2951 default:
2952 emit_generic_urb_slot(reg, varying);
2953 break;
2954 }
2955 }
2956
2957 static int
2958 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2959 {
2960 if (brw->gen >= 6) {
2961 /* URB data written (does not include the message header reg) must
2962 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2963 * section 5.4.3.2.2: URB_INTERLEAVED.
2964 *
2965 * URB entries are allocated on a multiple of 1024 bits, so an
2966 * extra 128 bits written here to make the end align to 256 is
2967 * no problem.
2968 */
2969 if ((mlen % 2) != 1)
2970 mlen++;
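/* Illustration with hypothetical counts: mlen = 4 would mean 1 header
 * + 3 data registers, and 3 is not a multiple of 2, so we bump it to
 * 5 (4 data registers); mlen = 5 is already aligned and is left
 * alone.
 */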
2971 }
2972
2973 return mlen;
2974 }
2975
2976
2977 /**
2978 * Generates the VUE payload plus the necessary URB write instructions to
2979 * output it.
2980 *
2981 * The VUE layout is documented in Volume 2a.
2982 */
2983 void
2984 vec4_visitor::emit_vertex()
2985 {
2986 /* MRF 0 is reserved for the debugger, so start with message header
2987 * in MRF 1.
2988 */
2989 int base_mrf = 1;
2990 int mrf = base_mrf;
2991 /* In the process of generating our URB write message contents, we
2992 * may need to unspill a register or load from an array. Those
2993 * reads would use MRFs 14-15.
2994 */
2995 int max_usable_mrf = 13;
2996
2997 /* The following assertion verifies that max_usable_mrf causes an
2998 * even-numbered amount of URB write data, which will meet gen6's
2999 * requirements for length alignment.
3000 */
3001 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3002
3003 /* First mrf is the g0-based message header containing URB handles and
3004 * such.
3005 */
3006 emit_urb_write_header(mrf++);
3007
3008 if (brw->gen < 6) {
3009 emit_ndc_computation();
3010 }
3011
3012 /* Lower legacy ff and ClipVertex clipping to clip distances */
3013 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3014 current_annotation = "user clip distances";
3015
3016 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3017 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3018
3019 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3020 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3021 }
3022
3023 /* We may need to split this up into several URB writes, so do them in a
3024 * loop.
3025 */
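/* A rough sketch of the loop below (hypothetical slot count): with
 * base_mrf = 1 and max_usable_mrf = 13, each URB write carries the
 * header plus up to 12 slot MRFs, so a VUE map with, say, 18 slots
 * takes two writes, the second one starting at URB row offset
 * 12 / 2 = 6.
 */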
3026 int slot = 0;
3027 bool complete = false;
3028 do {
3029 /* URB offset is in URB row increments, and each of our MRFs is half of
3030 * one of those, since we're doing interleaved writes.
3031 */
3032 int offset = slot / 2;
3033
3034 mrf = base_mrf + 1;
3035 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3036 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3037
3038 /* If this was max_usable_mrf, we can't fit anything more into this
3039 * URB WRITE.
3040 */
3041 if (mrf > max_usable_mrf) {
3042 slot++;
3043 break;
3044 }
3045 }
3046
3047 complete = slot >= prog_data->vue_map.num_slots;
3048 current_annotation = "URB write";
3049 vec4_instruction *inst = emit_urb_write_opcode(complete);
3050 inst->base_mrf = base_mrf;
3051 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3052 inst->offset += offset;
3053 } while(!complete);
3054 }
3055
3056
3057 src_reg
3058 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3059 src_reg *reladdr, int reg_offset)
3060 {
3061 /* Because we store the values to scratch interleaved like our
3062 * vertex data, we need to scale the vec4 index by 2.
3063 */
3064 int message_header_scale = 2;
3065
3066 /* Pre-gen6, the message header uses byte offsets instead of vec4
3067 * (16-byte) offset units.
3068 */
3069 if (brw->gen < 6)
3070 message_header_scale *= 16;
3071
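/* Illustration (hypothetical offset): for reg_offset = 3 the constant
 * case below returns 6 on gen6+ (two interleaved vec4 rows per logical
 * register) and 96 on gen4/5, where the header wants a byte offset.
 */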
3072 if (reladdr) {
3073 src_reg index = src_reg(this, glsl_type::int_type);
3074
3075 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3076 emit_before(inst, MUL(dst_reg(index),
3077 index, src_reg(message_header_scale)));
3078
3079 return index;
3080 } else {
3081 return src_reg(reg_offset * message_header_scale);
3082 }
3083 }
3084
3085 src_reg
3086 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3087 src_reg *reladdr, int reg_offset)
3088 {
3089 if (reladdr) {
3090 src_reg index = src_reg(this, glsl_type::int_type);
3091
3092 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3093
3094 /* Pre-gen6, the message header uses byte offsets instead of vec4
3095 * (16-byte) offset units.
3096 */
3097 if (brw->gen < 6) {
3098 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3099 }
3100
3101 return index;
3102 } else if (brw->gen >= 8) {
3103 /* Store the offset in a GRF so we can send-from-GRF. */
3104 src_reg offset = src_reg(this, glsl_type::int_type);
3105 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3106 return offset;
3107 } else {
3108 int message_header_scale = brw->gen < 6 ? 16 : 1;
3109 return src_reg(reg_offset * message_header_scale);
3110 }
3111 }
3112
3113 /**
3114 * Emits an instruction before @inst to load the value named by @orig_src
3115 * from scratch space at @base_offset to @temp.
3116 *
3117 * @base_offset is measured in 32-byte units (the size of a register).
3118 */
3119 void
3120 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3121 dst_reg temp, src_reg orig_src,
3122 int base_offset)
3123 {
3124 int reg_offset = base_offset + orig_src.reg_offset;
3125 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3126
3127 emit_before(inst, SCRATCH_READ(temp, index));
3128 }
3129
3130 /**
3131 * Emits an instruction after @inst to store the value to be written
3132 * to @orig_dst to scratch space at @base_offset, from @temp.
3133 *
3134 * @base_offset is measured in 32-byte units (the size of a register).
3135 */
3136 void
3137 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3138 {
3139 int reg_offset = base_offset + inst->dst.reg_offset;
3140 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3141
3142 /* Create a temporary register to store *inst's result in.
3143 *
3144 * We have to be careful in MOVing from our temporary result register in
3145 * the scratch write. If we swizzle from channels of the temporary that
3146 * weren't initialized, it will confuse live interval analysis, which will
3147 * make spilling fail to make progress.
3148 */
3149 src_reg temp = src_reg(this, glsl_type::vec4_type);
3150 temp.type = inst->dst.type;
3151 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3152 int swizzles[4];
3153 for (int i = 0; i < 4; i++)
3154 if (inst->dst.writemask & (1 << i))
3155 swizzles[i] = i;
3156 else
3157 swizzles[i] = first_writemask_chan;
3158 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3159 swizzles[2], swizzles[3]);
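/* Illustration (hypothetical writemask): if inst writes only .xz, the
 * loop above produces swizzles[] = { x, x, z, x }, so the scratch MOV
 * never reads the uninitialized .y / .w channels of temp.
 */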
3160
3161 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3162 inst->dst.writemask));
3163 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3164 write->predicate = inst->predicate;
3165 write->ir = inst->ir;
3166 write->annotation = inst->annotation;
3167 inst->insert_after(write);
3168
3169 inst->dst.file = temp.file;
3170 inst->dst.reg = temp.reg;
3171 inst->dst.reg_offset = temp.reg_offset;
3172 inst->dst.reladdr = NULL;
3173 }
3174
3175 /**
3176 * We can't generally support array access in GRF space, because a
3177 * single instruction's destination can only span 2 contiguous
3178 * registers. So, we send all GRF arrays that get variable index
3179 * access to scratch space.
3180 */
3181 void
3182 vec4_visitor::move_grf_array_access_to_scratch()
3183 {
3184 int scratch_loc[this->virtual_grf_count];
3185
3186 for (int i = 0; i < this->virtual_grf_count; i++) {
3187 scratch_loc[i] = -1;
3188 }
3189
3190 /* First, calculate the set of virtual GRFs that need to be punted
3191 * to scratch due to having any array access on them, and where in
3192 * scratch.
3193 */
3194 foreach_in_list(vec4_instruction, inst, &instructions) {
3195 if (inst->dst.file == GRF && inst->dst.reladdr &&
3196 scratch_loc[inst->dst.reg] == -1) {
3197 scratch_loc[inst->dst.reg] = c->last_scratch;
3198 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3199 }
3200
3201 for (int i = 0 ; i < 3; i++) {
3202 src_reg *src = &inst->src[i];
3203
3204 if (src->file == GRF && src->reladdr &&
3205 scratch_loc[src->reg] == -1) {
3206 scratch_loc[src->reg] = c->last_scratch;
3207 c->last_scratch += this->virtual_grf_sizes[src->reg];
3208 }
3209 }
3210 }
3211
3212 /* Now, for anything that will be accessed through scratch, rewrite
3213 * it to load/store. Note that this is a _safe list walk, because
3214 * we may generate a new scratch_write instruction after the one
3215 * we're processing.
3216 */
3217 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3218 /* Set up the annotation tracking for new generated instructions. */
3219 base_ir = inst->ir;
3220 current_annotation = inst->annotation;
3221
3222 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3223 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3224 }
3225
3226 for (int i = 0 ; i < 3; i++) {
3227 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3228 continue;
3229
3230 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3231
3232 emit_scratch_read(inst, temp, inst->src[i],
3233 scratch_loc[inst->src[i].reg]);
3234
3235 inst->src[i].file = temp.file;
3236 inst->src[i].reg = temp.reg;
3237 inst->src[i].reg_offset = temp.reg_offset;
3238 inst->src[i].reladdr = NULL;
3239 }
3240 }
3241 }
3242
3243 /**
3244 * Emits an instruction before @inst to load the value named by @orig_src
3245 * from the pull constant buffer (surface) at @base_offset to @temp.
3246 */
3247 void
3248 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3249 dst_reg temp, src_reg orig_src,
3250 int base_offset)
3251 {
3252 int reg_offset = base_offset + orig_src.reg_offset;
3253 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3254 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3255 vec4_instruction *load;
3256
3257 if (brw->gen >= 7) {
3258 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3259 grf_offset.type = offset.type;
3260 emit_before(inst, MOV(grf_offset, offset));
3261
3262 load = new(mem_ctx) vec4_instruction(this,
3263 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3264 temp, index, src_reg(grf_offset));
3265 } else {
3266 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3267 temp, index, offset);
3268 load->base_mrf = 14;
3269 load->mlen = 1;
3270 }
3271 emit_before(inst, load);
3272 }
3273
3274 /**
3275 * Implements array access of uniforms by inserting a
3276 * PULL_CONSTANT_LOAD instruction.
3277 *
3278 * Unlike temporary GRF array access (where we don't support it due to
3279 * the difficulty of doing relative addressing on instruction
3280 * destinations), we could potentially do array access of uniforms
3281 * that were loaded in GRF space as push constants. In real-world
3282 * usage we've seen, though, the arrays being used are always larger
3283 * than we could load as push constants, so just always move all
3284 * uniform array access out to a pull constant buffer.
3285 */
3286 void
3287 vec4_visitor::move_uniform_array_access_to_pull_constants()
3288 {
3289 int pull_constant_loc[this->uniforms];
3290
3291 for (int i = 0; i < this->uniforms; i++) {
3292 pull_constant_loc[i] = -1;
3293 }
3294
3295 /* Walk through and find array access of uniforms. Put a copy of that
3296 * uniform in the pull constant buffer.
3297 *
3298 * Note that we don't move constant-indexed accesses to arrays. No
3299 * testing has been done of the performance impact of this choice.
3300 */
3301 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3302 for (int i = 0 ; i < 3; i++) {
3303 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3304 continue;
3305
3306 int uniform = inst->src[i].reg;
3307
3308 /* If this array isn't already present in the pull constant buffer,
3309 * add it.
3310 */
3311 if (pull_constant_loc[uniform] == -1) {
3312 const float **values = &stage_prog_data->param[uniform * 4];
3313
3314 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3315
3316 assert(uniform < uniform_array_size);
3317 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3318 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3319 = values[j];
3320 }
3321 }
3322
3323 /* Set up the annotation tracking for new generated instructions. */
3324 base_ir = inst->ir;
3325 current_annotation = inst->annotation;
3326
3327 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3328
3329 emit_pull_constant_load(inst, temp, inst->src[i],
3330 pull_constant_loc[uniform]);
3331
3332 inst->src[i].file = temp.file;
3333 inst->src[i].reg = temp.reg;
3334 inst->src[i].reg_offset = temp.reg_offset;
3335 inst->src[i].reladdr = NULL;
3336 }
3337 }
3338
3339 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3340 * no need to track them as larger-than-vec4 objects. This will be
3341 * relied on in cutting out unused uniform vectors from push
3342 * constants.
3343 */
3344 split_uniform_registers();
3345 }
3346
3347 void
3348 vec4_visitor::resolve_ud_negate(src_reg *reg)
3349 {
3350 if (reg->type != BRW_REGISTER_TYPE_UD ||
3351 !reg->negate)
3352 return;
3353
3354 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3355 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3356 *reg = temp;
3357 }
3358
3359 vec4_visitor::vec4_visitor(struct brw_context *brw,
3360 struct brw_vec4_compile *c,
3361 struct gl_program *prog,
3362 const struct brw_vec4_prog_key *key,
3363 struct brw_vec4_prog_data *prog_data,
3364 struct gl_shader_program *shader_prog,
3365 gl_shader_stage stage,
3366 void *mem_ctx,
3367 bool debug_flag,
3368 bool no_spills,
3369 shader_time_shader_type st_base,
3370 shader_time_shader_type st_written,
3371 shader_time_shader_type st_reset)
3372 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3373 c(c),
3374 key(key),
3375 prog_data(prog_data),
3376 sanity_param_count(0),
3377 fail_msg(NULL),
3378 first_non_payload_grf(0),
3379 need_all_constants_in_pull_buffer(false),
3380 debug_flag(debug_flag),
3381 no_spills(no_spills),
3382 st_base(st_base),
3383 st_written(st_written),
3384 st_reset(st_reset)
3385 {
3386 this->mem_ctx = mem_ctx;
3387 this->failed = false;
3388
3389 this->base_ir = NULL;
3390 this->current_annotation = NULL;
3391 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3392
3393 this->variable_ht = hash_table_ctor(0,
3394 hash_table_pointer_hash,
3395 hash_table_pointer_compare);
3396
3397 this->virtual_grf_start = NULL;
3398 this->virtual_grf_end = NULL;
3399 this->virtual_grf_sizes = NULL;
3400 this->virtual_grf_count = 0;
3401 this->virtual_grf_reg_map = NULL;
3402 this->virtual_grf_reg_count = 0;
3403 this->virtual_grf_array_size = 0;
3404 this->live_intervals_valid = false;
3405
3406 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3407
3408 this->uniforms = 0;
3409
3410 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3411 * at least one. See setup_uniforms() in brw_vec4.cpp.
3412 */
3413 this->uniform_array_size = 1;
3414 if (prog_data) {
3415 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3416 }
3417
3418 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3419 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3420 }
3421
3422 vec4_visitor::~vec4_visitor()
3423 {
3424 hash_table_dtor(this->variable_ht);
3425 }
3426
3427
3428 void
3429 vec4_visitor::fail(const char *format, ...)
3430 {
3431 va_list va;
3432 char *msg;
3433
3434 if (failed)
3435 return;
3436
3437 failed = true;
3438
3439 va_start(va, format);
3440 msg = ralloc_vasprintf(mem_ctx, format, va);
3441 va_end(va);
3442 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3443
3444 this->fail_msg = msg;
3445
3446 if (debug_flag) {
3447 fprintf(stderr, "%s", msg);
3448 }
3449 }
3450
3451 } /* namespace brw */