src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "glsl/ir_uniform.h"
  26 extern "C" {
  27 #include "program/sampler.h"
  28 }
  29
  30 namespace brw {
  31
  32 vec4_instruction::vec4_instruction(vec4_visitor *v,
  33                                    enum opcode opcode, dst_reg dst,
  34                                    src_reg src0, src_reg src1, src_reg src2)
  35 {
  36    this->opcode = opcode;
  37    this->dst = dst;
  38    this->src[0] = src0;
  39    this->src[1] = src1;
  40    this->src[2] = src2;
  41    this->saturate = false;
  42    this->force_writemask_all = false;
  43    this->no_dd_clear = false;
  44    this->no_dd_check = false;
  45    this->writes_accumulator = false;
  46    this->conditional_mod = BRW_CONDITIONAL_NONE;
  47    this->sampler = 0;
  48    this->texture_offset = 0;
  49    this->target = 0;
  50    this->shadow_compare = false;
  51    this->ir = v->base_ir;
  52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
  53    this->header_present = false;
  54    this->mlen = 0;
  55    this->base_mrf = 0;
  56    this->offset = 0;
  57    this->annotation = v->current_annotation;
  58 }
  59
  60 vec4_instruction *
  61 vec4_visitor::emit(vec4_instruction *inst)
  62 {
  63    this->instructions.push_tail(inst);
  64
  65    return inst;
  66 }
  67
  68 vec4_instruction *
  69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  70 {
  71    new_inst->ir = inst->ir;
  72    new_inst->annotation = inst->annotation;
  73
  74    inst->insert_before(new_inst);
  75
  76    return inst;
  77 }
  78
  79 vec4_instruction *
  80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  81                    src_reg src0, src_reg src1, src_reg src2)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  84                                              src0, src1, src2));
  85 }
  86
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  90 {
  91    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  92 }
  93
  94 vec4_instruction *
  95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  96 {
  97    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  98 }
  99
 100 vec4_instruction *
 101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
 102 {
 103    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
 104 }
 105
 106 vec4_instruction *
 107 vec4_visitor::emit(enum opcode opcode)
 108 {
 109    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
 110 }
 111
 112 #define ALU1(op)                                                        \
 113    vec4_instruction *                                                   \
 114    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
 115    {                                                                    \
 116       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 117                                            src0);                       \
 118    }
 119
 120 #define ALU2(op)                                                        \
 121    vec4_instruction *                                                   \
 122    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 123    {                                                                    \
 124       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 125                                            src0, src1);                 \
 126    }
 127
 128 #define ALU2_ACC(op)                                                    \
 129    vec4_instruction *                                                   \
 130    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 131    {                                                                    \
 132       vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,     \
 133                        BRW_OPCODE_##op, dst, src0, src1);               \
 134       inst->writes_accumulator = true;                                 \
 135       return inst;                                                     \
 136    }
 137
 138 #define ALU3(op)                                                        \
 139    vec4_instruction *                                                   \
 140    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 141    {                                                                    \
 142       assert(brw->gen >= 6);                                            \
 143       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 144                                            src0, src1, src2);           \
 145    }
 146
 147 ALU1(NOT)
 148 ALU1(MOV)
 149 ALU1(FRC)
 150 ALU1(RNDD)
 151 ALU1(RNDE)
 152 ALU1(RNDZ)
 153 ALU1(F32TO16)
 154 ALU1(F16TO32)
 155 ALU2(ADD)
 156 ALU2(MUL)
 157 ALU2_ACC(MACH)
 158 ALU2(AND)
 159 ALU2(OR)
 160 ALU2(XOR)
 161 ALU2(DP3)
 162 ALU2(DP4)
 163 ALU2(DPH)
 164 ALU2(SHL)
 165 ALU2(SHR)
 166 ALU2(ASR)
 167 ALU3(LRP)
 168 ALU1(BFREV)
 169 ALU3(BFE)
 170 ALU2(BFI1)
 171 ALU3(BFI2)
 172 ALU1(FBH)
 173 ALU1(FBL)
 174 ALU1(CBIT)
 175 ALU3(MAD)
 176 ALU2_ACC(ADDC)
 177 ALU2_ACC(SUBB)
 178 ALU2(MAC)
 179
 180 /** Gen4 predicated IF. */
 181 vec4_instruction *
 182 vec4_visitor::IF(uint32_t predicate)
 183 {
 184    vec4_instruction *inst;
 185
 186    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 187    inst->predicate = predicate;
 188
 189    return inst;
 190 }
 191
 192 /** Gen6 IF with embedded comparison. */
 193 vec4_instruction *
 194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 195 {
 196    assert(brw->gen == 6);
 197
 198    vec4_instruction *inst;
 199
 200    resolve_ud_negate(&src0);
 201    resolve_ud_negate(&src1);
 202
 203    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 204                                         src0, src1);
 205    inst->conditional_mod = condition;
 206
 207    return inst;
 208 }
 209
 210 /**
 211  * CMP: Sets the low bit of the destination channels with the result
 212  * of the comparison, while the upper bits are undefined, and updates
 213  * the flag register with the packed 16 bits of the result.
 214  */
 215 vec4_instruction *
 216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 217 {
 218    vec4_instruction *inst;
 219
 220    /* original gen4 does type conversion to the destination type
 221     * before before comparison, producing garbage results for floating
 222     * point comparisons.
 223     */
 224    if (brw->gen == 4) {
 225       dst.type = src0.type;
 226       if (dst.file == HW_REG)
 227          dst.fixed_hw_reg.type = dst.type;
 228    }
 229
 230    resolve_ud_negate(&src0);
 231    resolve_ud_negate(&src1);
 232
 233    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 234    inst->conditional_mod = condition;
 235
 236    return inst;
 237 }
 238
 239 vec4_instruction *
 240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 241 {
 242    vec4_instruction *inst;
 243
 244    inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
 245                                         dst, index);
 246    inst->base_mrf = 14;
 247    inst->mlen = 2;
 248
 249    return inst;
 250 }
 251
 252 vec4_instruction *
 253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 254 {
 255    vec4_instruction *inst;
 256
 257    inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 258                                         dst, src, index);
 259    inst->base_mrf = 13;
 260    inst->mlen = 3;
 261
 262    return inst;
 263 }
 264
 265 void
 266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 267 {
 268    static enum opcode dot_opcodes[] = {
 269       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 270    };
 271
 272    emit(dot_opcodes[elements - 2], dst, src0, src1);
 273 }
 274
 275 src_reg
 276 vec4_visitor::fix_3src_operand(src_reg src)
 277 {
 278    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 279     * able to use vertical stride of zero to replicate the vec4 uniform, like
 280     *
 281     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 282     *
 283     * But you can't, since vertical stride is always four in three-source
 284     * instructions. Instead, insert a MOV instruction to do the replication so
 285     * that the three-source instruction can consume it.
 286     */
 287
 288    /* The MOV is only needed if the source is a uniform or immediate. */
 289    if (src.file != UNIFORM && src.file != IMM)
 290       return src;
 291
 292    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
 293       return src;
 294
 295    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 296    expanded.type = src.type;
 297    emit(MOV(expanded, src));
 298    return src_reg(expanded);
 299 }
 300
 301 src_reg
 302 vec4_visitor::fix_math_operand(src_reg src)
 303 {
 304    /* The gen6 math instruction ignores the source modifiers --
 305     * swizzle, abs, negate, and at least some parts of the register
 306     * region description.
 307     *
 308     * Rather than trying to enumerate all these cases, *always* expand the
 309     * operand to a temp GRF for gen6.
 310     *
 311     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 312     * can't use.
 313     */
 314
 315    if (brw->gen == 7 && src.file != IMM)
 316       return src;
 317
 318    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 319    expanded.type = src.type;
 320    emit(MOV(expanded, src));
 321    return src_reg(expanded);
 322 }
 323
 324 void
 325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 326 {
 327    src = fix_math_operand(src);
 328
 329    if (dst.writemask != WRITEMASK_XYZW) {
 330       /* The gen6 math instruction must be align1, so we can't do
 331        * writemasks.
 332        */
 333       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 334
 335       emit(opcode, temp_dst, src);
 336
 337       emit(MOV(dst, src_reg(temp_dst)));
 338    } else {
 339       emit(opcode, dst, src);
 340    }
 341 }
 342
 343 void
 344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 345 {
 346    vec4_instruction *inst = emit(opcode, dst, src);
 347    inst->base_mrf = 1;
 348    inst->mlen = 1;
 349 }
 350
 351 void
 352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 353 {
 354    switch (opcode) {
 355    case SHADER_OPCODE_RCP:
 356    case SHADER_OPCODE_RSQ:
 357    case SHADER_OPCODE_SQRT:
 358    case SHADER_OPCODE_EXP2:
 359    case SHADER_OPCODE_LOG2:
 360    case SHADER_OPCODE_SIN:
 361    case SHADER_OPCODE_COS:
 362       break;
 363    default:
 364       assert(!"not reached: bad math opcode");
 365       return;
 366    }
 367
 368    if (brw->gen >= 6) {
 369       return emit_math1_gen6(opcode, dst, src);
 370    } else {
 371       return emit_math1_gen4(opcode, dst, src);
 372    }
 373 }
 374
 375 void
 376 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 377                               dst_reg dst, src_reg src0, src_reg src1)
 378 {
 379    src0 = fix_math_operand(src0);
 380    src1 = fix_math_operand(src1);
 381
 382    if (dst.writemask != WRITEMASK_XYZW) {
 383       /* The gen6 math instruction must be align1, so we can't do
 384        * writemasks.
 385        */
 386       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 387       temp_dst.type = dst.type;
 388
 389       emit(opcode, temp_dst, src0, src1);
 390
 391       emit(MOV(dst, src_reg(temp_dst)));
 392    } else {
 393       emit(opcode, dst, src0, src1);
 394    }
 395 }
 396
 397 void
 398 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 399                               dst_reg dst, src_reg src0, src_reg src1)
 400 {
 401    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 402    inst->base_mrf = 1;
 403    inst->mlen = 2;
 404 }
 405
 406 void
 407 vec4_visitor::emit_math(enum opcode opcode,
 408                         dst_reg dst, src_reg src0, src_reg src1)
 409 {
 410    switch (opcode) {
 411    case SHADER_OPCODE_POW:
 412    case SHADER_OPCODE_INT_QUOTIENT:
 413    case SHADER_OPCODE_INT_REMAINDER:
 414       break;
 415    default:
 416       assert(!"not reached: unsupported binary math opcode");
 417       return;
 418    }
 419
 420    if (brw->gen >= 6) {
 421       return emit_math2_gen6(opcode, dst, src0, src1);
 422    } else {
 423       return emit_math2_gen4(opcode, dst, src0, src1);
 424    }
 425 }
 426
 427 void
 428 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 429 {
 430    if (brw->gen < 7)
 431       assert(!"ir_unop_pack_half_2x16 should be lowered");
 432
 433    assert(dst.type == BRW_REGISTER_TYPE_UD);
 434    assert(src0.type == BRW_REGISTER_TYPE_F);
 435
 436    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 437     *
 438     *   Because this instruction does not have a 16-bit floating-point type,
 439     *   the destination data type must be Word (W).
 440     *
 441     *   The destination must be DWord-aligned and specify a horizontal stride
 442     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 443     *   each destination channel and the upper word is not modified.
 444     *
 445     * The above restriction implies that the f32to16 instruction must use
 446     * align1 mode, because only in align1 mode is it possible to specify
 447     * horizontal stride.  We choose here to defy the hardware docs and emit
 448     * align16 instructions.
 449     *
 450     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 451     * instructions. I was partially successful in that the code passed all
 452     * tests.  However, the code was dubiously correct and fragile, and the
 453     * tests were not harsh enough to probe that frailty. Not trusting the
 454     * code, I chose instead to remain in align16 mode in defiance of the hw
 455     * docs).
 456     *
 457     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 458     * simulator, emitting a f32to16 in align16 mode with UD as destination
 459     * data type is safe. The behavior differs from that specified in the PRM
 460     * in that the upper word of each destination channel is cleared to 0.
 461     */
 462
 463    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 464    src_reg tmp_src(tmp_dst);
 465
 466 #if 0
 467    /* Verify the undocumented behavior on which the following instructions
 468     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 469     * then the result of the bit-or instruction below will be incorrect.
 470     *
 471     * You should inspect the disasm output in order to verify that the MOV is
 472     * not optimized away.
 473     */
 474    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 475 #endif
 476
 477    /* Give tmp the form below, where "." means untouched.
 478     *
 479     *     w z          y          x w z          y          x
 480     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 481     *
 482     * That the upper word of each write-channel be 0 is required for the
 483     * following bit-shift and bit-or instructions to work. Note that this
 484     * relies on the undocumented hardware behavior mentioned above.
 485     */
 486    tmp_dst.writemask = WRITEMASK_XY;
 487    emit(F32TO16(tmp_dst, src0));
 488
 489    /* Give the write-channels of dst the form:
 490     *   0xhhhh0000
 491     */
 492    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
 493    emit(SHL(dst, tmp_src, src_reg(16u)));
 494
 495    /* Finally, give the write-channels of dst the form of packHalf2x16's
 496     * output:
 497     *   0xhhhhllll
 498     */
 499    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
 500    emit(OR(dst, src_reg(dst), tmp_src));
 501 }
 502
 503 void
 504 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 505 {
 506    if (brw->gen < 7)
 507       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 508
 509    assert(dst.type == BRW_REGISTER_TYPE_F);
 510    assert(src0.type == BRW_REGISTER_TYPE_UD);
 511
 512    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 513     *
 514     *   Because this instruction does not have a 16-bit floating-point type,
 515     *   the source data type must be Word (W). The destination type must be
 516     *   F (Float).
 517     *
 518     * To use W as the source data type, we must adjust horizontal strides,
 519     * which is only possible in align1 mode. All my [chadv] attempts at
 520     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 521     * Piglit tests, so I gave up.
 522     *
 523     * I've verified that, on gen7 hardware and the simulator, it is safe to
 524     * emit f16to32 in align16 mode with UD as source data type.
 525     */
 526
 527    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 528    src_reg tmp_src(tmp_dst);
 529
 530    tmp_dst.writemask = WRITEMASK_X;
 531    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 532
 533    tmp_dst.writemask = WRITEMASK_Y;
 534    emit(SHR(tmp_dst, src0, src_reg(16u)));
 535
 536    dst.writemask = WRITEMASK_XY;
 537    emit(F16TO32(dst, tmp_src));
 538 }
 539
 540 void
 541 vec4_visitor::visit_instructions(const exec_list *list)
 542 {
 543    foreach_list(node, list) {
 544       ir_instruction *ir = (ir_instruction *)node;
 545
 546       base_ir = ir;
 547       ir->accept(this);
 548    }
 549 }
 550
 551
 552 static int
 553 type_size(const struct glsl_type *type)
 554 {
 555    unsigned int i;
 556    int size;
 557
 558    switch (type->base_type) {
 559    case GLSL_TYPE_UINT:
 560    case GLSL_TYPE_INT:
 561    case GLSL_TYPE_FLOAT:
 562    case GLSL_TYPE_BOOL:
 563       if (type->is_matrix()) {
 564          return type->matrix_columns;
 565       } else {
 566          /* Regardless of size of vector, it gets a vec4. This is bad
 567           * packing for things like floats, but otherwise arrays become a
 568           * mess.  Hopefully a later pass over the code can pack scalars
 569           * down if appropriate.
 570           */
 571          return 1;
 572       }
 573    case GLSL_TYPE_ARRAY:
 574       assert(type->length > 0);
 575       return type_size(type->fields.array) * type->length;
 576    case GLSL_TYPE_STRUCT:
 577       size = 0;
 578       for (i = 0; i < type->length; i++) {
 579          size += type_size(type->fields.structure[i].type);
 580       }
 581       return size;
 582    case GLSL_TYPE_SAMPLER:
 583       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 584        * at link time.
 585        */
 586       return 1;
 587    case GLSL_TYPE_ATOMIC_UINT:
 588       return 0;
 589    case GLSL_TYPE_IMAGE:
 590    case GLSL_TYPE_VOID:
 591    case GLSL_TYPE_ERROR:
 592    case GLSL_TYPE_INTERFACE:
 593       assert(0);
 594       break;
 595    }
 596
 597    return 0;
 598 }
 599
 600 int
 601 vec4_visitor::virtual_grf_alloc(int size)
 602 {
 603    if (virtual_grf_array_size <= virtual_grf_count) {
 604       if (virtual_grf_array_size == 0)
 605          virtual_grf_array_size = 16;
 606       else
 607          virtual_grf_array_size *= 2;
 608       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 609                                    virtual_grf_array_size);
 610       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 611                                      virtual_grf_array_size);
 612    }
 613    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 614    virtual_grf_reg_count += size;
 615    virtual_grf_sizes[virtual_grf_count] = size;
 616    return virtual_grf_count++;
 617 }
 618
 619 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 620 {
 621    init();
 622
 623    this->file = GRF;
 624    this->reg = v->virtual_grf_alloc(type_size(type));
 625
 626    if (type->is_array() || type->is_record()) {
 627       this->swizzle = BRW_SWIZZLE_NOOP;
 628    } else {
 629       this->swizzle = swizzle_for_size(type->vector_elements);
 630    }
 631
 632    this->type = brw_type_for_base_type(type);
 633 }
 634
 635 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 636 {
 637    init();
 638
 639    this->file = GRF;
 640    this->reg = v->virtual_grf_alloc(type_size(type));
 641
 642    if (type->is_array() || type->is_record()) {
 643       this->writemask = WRITEMASK_XYZW;
 644    } else {
 645       this->writemask = (1 << type->vector_elements) - 1;
 646    }
 647
 648    this->type = brw_type_for_base_type(type);
 649 }
 650
 651 /* Our support for uniforms is piggy-backed on the struct
 652  * gl_fragment_program, because that's where the values actually
 653  * get stored, rather than in some global gl_shader_program uniform
 654  * store.
 655  */
 656 void
 657 vec4_visitor::setup_uniform_values(ir_variable *ir)
 658 {
 659    int namelen = strlen(ir->name);
 660
 661    /* The data for our (non-builtin) uniforms is stored in a series of
 662     * gl_uniform_driver_storage structs for each subcomponent that
 663     * glGetUniformLocation() could name.  We know it's been set up in the same
 664     * order we'd walk the type, so walk the list of storage and find anything
 665     * with our name, or the prefix of a component that starts with our name.
 666     */
 667    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 668       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 669
 670       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 671           (storage->name[namelen] != 0 &&
 672            storage->name[namelen] != '.' &&
 673            storage->name[namelen] != '[')) {
 674          continue;
 675       }
 676
 677       gl_constant_value *components = storage->storage;
 678       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 679                                storage->type->matrix_columns);
 680
 681       for (unsigned s = 0; s < vector_count; s++) {
 682          assert(uniforms < uniform_array_size);
 683          uniform_vector_size[uniforms] = storage->type->vector_elements;
 684
 685          int i;
 686          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 687             stage_prog_data->param[uniforms * 4 + i] = &components->f;
 688             components++;
 689          }
 690          for (; i < 4; i++) {
 691             static float zero = 0;
 692             stage_prog_data->param[uniforms * 4 + i] = &zero;
 693          }
 694
 695          uniforms++;
 696       }
 697    }
 698 }
 699
 700 void
 701 vec4_visitor::setup_uniform_clipplane_values()
 702 {
 703    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 704
 705    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 706       assert(this->uniforms < uniform_array_size);
 707       this->uniform_vector_size[this->uniforms] = 4;
 708       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 709       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 710       for (int j = 0; j < 4; ++j) {
 711          stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 712       }
 713       ++this->uniforms;
 714    }
 715 }
 716
 717 /* Our support for builtin uniforms is even scarier than non-builtin.
 718  * It sits on top of the PROG_STATE_VAR parameters that are
 719  * automatically updated from GL context state.
 720  */
 721 void
 722 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 723 {
 724    const ir_state_slot *const slots = ir->state_slots;
 725    assert(ir->state_slots != NULL);
 726
 727    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 728       /* This state reference has already been setup by ir_to_mesa,
 729        * but we'll get the same index back here.  We can reference
 730        * ParameterValues directly, since unlike brw_fs.cpp, we never
 731        * add new state references during compile.
 732        */
 733       int index = _mesa_add_state_reference(this->prog->Parameters,
 734                                             (gl_state_index *)slots[i].tokens);
 735       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 736
 737       assert(this->uniforms < uniform_array_size);
 738       this->uniform_vector_size[this->uniforms] = 0;
 739       /* Add each of the unique swizzled channels of the element.
 740        * This will end up matching the size of the glsl_type of this field.
 741        */
 742       int last_swiz = -1;
 743       for (unsigned int j = 0; j < 4; j++) {
 744          int swiz = GET_SWZ(slots[i].swizzle, j);
 745          last_swiz = swiz;
 746
 747          stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 748          assert(this->uniforms < uniform_array_size);
 749          if (swiz <= last_swiz)
 750             this->uniform_vector_size[this->uniforms]++;
 751       }
 752       this->uniforms++;
 753    }
 754 }
 755
 756 dst_reg *
 757 vec4_visitor::variable_storage(ir_variable *var)
 758 {
 759    return (dst_reg *)hash_table_find(this->variable_ht, var);
 760 }
 761
 762 void
 763 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 764 {
 765    ir_expression *expr = ir->as_expression();
 766
 767    *predicate = BRW_PREDICATE_NORMAL;
 768
 769    if (expr) {
 770       src_reg op[2];
 771       vec4_instruction *inst;
 772
 773       assert(expr->get_num_operands() <= 2);
 774       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 775          expr->operands[i]->accept(this);
 776          op[i] = this->result;
 777
 778          resolve_ud_negate(&op[i]);
 779       }
 780
 781       switch (expr->operation) {
 782       case ir_unop_logic_not:
 783          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 784          inst->conditional_mod = BRW_CONDITIONAL_Z;
 785          break;
 786
 787       case ir_binop_logic_xor:
 788          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 789          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 790          break;
 791
 792       case ir_binop_logic_or:
 793          inst = emit(OR(dst_null_d(), op[0], op[1]));
 794          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 795          break;
 796
 797       case ir_binop_logic_and:
 798          inst = emit(AND(dst_null_d(), op[0], op[1]));
 799          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 800          break;
 801
 802       case ir_unop_f2b:
 803          if (brw->gen >= 6) {
 804             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 805          } else {
 806             inst = emit(MOV(dst_null_f(), op[0]));
 807             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 808          }
 809          break;
 810
 811       case ir_unop_i2b:
 812          if (brw->gen >= 6) {
 813             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 814          } else {
 815             inst = emit(MOV(dst_null_d(), op[0]));
 816             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 817          }
 818          break;
 819
 820       case ir_binop_all_equal:
 821          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 822          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 823          break;
 824
 825       case ir_binop_any_nequal:
 826          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 827          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 828          break;
 829
 830       case ir_unop_any:
 831          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 832          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 833          break;
 834
 835       case ir_binop_greater:
 836       case ir_binop_gequal:
 837       case ir_binop_less:
 838       case ir_binop_lequal:
 839       case ir_binop_equal:
 840       case ir_binop_nequal:
 841          emit(CMP(dst_null_d(), op[0], op[1],
 842                   brw_conditional_for_comparison(expr->operation)));
 843          break;
 844
 845       default:
 846          assert(!"not reached");
 847          break;
 848       }
 849       return;
 850    }
 851
 852    ir->accept(this);
 853
 854    resolve_ud_negate(&this->result);
 855
 856    if (brw->gen >= 6) {
 857       vec4_instruction *inst = emit(AND(dst_null_d(),
 858                                         this->result, src_reg(1)));
 859       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 860    } else {
 861       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 862       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 863    }
 864 }
 865
 866 /**
 867  * Emit a gen6 IF statement with the comparison folded into the IF
 868  * instruction.
 869  */
 870 void
 871 vec4_visitor::emit_if_gen6(ir_if *ir)
 872 {
 873    ir_expression *expr = ir->condition->as_expression();
 874
 875    if (expr) {
 876       src_reg op[2];
 877       dst_reg temp;
 878
 879       assert(expr->get_num_operands() <= 2);
 880       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 881          expr->operands[i]->accept(this);
 882          op[i] = this->result;
 883       }
 884
 885       switch (expr->operation) {
 886       case ir_unop_logic_not:
 887          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 888          return;
 889
 890       case ir_binop_logic_xor:
 891          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 892          return;
 893
 894       case ir_binop_logic_or:
 895          temp = dst_reg(this, glsl_type::bool_type);
 896          emit(OR(temp, op[0], op[1]));
 897          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 898          return;
 899
 900       case ir_binop_logic_and:
 901          temp = dst_reg(this, glsl_type::bool_type);
 902          emit(AND(temp, op[0], op[1]));
 903          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 904          return;
 905
 906       case ir_unop_f2b:
 907          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 908          return;
 909
 910       case ir_unop_i2b:
 911          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 912          return;
 913
 914       case ir_binop_greater:
 915       case ir_binop_gequal:
 916       case ir_binop_less:
 917       case ir_binop_lequal:
 918       case ir_binop_equal:
 919       case ir_binop_nequal:
 920          emit(IF(op[0], op[1],
 921                  brw_conditional_for_comparison(expr->operation)));
 922          return;
 923
 924       case ir_binop_all_equal:
 925          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 926          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 927          return;
 928
 929       case ir_binop_any_nequal:
 930          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 931          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 932          return;
 933
 934       case ir_unop_any:
 935          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 936          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 937          return;
 938
 939       default:
 940          assert(!"not reached");
 941          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 942          return;
 943       }
 944       return;
 945    }
 946
 947    ir->condition->accept(this);
 948
 949    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 950 }
 951
 952 void
 953 vec4_visitor::visit(ir_variable *ir)
 954 {
 955    dst_reg *reg = NULL;
 956
 957    if (variable_storage(ir))
 958       return;
 959
 960    switch (ir->data.mode) {
 961    case ir_var_shader_in:
 962       reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
 963       break;
 964
 965    case ir_var_shader_out:
 966       reg = new(mem_ctx) dst_reg(this, ir->type);
 967
 968       for (int i = 0; i < type_size(ir->type); i++) {
 969          output_reg[ir->data.location + i] = *reg;
 970          output_reg[ir->data.location + i].reg_offset = i;
 971          output_reg[ir->data.location + i].type =
 972             brw_type_for_base_type(ir->type->get_scalar_type());
 973          output_reg_annotation[ir->data.location + i] = ir->name;
 974       }
 975       break;
 976
 977    case ir_var_auto:
 978    case ir_var_temporary:
 979       reg = new(mem_ctx) dst_reg(this, ir->type);
 980       break;
 981
 982    case ir_var_uniform:
 983       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 984
 985       /* Thanks to the lower_ubo_reference pass, we will see only
 986        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 987        * variables, so no need for them to be in variable_ht.
 988        *
 989        * Atomic counters take no uniform storage, no need to do
 990        * anything here.
 991        */
 992       if (ir->is_in_uniform_block() || ir->type->contains_atomic())
 993          return;
 994
 995       /* Track how big the whole uniform variable is, in case we need to put a
 996        * copy of its data into pull constants for array access.
 997        */
 998       assert(this->uniforms < uniform_array_size);
 999       this->uniform_size[this->uniforms] = type_size(ir->type);
1000
1001       if (!strncmp(ir->name, "gl_", 3)) {
1002          setup_builtin_uniform_values(ir);
1003       } else {
1004          setup_uniform_values(ir);
1005       }
1006       break;
1007
1008    case ir_var_system_value:
1009       reg = make_reg_for_system_value(ir);
1010       break;
1011
1012    default:
1013       assert(!"not reached");
1014    }
1015
1016    reg->type = brw_type_for_base_type(ir->type);
1017    hash_table_insert(this->variable_ht, reg, ir);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop *ir)
1022 {
1023    /* We don't want debugging output to print the whole body of the
1024     * loop as the annotation.
1025     */
1026    this->base_ir = NULL;
1027
1028    emit(BRW_OPCODE_DO);
1029
1030    visit_instructions(&ir->body_instructions);
1031
1032    emit(BRW_OPCODE_WHILE);
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_loop_jump *ir)
1037 {
1038    switch (ir->mode) {
1039    case ir_loop_jump::jump_break:
1040       emit(BRW_OPCODE_BREAK);
1041       break;
1042    case ir_loop_jump::jump_continue:
1043       emit(BRW_OPCODE_CONTINUE);
1044       break;
1045    }
1046 }
1047
1048
1049 void
1050 vec4_visitor::visit(ir_function_signature *ir)
1051 {
1052    assert(0);
1053    (void)ir;
1054 }
1055
1056 void
1057 vec4_visitor::visit(ir_function *ir)
1058 {
1059    /* Ignore function bodies other than main() -- we shouldn't see calls to
1060     * them since they should all be inlined.
1061     */
1062    if (strcmp(ir->name, "main") == 0) {
1063       const ir_function_signature *sig;
1064       exec_list empty;
1065
1066       sig = ir->matching_signature(NULL, &empty);
1067
1068       assert(sig);
1069
1070       visit_instructions(&sig->body);
1071    }
1072 }
1073
1074 bool
1075 vec4_visitor::try_emit_sat(ir_expression *ir)
1076 {
1077    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1078    if (!sat_src)
1079       return false;
1080
1081    sat_src->accept(this);
1082    src_reg src = this->result;
1083
1084    this->result = src_reg(this, ir->type);
1085    vec4_instruction *inst;
1086    inst = emit(MOV(dst_reg(this->result), src));
1087    inst->saturate = true;
1088
1089    return true;
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir)
1094 {
1095    /* 3-src instructions were introduced in gen6. */
1096    if (brw->gen < 6)
1097       return false;
1098
1099    /* MAD can only handle floating-point data. */
1100    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101       return false;
1102
1103    ir_rvalue *nonmul = ir->operands[1];
1104    ir_expression *mul = ir->operands[0]->as_expression();
1105
1106    if (!mul || mul->operation != ir_binop_mul) {
1107       nonmul = ir->operands[0];
1108       mul = ir->operands[1]->as_expression();
1109
1110       if (!mul || mul->operation != ir_binop_mul)
1111          return false;
1112    }
1113
1114    nonmul->accept(this);
1115    src_reg src0 = fix_3src_operand(this->result);
1116
1117    mul->operands[0]->accept(this);
1118    src_reg src1 = fix_3src_operand(this->result);
1119
1120    mul->operands[1]->accept(this);
1121    src_reg src2 = fix_3src_operand(this->result);
1122
1123    this->result = src_reg(this, ir->type);
1124    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1125
1126    return true;
1127 }
1128
1129 void
1130 vec4_visitor::emit_bool_comparison(unsigned int op,
1131                                  dst_reg dst, src_reg src0, src_reg src1)
1132 {
1133    /* original gen4 does destination conversion before comparison. */
1134    if (brw->gen < 5)
1135       dst.type = src0.type;
1136
1137    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1138
1139    dst.type = BRW_REGISTER_TYPE_D;
1140    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1141 }
1142
1143 void
1144 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1145                           src_reg src0, src_reg src1)
1146 {
1147    vec4_instruction *inst;
1148
1149    if (brw->gen >= 6) {
1150       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1151       inst->conditional_mod = conditionalmod;
1152    } else {
1153       emit(CMP(dst, src0, src1, conditionalmod));
1154
1155       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1156       inst->predicate = BRW_PREDICATE_NORMAL;
1157    }
1158 }
1159
1160 void
1161 vec4_visitor::emit_lrp(const dst_reg &dst,
1162                        const src_reg &x, const src_reg &y, const src_reg &a)
1163 {
1164    if (brw->gen >= 6) {
1165       /* Note that the instruction's argument order is reversed from GLSL
1166        * and the IR.
1167        */
1168       emit(LRP(dst,
1169                fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1170    } else {
1171       /* Earlier generations don't support three source operations, so we
1172        * need to emit x*(1-a) + y*a.
1173        */
1174       dst_reg one_minus_a   = dst_reg(this, glsl_type::vec4_type);
1175       one_minus_a.writemask = dst.writemask;
1176
1177       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1178       vec4_instruction *mul = emit(MUL(dst_null_f(), y, a));
1179       mul->writes_accumulator = true;
1180       emit(MAC(dst, x, src_reg(one_minus_a)));
1181    }
1182 }
1183
1184 void
1185 vec4_visitor::visit(ir_expression *ir)
1186 {
1187    unsigned int operand;
1188    src_reg op[Elements(ir->operands)];
1189    src_reg result_src;
1190    dst_reg result_dst;
1191    vec4_instruction *inst;
1192
1193    if (try_emit_sat(ir))
1194       return;
1195
1196    if (ir->operation == ir_binop_add) {
1197       if (try_emit_mad(ir))
1198          return;
1199    }
1200
1201    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1202       this->result.file = BAD_FILE;
1203       ir->operands[operand]->accept(this);
1204       if (this->result.file == BAD_FILE) {
1205          fprintf(stderr, "Failed to get tree for expression operand:\n");
1206          ir->operands[operand]->fprint(stderr);
1207          exit(1);
1208       }
1209       op[operand] = this->result;
1210
1211       /* Matrix expression operands should have been broken down to vector
1212        * operations already.
1213        */
1214       assert(!ir->operands[operand]->type->is_matrix());
1215    }
1216
1217    int vector_elements = ir->operands[0]->type->vector_elements;
1218    if (ir->operands[1]) {
1219       vector_elements = MAX2(vector_elements,
1220                              ir->operands[1]->type->vector_elements);
1221    }
1222
1223    this->result.file = BAD_FILE;
1224
1225    /* Storage for our result.  Ideally for an assignment we'd be using
1226     * the actual storage for the result here, instead.
1227     */
1228    result_src = src_reg(this, ir->type);
1229    /* convenience for the emit functions below. */
1230    result_dst = dst_reg(result_src);
1231    /* If nothing special happens, this is the result. */
1232    this->result = result_src;
1233    /* Limit writes to the channels that will be used by result_src later.
1234     * This does limit this temp's use as a temporary for multi-instruction
1235     * sequences.
1236     */
1237    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1238
1239    switch (ir->operation) {
1240    case ir_unop_logic_not:
1241       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1242        * ones complement of the whole register, not just bit 0.
1243        */
1244       emit(XOR(result_dst, op[0], src_reg(1)));
1245       break;
1246    case ir_unop_neg:
1247       op[0].negate = !op[0].negate;
1248       emit(MOV(result_dst, op[0]));
1249       break;
1250    case ir_unop_abs:
1251       op[0].abs = true;
1252       op[0].negate = false;
1253       emit(MOV(result_dst, op[0]));
1254       break;
1255
1256    case ir_unop_sign:
1257       if (ir->type->is_float()) {
1258          /* AND(val, 0x80000000) gives the sign bit.
1259           *
1260           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1261           * zero.
1262           */
1263          emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1264
1265          op[0].type = BRW_REGISTER_TYPE_UD;
1266          result_dst.type = BRW_REGISTER_TYPE_UD;
1267          emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1268
1269          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1270          inst->predicate = BRW_PREDICATE_NORMAL;
1271
1272          this->result.type = BRW_REGISTER_TYPE_F;
1273       } else {
1274          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1275           *               -> non-negative val generates 0x00000000.
1276           *  Predicated OR sets 1 if val is positive.
1277           */
1278          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1279
1280          emit(ASR(result_dst, op[0], src_reg(31)));
1281
1282          inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1283          inst->predicate = BRW_PREDICATE_NORMAL;
1284       }
1285       break;
1286
1287    case ir_unop_rcp:
1288       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1289       break;
1290
1291    case ir_unop_exp2:
1292       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1293       break;
1294    case ir_unop_log2:
1295       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1296       break;
1297    case ir_unop_exp:
1298    case ir_unop_log:
1299       assert(!"not reached: should be handled by ir_explog_to_explog2");
1300       break;
1301    case ir_unop_sin:
1302    case ir_unop_sin_reduced:
1303       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1304       break;
1305    case ir_unop_cos:
1306    case ir_unop_cos_reduced:
1307       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1308       break;
1309
1310    case ir_unop_dFdx:
1311    case ir_unop_dFdy:
1312       assert(!"derivatives not valid in vertex shader");
1313       break;
1314
1315    case ir_unop_bitfield_reverse:
1316       emit(BFREV(result_dst, op[0]));
1317       break;
1318    case ir_unop_bit_count:
1319       emit(CBIT(result_dst, op[0]));
1320       break;
1321    case ir_unop_find_msb: {
1322       src_reg temp = src_reg(this, glsl_type::uint_type);
1323
1324       inst = emit(FBH(dst_reg(temp), op[0]));
1325       inst->dst.writemask = WRITEMASK_XYZW;
1326
1327       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1328        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1329        * subtract the result from 31 to convert the MSB count into an LSB count.
1330        */
1331
1332       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1333       temp.swizzle = BRW_SWIZZLE_NOOP;
1334       emit(MOV(result_dst, temp));
1335
1336       src_reg src_tmp = src_reg(result_dst);
1337       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1338
1339       src_tmp.negate = true;
1340       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1341       inst->predicate = BRW_PREDICATE_NORMAL;
1342       break;
1343    }
1344    case ir_unop_find_lsb:
1345       emit(FBL(result_dst, op[0]));
1346       break;
1347
1348    case ir_unop_noise:
1349       assert(!"not reached: should be handled by lower_noise");
1350       break;
1351
1352    case ir_binop_add:
1353       emit(ADD(result_dst, op[0], op[1]));
1354       break;
1355    case ir_binop_sub:
1356       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1357       break;
1358
1359    case ir_binop_mul:
1360       if (brw->gen < 8 && ir->type->is_integer()) {
1361          /* For integer multiplication, the MUL uses the low 16 bits of one of
1362           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1363           * accumulates in the contribution of the upper 16 bits of that
1364           * operand.  If we can determine that one of the args is in the low
1365           * 16 bits, though, we can just emit a single MUL.
1366           */
1367          if (ir->operands[0]->is_uint16_constant()) {
1368             if (brw->gen < 7)
1369                emit(MUL(result_dst, op[0], op[1]));
1370             else
1371                emit(MUL(result_dst, op[1], op[0]));
1372          } else if (ir->operands[1]->is_uint16_constant()) {
1373             if (brw->gen < 7)
1374                emit(MUL(result_dst, op[1], op[0]));
1375             else
1376                emit(MUL(result_dst, op[0], op[1]));
1377          } else {
1378             struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1379
1380             emit(MUL(acc, op[0], op[1]));
1381             emit(MACH(dst_null_d(), op[0], op[1]));
1382             emit(MOV(result_dst, src_reg(acc)));
1383          }
1384       } else {
1385          emit(MUL(result_dst, op[0], op[1]));
1386       }
1387       break;
1388    case ir_binop_imul_high: {
1389       struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1390
1391       emit(MUL(acc, op[0], op[1]));
1392       emit(MACH(result_dst, op[0], op[1]));
1393       break;
1394    }
1395    case ir_binop_div:
1396       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1397       assert(ir->type->is_integer());
1398       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1399       break;
1400    case ir_binop_carry: {
1401       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1402
1403       emit(ADDC(dst_null_ud(), op[0], op[1]));
1404       emit(MOV(result_dst, src_reg(acc)));
1405       break;
1406    }
1407    case ir_binop_borrow: {
1408       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1409
1410       emit(SUBB(dst_null_ud(), op[0], op[1]));
1411       emit(MOV(result_dst, src_reg(acc)));
1412       break;
1413    }
1414    case ir_binop_mod:
1415       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1416       assert(ir->type->is_integer());
1417       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1418       break;
1419
1420    case ir_binop_less:
1421    case ir_binop_greater:
1422    case ir_binop_lequal:
1423    case ir_binop_gequal:
1424    case ir_binop_equal:
1425    case ir_binop_nequal: {
1426       emit(CMP(result_dst, op[0], op[1],
1427                brw_conditional_for_comparison(ir->operation)));
1428       emit(AND(result_dst, result_src, src_reg(0x1)));
1429       break;
1430    }
1431
1432    case ir_binop_all_equal:
1433       /* "==" operator producing a scalar boolean. */
1434       if (ir->operands[0]->type->is_vector() ||
1435           ir->operands[1]->type->is_vector()) {
1436          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1437          emit(MOV(result_dst, src_reg(0)));
1438          inst = emit(MOV(result_dst, src_reg(1)));
1439          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1440       } else {
1441          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1442          emit(AND(result_dst, result_src, src_reg(0x1)));
1443       }
1444       break;
1445    case ir_binop_any_nequal:
1446       /* "!=" operator producing a scalar boolean. */
1447       if (ir->operands[0]->type->is_vector() ||
1448           ir->operands[1]->type->is_vector()) {
1449          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1450
1451          emit(MOV(result_dst, src_reg(0)));
1452          inst = emit(MOV(result_dst, src_reg(1)));
1453          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1454       } else {
1455          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1456          emit(AND(result_dst, result_src, src_reg(0x1)));
1457       }
1458       break;
1459
1460    case ir_unop_any:
1461       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1462       emit(MOV(result_dst, src_reg(0)));
1463
1464       inst = emit(MOV(result_dst, src_reg(1)));
1465       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1466       break;
1467
1468    case ir_binop_logic_xor:
1469       emit(XOR(result_dst, op[0], op[1]));
1470       break;
1471
1472    case ir_binop_logic_or:
1473       emit(OR(result_dst, op[0], op[1]));
1474       break;
1475
1476    case ir_binop_logic_and:
1477       emit(AND(result_dst, op[0], op[1]));
1478       break;
1479
1480    case ir_binop_dot:
1481       assert(ir->operands[0]->type->is_vector());
1482       assert(ir->operands[0]->type == ir->operands[1]->type);
1483       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1484       break;
1485
1486    case ir_unop_sqrt:
1487       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1488       break;
1489    case ir_unop_rsq:
1490       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1491       break;
1492
1493    case ir_unop_bitcast_i2f:
1494    case ir_unop_bitcast_u2f:
1495       this->result = op[0];
1496       this->result.type = BRW_REGISTER_TYPE_F;
1497       break;
1498
1499    case ir_unop_bitcast_f2i:
1500       this->result = op[0];
1501       this->result.type = BRW_REGISTER_TYPE_D;
1502       break;
1503
1504    case ir_unop_bitcast_f2u:
1505       this->result = op[0];
1506       this->result.type = BRW_REGISTER_TYPE_UD;
1507       break;
1508
1509    case ir_unop_i2f:
1510    case ir_unop_i2u:
1511    case ir_unop_u2i:
1512    case ir_unop_u2f:
1513    case ir_unop_b2f:
1514    case ir_unop_b2i:
1515    case ir_unop_f2i:
1516    case ir_unop_f2u:
1517       emit(MOV(result_dst, op[0]));
1518       break;
1519    case ir_unop_f2b:
1520    case ir_unop_i2b: {
1521       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1522       emit(AND(result_dst, result_src, src_reg(1)));
1523       break;
1524    }
1525
1526    case ir_unop_trunc:
1527       emit(RNDZ(result_dst, op[0]));
1528       break;
1529    case ir_unop_ceil:
1530       op[0].negate = !op[0].negate;
1531       inst = emit(RNDD(result_dst, op[0]));
1532       this->result.negate = true;
1533       break;
1534    case ir_unop_floor:
1535       inst = emit(RNDD(result_dst, op[0]));
1536       break;
1537    case ir_unop_fract:
1538       inst = emit(FRC(result_dst, op[0]));
1539       break;
1540    case ir_unop_round_even:
1541       emit(RNDE(result_dst, op[0]));
1542       break;
1543
1544    case ir_binop_min:
1545       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1546       break;
1547    case ir_binop_max:
1548       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1549       break;
1550
1551    case ir_binop_pow:
1552       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1553       break;
1554
1555    case ir_unop_bit_not:
1556       inst = emit(NOT(result_dst, op[0]));
1557       break;
1558    case ir_binop_bit_and:
1559       inst = emit(AND(result_dst, op[0], op[1]));
1560       break;
1561    case ir_binop_bit_xor:
1562       inst = emit(XOR(result_dst, op[0], op[1]));
1563       break;
1564    case ir_binop_bit_or:
1565       inst = emit(OR(result_dst, op[0], op[1]));
1566       break;
1567
1568    case ir_binop_lshift:
1569       inst = emit(SHL(result_dst, op[0], op[1]));
1570       break;
1571
1572    case ir_binop_rshift:
1573       if (ir->type->base_type == GLSL_TYPE_INT)
1574          inst = emit(ASR(result_dst, op[0], op[1]));
1575       else
1576          inst = emit(SHR(result_dst, op[0], op[1]));
1577       break;
1578
1579    case ir_binop_bfm:
1580       emit(BFI1(result_dst, op[0], op[1]));
1581       break;
1582
1583    case ir_binop_ubo_load: {
1584       ir_constant *uniform_block = ir->operands[0]->as_constant();
1585       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1586       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1587       src_reg offset;
1588
1589       /* Now, load the vector from that offset. */
1590       assert(ir->type->is_vector() || ir->type->is_scalar());
1591
1592       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1593       packed_consts.type = result.type;
1594       src_reg surf_index =
1595          src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1596       if (const_offset_ir) {
1597          if (brw->gen >= 8) {
1598             /* Store the offset in a GRF so we can send-from-GRF. */
1599             offset = src_reg(this, glsl_type::int_type);
1600             emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1601          } else {
1602             /* Immediates are fine on older generations since they'll be moved
1603              * to a (potentially fake) MRF at the generator level.
1604              */
1605             offset = src_reg(const_offset / 16);
1606          }
1607       } else {
1608          offset = src_reg(this, glsl_type::uint_type);
1609          emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1610       }
1611
1612       if (brw->gen >= 7) {
1613          dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1614          grf_offset.type = offset.type;
1615
1616          emit(MOV(grf_offset, offset));
1617
1618          emit(new(mem_ctx) vec4_instruction(this,
1619                                             VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1620                                             dst_reg(packed_consts),
1621                                             surf_index,
1622                                             src_reg(grf_offset)));
1623       } else {
1624          vec4_instruction *pull =
1625             emit(new(mem_ctx) vec4_instruction(this,
1626                                                VS_OPCODE_PULL_CONSTANT_LOAD,
1627                                                dst_reg(packed_consts),
1628                                                surf_index,
1629                                                offset));
1630          pull->base_mrf = 14;
1631          pull->mlen = 1;
1632       }
1633
1634       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1635       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1636                                             const_offset % 16 / 4,
1637                                             const_offset % 16 / 4,
1638                                             const_offset % 16 / 4);
1639
1640       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1641       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1642          emit(CMP(result_dst, packed_consts, src_reg(0u),
1643                   BRW_CONDITIONAL_NZ));
1644          emit(AND(result_dst, result, src_reg(0x1)));
1645       } else {
1646          emit(MOV(result_dst, packed_consts));
1647       }
1648       break;
1649    }
1650
1651    case ir_binop_vector_extract:
1652       assert(!"should have been lowered by vec_index_to_cond_assign");
1653       break;
1654
1655    case ir_triop_fma:
1656       op[0] = fix_3src_operand(op[0]);
1657       op[1] = fix_3src_operand(op[1]);
1658       op[2] = fix_3src_operand(op[2]);
1659       /* Note that the instruction's argument order is reversed from GLSL
1660        * and the IR.
1661        */
1662       emit(MAD(result_dst, op[2], op[1], op[0]));
1663       break;
1664
1665    case ir_triop_lrp:
1666       emit_lrp(result_dst, op[0], op[1], op[2]);
1667       break;
1668
1669    case ir_triop_csel:
1670       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1671       inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1672       inst->predicate = BRW_PREDICATE_NORMAL;
1673       break;
1674
1675    case ir_triop_bfi:
1676       op[0] = fix_3src_operand(op[0]);
1677       op[1] = fix_3src_operand(op[1]);
1678       op[2] = fix_3src_operand(op[2]);
1679       emit(BFI2(result_dst, op[0], op[1], op[2]));
1680       break;
1681
1682    case ir_triop_bitfield_extract:
1683       op[0] = fix_3src_operand(op[0]);
1684       op[1] = fix_3src_operand(op[1]);
1685       op[2] = fix_3src_operand(op[2]);
1686       /* Note that the instruction's argument order is reversed from GLSL
1687        * and the IR.
1688        */
1689       emit(BFE(result_dst, op[2], op[1], op[0]));
1690       break;
1691
1692    case ir_triop_vector_insert:
1693       assert(!"should have been lowered by lower_vector_insert");
1694       break;
1695
1696    case ir_quadop_bitfield_insert:
1697       assert(!"not reached: should be handled by "
1698               "bitfield_insert_to_bfm_bfi\n");
1699       break;
1700
1701    case ir_quadop_vector:
1702       assert(!"not reached: should be handled by lower_quadop_vector");
1703       break;
1704
1705    case ir_unop_pack_half_2x16:
1706       emit_pack_half_2x16(result_dst, op[0]);
1707       break;
1708    case ir_unop_unpack_half_2x16:
1709       emit_unpack_half_2x16(result_dst, op[0]);
1710       break;
1711    case ir_unop_pack_snorm_2x16:
1712    case ir_unop_pack_snorm_4x8:
1713    case ir_unop_pack_unorm_2x16:
1714    case ir_unop_pack_unorm_4x8:
1715    case ir_unop_unpack_snorm_2x16:
1716    case ir_unop_unpack_snorm_4x8:
1717    case ir_unop_unpack_unorm_2x16:
1718    case ir_unop_unpack_unorm_4x8:
1719       assert(!"not reached: should be handled by lower_packing_builtins");
1720       break;
1721    case ir_unop_unpack_half_2x16_split_x:
1722    case ir_unop_unpack_half_2x16_split_y:
1723    case ir_binop_pack_half_2x16_split:
1724       assert(!"not reached: should not occur in vertex shader");
1725       break;
1726    case ir_binop_ldexp:
1727       assert(!"not reached: should be handled by ldexp_to_arith()");
1728       break;
1729    }
1730 }
1731
1732
1733 void
1734 vec4_visitor::visit(ir_swizzle *ir)
1735 {
1736    src_reg src;
1737    int i = 0;
1738    int swizzle[4];
1739
1740    /* Note that this is only swizzles in expressions, not those on the left
1741     * hand side of an assignment, which do write masking.  See ir_assignment
1742     * for that.
1743     */
1744
1745    ir->val->accept(this);
1746    src = this->result;
1747    assert(src.file != BAD_FILE);
1748
1749    for (i = 0; i < ir->type->vector_elements; i++) {
1750       switch (i) {
1751       case 0:
1752          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1753          break;
1754       case 1:
1755          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1756          break;
1757       case 2:
1758          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1759          break;
1760       case 3:
1761          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1762             break;
1763       }
1764    }
1765    for (; i < 4; i++) {
1766       /* Replicate the last channel out. */
1767       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1768    }
1769
1770    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1771
1772    this->result = src;
1773 }
1774
1775 void
1776 vec4_visitor::visit(ir_dereference_variable *ir)
1777 {
1778    const struct glsl_type *type = ir->type;
1779    dst_reg *reg = variable_storage(ir->var);
1780
1781    if (!reg) {
1782       fail("Failed to find variable storage for %s\n", ir->var->name);
1783       this->result = src_reg(brw_null_reg());
1784       return;
1785    }
1786
1787    this->result = src_reg(*reg);
1788
1789    /* System values get their swizzle from the dst_reg writemask */
1790    if (ir->var->data.mode == ir_var_system_value)
1791       return;
1792
1793    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1794       this->result.swizzle = swizzle_for_size(type->vector_elements);
1795 }
1796
1797
1798 int
1799 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1800 {
1801    /* Under normal circumstances array elements are stored consecutively, so
1802     * the stride is equal to the size of the array element.
1803     */
1804    return type_size(ir->type);
1805 }
1806
1807
1808 void
1809 vec4_visitor::visit(ir_dereference_array *ir)
1810 {
1811    ir_constant *constant_index;
1812    src_reg src;
1813    int array_stride = compute_array_stride(ir);
1814
1815    constant_index = ir->array_index->constant_expression_value();
1816
1817    ir->array->accept(this);
1818    src = this->result;
1819
1820    if (constant_index) {
1821       src.reg_offset += constant_index->value.i[0] * array_stride;
1822    } else {
1823       /* Variable index array dereference.  It eats the "vec4" of the
1824        * base of the array and an index that offsets the Mesa register
1825        * index.
1826        */
1827       ir->array_index->accept(this);
1828
1829       src_reg index_reg;
1830
1831       if (array_stride == 1) {
1832          index_reg = this->result;
1833       } else {
1834          index_reg = src_reg(this, glsl_type::int_type);
1835
1836          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1837       }
1838
1839       if (src.reladdr) {
1840          src_reg temp = src_reg(this, glsl_type::int_type);
1841
1842          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1843
1844          index_reg = temp;
1845       }
1846
1847       src.reladdr = ralloc(mem_ctx, src_reg);
1848       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1849    }
1850
1851    /* If the type is smaller than a vec4, replicate the last channel out. */
1852    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1853       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1854    else
1855       src.swizzle = BRW_SWIZZLE_NOOP;
1856    src.type = brw_type_for_base_type(ir->type);
1857
1858    this->result = src;
1859 }
1860
1861 void
1862 vec4_visitor::visit(ir_dereference_record *ir)
1863 {
1864    unsigned int i;
1865    const glsl_type *struct_type = ir->record->type;
1866    int offset = 0;
1867
1868    ir->record->accept(this);
1869
1870    for (i = 0; i < struct_type->length; i++) {
1871       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1872          break;
1873       offset += type_size(struct_type->fields.structure[i].type);
1874    }
1875
1876    /* If the type is smaller than a vec4, replicate the last channel out. */
1877    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1878       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1879    else
1880       this->result.swizzle = BRW_SWIZZLE_NOOP;
1881    this->result.type = brw_type_for_base_type(ir->type);
1882
1883    this->result.reg_offset += offset;
1884 }
1885
1886 /**
1887  * We want to be careful in assignment setup to hit the actual storage
1888  * instead of potentially using a temporary like we might with the
1889  * ir_dereference handler.
1890  */
1891 static dst_reg
1892 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1893 {
1894    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1895     * access of a vector, it must be separated into a series conditional moves
1896     * before reaching this point (see ir_vec_index_to_cond_assign).
1897     */
1898    assert(ir->as_dereference());
1899    ir_dereference_array *deref_array = ir->as_dereference_array();
1900    if (deref_array) {
1901       assert(!deref_array->array->type->is_vector());
1902    }
1903
1904    /* Use the rvalue deref handler for the most part.  We'll ignore
1905     * swizzles in it and write swizzles using writemask, though.
1906     */
1907    ir->accept(v);
1908    return dst_reg(v->result);
1909 }
1910
1911 void
1912 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1913                               const struct glsl_type *type, uint32_t predicate)
1914 {
1915    if (type->base_type == GLSL_TYPE_STRUCT) {
1916       for (unsigned int i = 0; i < type->length; i++) {
1917          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1918       }
1919       return;
1920    }
1921
1922    if (type->is_array()) {
1923       for (unsigned int i = 0; i < type->length; i++) {
1924          emit_block_move(dst, src, type->fields.array, predicate);
1925       }
1926       return;
1927    }
1928
1929    if (type->is_matrix()) {
1930       const struct glsl_type *vec_type;
1931
1932       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1933                                          type->vector_elements, 1);
1934
1935       for (int i = 0; i < type->matrix_columns; i++) {
1936          emit_block_move(dst, src, vec_type, predicate);
1937       }
1938       return;
1939    }
1940
1941    assert(type->is_scalar() || type->is_vector());
1942
1943    dst->type = brw_type_for_base_type(type);
1944    src->type = dst->type;
1945
1946    dst->writemask = (1 << type->vector_elements) - 1;
1947
1948    src->swizzle = swizzle_for_size(type->vector_elements);
1949
1950    vec4_instruction *inst = emit(MOV(*dst, *src));
1951    inst->predicate = predicate;
1952
1953    dst->reg_offset++;
1954    src->reg_offset++;
1955 }
1956
1957
1958 /* If the RHS processing resulted in an instruction generating a
1959  * temporary value, and it would be easy to rewrite the instruction to
1960  * generate its result right into the LHS instead, do so.  This ends
1961  * up reliably removing instructions where it can be tricky to do so
1962  * later without real UD chain information.
1963  */
1964 bool
1965 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1966                                      dst_reg dst,
1967                                      src_reg src,
1968                                      vec4_instruction *pre_rhs_inst,
1969                                      vec4_instruction *last_rhs_inst)
1970 {
1971    /* This could be supported, but it would take more smarts. */
1972    if (ir->condition)
1973       return false;
1974
1975    if (pre_rhs_inst == last_rhs_inst)
1976       return false; /* No instructions generated to work with. */
1977
1978    /* Make sure the last instruction generated our source reg. */
1979    if (src.file != GRF ||
1980        src.file != last_rhs_inst->dst.file ||
1981        src.reg != last_rhs_inst->dst.reg ||
1982        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1983        src.reladdr ||
1984        src.abs ||
1985        src.negate ||
1986        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1987       return false;
1988
1989    /* Check that that last instruction fully initialized the channels
1990     * we want to use, in the order we want to use them.  We could
1991     * potentially reswizzle the operands of many instructions so that
1992     * we could handle out of order channels, but don't yet.
1993     */
1994
1995    for (unsigned i = 0; i < 4; i++) {
1996       if (dst.writemask & (1 << i)) {
1997          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1998             return false;
1999
2000          if (BRW_GET_SWZ(src.swizzle, i) != i)
2001             return false;
2002       }
2003    }
2004
2005    /* Success!  Rewrite the instruction. */
2006    last_rhs_inst->dst.file = dst.file;
2007    last_rhs_inst->dst.reg = dst.reg;
2008    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2009    last_rhs_inst->dst.reladdr = dst.reladdr;
2010    last_rhs_inst->dst.writemask &= dst.writemask;
2011
2012    return true;
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_assignment *ir)
2017 {
2018    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2019    uint32_t predicate = BRW_PREDICATE_NONE;
2020
2021    if (!ir->lhs->type->is_scalar() &&
2022        !ir->lhs->type->is_vector()) {
2023       ir->rhs->accept(this);
2024       src_reg src = this->result;
2025
2026       if (ir->condition) {
2027          emit_bool_to_cond_code(ir->condition, &predicate);
2028       }
2029
2030       /* emit_block_move doesn't account for swizzles in the source register.
2031        * This should be ok, since the source register is a structure or an
2032        * array, and those can't be swizzled.  But double-check to be sure.
2033        */
2034       assert(src.swizzle ==
2035              (ir->rhs->type->is_matrix()
2036               ? swizzle_for_size(ir->rhs->type->vector_elements)
2037               : BRW_SWIZZLE_NOOP));
2038
2039       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2040       return;
2041    }
2042
2043    /* Now we're down to just a scalar/vector with writemasks. */
2044    int i;
2045
2046    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2047    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2048
2049    ir->rhs->accept(this);
2050
2051    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2052
2053    src_reg src = this->result;
2054
2055    int swizzles[4];
2056    int first_enabled_chan = 0;
2057    int src_chan = 0;
2058
2059    assert(ir->lhs->type->is_vector() ||
2060           ir->lhs->type->is_scalar());
2061    dst.writemask = ir->write_mask;
2062
2063    for (int i = 0; i < 4; i++) {
2064       if (dst.writemask & (1 << i)) {
2065          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2066          break;
2067       }
2068    }
2069
2070    /* Swizzle a small RHS vector into the channels being written.
2071     *
2072     * glsl ir treats write_mask as dictating how many channels are
2073     * present on the RHS while in our instructions we need to make
2074     * those channels appear in the slots of the vec4 they're written to.
2075     */
2076    for (int i = 0; i < 4; i++) {
2077       if (dst.writemask & (1 << i))
2078          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2079       else
2080          swizzles[i] = first_enabled_chan;
2081    }
2082    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2083                               swizzles[2], swizzles[3]);
2084
2085    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2086       return;
2087    }
2088
2089    if (ir->condition) {
2090       emit_bool_to_cond_code(ir->condition, &predicate);
2091    }
2092
2093    for (i = 0; i < type_size(ir->lhs->type); i++) {
2094       vec4_instruction *inst = emit(MOV(dst, src));
2095       inst->predicate = predicate;
2096
2097       dst.reg_offset++;
2098       src.reg_offset++;
2099    }
2100 }
2101
2102 void
2103 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2104 {
2105    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2106       foreach_list(node, &ir->components) {
2107          ir_constant *field_value = (ir_constant *)node;
2108
2109          emit_constant_values(dst, field_value);
2110       }
2111       return;
2112    }
2113
2114    if (ir->type->is_array()) {
2115       for (unsigned int i = 0; i < ir->type->length; i++) {
2116          emit_constant_values(dst, ir->array_elements[i]);
2117       }
2118       return;
2119    }
2120
2121    if (ir->type->is_matrix()) {
2122       for (int i = 0; i < ir->type->matrix_columns; i++) {
2123          float *vec = &ir->value.f[i * ir->type->vector_elements];
2124
2125          for (int j = 0; j < ir->type->vector_elements; j++) {
2126             dst->writemask = 1 << j;
2127             dst->type = BRW_REGISTER_TYPE_F;
2128
2129             emit(MOV(*dst, src_reg(vec[j])));
2130          }
2131          dst->reg_offset++;
2132       }
2133       return;
2134    }
2135
2136    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2137
2138    for (int i = 0; i < ir->type->vector_elements; i++) {
2139       if (!(remaining_writemask & (1 << i)))
2140          continue;
2141
2142       dst->writemask = 1 << i;
2143       dst->type = brw_type_for_base_type(ir->type);
2144
2145       /* Find other components that match the one we're about to
2146        * write.  Emits fewer instructions for things like vec4(0.5,
2147        * 1.5, 1.5, 1.5).
2148        */
2149       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2150          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2151             if (ir->value.b[i] == ir->value.b[j])
2152                dst->writemask |= (1 << j);
2153          } else {
2154             /* u, i, and f storage all line up, so no need for a
2155              * switch case for comparing each type.
2156              */
2157             if (ir->value.u[i] == ir->value.u[j])
2158                dst->writemask |= (1 << j);
2159          }
2160       }
2161
2162       switch (ir->type->base_type) {
2163       case GLSL_TYPE_FLOAT:
2164          emit(MOV(*dst, src_reg(ir->value.f[i])));
2165          break;
2166       case GLSL_TYPE_INT:
2167          emit(MOV(*dst, src_reg(ir->value.i[i])));
2168          break;
2169       case GLSL_TYPE_UINT:
2170          emit(MOV(*dst, src_reg(ir->value.u[i])));
2171          break;
2172       case GLSL_TYPE_BOOL:
2173          emit(MOV(*dst, src_reg(ir->value.b[i])));
2174          break;
2175       default:
2176          assert(!"Non-float/uint/int/bool constant");
2177          break;
2178       }
2179
2180       remaining_writemask &= ~dst->writemask;
2181    }
2182    dst->reg_offset++;
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_constant *ir)
2187 {
2188    dst_reg dst = dst_reg(this, ir->type);
2189    this->result = src_reg(dst);
2190
2191    emit_constant_values(&dst, ir);
2192 }
2193
2194 void
2195 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2196 {
2197    ir_dereference *deref = static_cast<ir_dereference *>(
2198       ir->actual_parameters.get_head());
2199    ir_variable *location = deref->variable_referenced();
2200    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2201                           location->data.atomic.buffer_index);
2202
2203    /* Calculate the surface offset */
2204    src_reg offset(this, glsl_type::uint_type);
2205    ir_dereference_array *deref_array = deref->as_dereference_array();
2206    if (deref_array) {
2207       deref_array->array_index->accept(this);
2208
2209       src_reg tmp(this, glsl_type::uint_type);
2210       emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2211       emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2212    } else {
2213       offset = location->data.atomic.offset;
2214    }
2215
2216    /* Emit the appropriate machine instruction */
2217    const char *callee = ir->callee->function_name();
2218    dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2219
2220    if (!strcmp("__intrinsic_atomic_read", callee)) {
2221       emit_untyped_surface_read(surf_index, dst, offset);
2222
2223    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2224       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2225                           src_reg(), src_reg());
2226
2227    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2228       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2229                           src_reg(), src_reg());
2230    }
2231 }
2232
2233 void
2234 vec4_visitor::visit(ir_call *ir)
2235 {
2236    const char *callee = ir->callee->function_name();
2237
2238    if (!strcmp("__intrinsic_atomic_read", callee) ||
2239        !strcmp("__intrinsic_atomic_increment", callee) ||
2240        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2241       visit_atomic_counter_intrinsic(ir);
2242    } else {
2243       assert(!"Unsupported intrinsic.");
2244    }
2245 }
2246
2247 src_reg
2248 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2249 {
2250    vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2251    inst->base_mrf = 2;
2252    inst->mlen = 1;
2253    inst->sampler = sampler;
2254    inst->dst = dst_reg(this, glsl_type::uvec4_type);
2255    inst->dst.writemask = WRITEMASK_XYZW;
2256
2257    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2258    int param_base = inst->base_mrf;
2259    int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2260    int zero_mask = 0xf & ~coord_mask;
2261
2262    emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2263             coordinate));
2264
2265    emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2266             src_reg(0)));
2267
2268    emit(inst);
2269    return src_reg(inst->dst);
2270 }
2271
2272 void
2273 vec4_visitor::visit(ir_texture *ir)
2274 {
2275    int sampler =
2276       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2277
2278    /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2279     * emitting anything other than setting up the constant result.
2280     */
2281    if (ir->op == ir_tg4) {
2282       ir_constant *chan = ir->lod_info.component->as_constant();
2283       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2284       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2285          dst_reg result(this, ir->type);
2286          this->result = src_reg(result);
2287          emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2288          return;
2289       }
2290    }
2291
2292    /* Should be lowered by do_lower_texture_projection */
2293    assert(!ir->projector);
2294
2295    /* Should be lowered */
2296    assert(!ir->offset || !ir->offset->type->is_array());
2297
2298    /* Generate code to compute all the subexpression trees.  This has to be
2299     * done before loading any values into MRFs for the sampler message since
2300     * generating these values may involve SEND messages that need the MRFs.
2301     */
2302    src_reg coordinate;
2303    if (ir->coordinate) {
2304       ir->coordinate->accept(this);
2305       coordinate = this->result;
2306    }
2307
2308    src_reg shadow_comparitor;
2309    if (ir->shadow_comparitor) {
2310       ir->shadow_comparitor->accept(this);
2311       shadow_comparitor = this->result;
2312    }
2313
2314    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2315    src_reg offset_value;
2316    if (has_nonconstant_offset) {
2317       ir->offset->accept(this);
2318       offset_value = src_reg(this->result);
2319    }
2320
2321    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2322    src_reg lod, dPdx, dPdy, sample_index, mcs;
2323    switch (ir->op) {
2324    case ir_tex:
2325       lod = src_reg(0.0f);
2326       lod_type = glsl_type::float_type;
2327       break;
2328    case ir_txf:
2329    case ir_txl:
2330    case ir_txs:
2331       ir->lod_info.lod->accept(this);
2332       lod = this->result;
2333       lod_type = ir->lod_info.lod->type;
2334       break;
2335    case ir_query_levels:
2336       lod = src_reg(0);
2337       lod_type = glsl_type::int_type;
2338       break;
2339    case ir_txf_ms:
2340       ir->lod_info.sample_index->accept(this);
2341       sample_index = this->result;
2342       sample_index_type = ir->lod_info.sample_index->type;
2343
2344       if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2345          mcs = emit_mcs_fetch(ir, coordinate, sampler);
2346       else
2347          mcs = src_reg(0u);
2348       break;
2349    case ir_txd:
2350       ir->lod_info.grad.dPdx->accept(this);
2351       dPdx = this->result;
2352
2353       ir->lod_info.grad.dPdy->accept(this);
2354       dPdy = this->result;
2355
2356       lod_type = ir->lod_info.grad.dPdx->type;
2357       break;
2358    case ir_txb:
2359    case ir_lod:
2360    case ir_tg4:
2361       break;
2362    }
2363
2364    vec4_instruction *inst = NULL;
2365    switch (ir->op) {
2366    case ir_tex:
2367    case ir_txl:
2368       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2369       break;
2370    case ir_txd:
2371       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2372       break;
2373    case ir_txf:
2374       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2375       break;
2376    case ir_txf_ms:
2377       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2378       break;
2379    case ir_txs:
2380       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2381       break;
2382    case ir_tg4:
2383       if (has_nonconstant_offset)
2384          inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2385       else
2386          inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2387       break;
2388    case ir_query_levels:
2389       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2390       break;
2391    case ir_txb:
2392       assert(!"TXB is not valid for vertex shaders.");
2393       break;
2394    case ir_lod:
2395       assert(!"LOD is not valid for vertex shaders.");
2396       break;
2397    default:
2398       assert(!"Unrecognized tex op");
2399    }
2400
2401    if (ir->offset != NULL && ir->op != ir_txf)
2402       inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2403
2404    /* Stuff the channel select bits in the top of the texture offset */
2405    if (ir->op == ir_tg4)
2406       inst->texture_offset |= gather_channel(ir, sampler) << 16;
2407
2408    /* The message header is necessary for:
2409     * - Gen4 (always)
2410     * - Texel offsets
2411     * - Gather channel selection
2412     * - Sampler indices too large to fit in a 4-bit value.
2413     */
2414    inst->header_present =
2415       brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2416       sampler >= 16;
2417    inst->base_mrf = 2;
2418    inst->mlen = inst->header_present + 1; /* always at least one */
2419    inst->sampler = sampler;
2420    inst->dst = dst_reg(this, ir->type);
2421    inst->dst.writemask = WRITEMASK_XYZW;
2422    inst->shadow_compare = ir->shadow_comparitor != NULL;
2423
2424    /* MRF for the first parameter */
2425    int param_base = inst->base_mrf + inst->header_present;
2426
2427    if (ir->op == ir_txs || ir->op == ir_query_levels) {
2428       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2429       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2430    } else {
2431       /* Load the coordinate */
2432       /* FINISHME: gl_clamp_mask and saturate */
2433       int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2434       int zero_mask = 0xf & ~coord_mask;
2435
2436       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2437                coordinate));
2438
2439       if (zero_mask != 0) {
2440          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2441                   src_reg(0)));
2442       }
2443       /* Load the shadow comparitor */
2444       if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2445          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2446                           WRITEMASK_X),
2447                   shadow_comparitor));
2448          inst->mlen++;
2449       }
2450
2451       /* Load the LOD info */
2452       if (ir->op == ir_tex || ir->op == ir_txl) {
2453          int mrf, writemask;
2454          if (brw->gen >= 5) {
2455             mrf = param_base + 1;
2456             if (ir->shadow_comparitor) {
2457                writemask = WRITEMASK_Y;
2458                /* mlen already incremented */
2459             } else {
2460                writemask = WRITEMASK_X;
2461                inst->mlen++;
2462             }
2463          } else /* brw->gen == 4 */ {
2464             mrf = param_base;
2465             writemask = WRITEMASK_W;
2466          }
2467          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2468       } else if (ir->op == ir_txf) {
2469          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2470       } else if (ir->op == ir_txf_ms) {
2471          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2472                   sample_index));
2473          if (brw->gen >= 7)
2474             /* MCS data is in the first channel of `mcs`, but we need to get it into
2475              * the .y channel of the second vec4 of params, so replicate .x across
2476              * the whole vec4 and then mask off everything except .y
2477              */
2478             mcs.swizzle = BRW_SWIZZLE_XXXX;
2479             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2480                      mcs));
2481          inst->mlen++;
2482       } else if (ir->op == ir_txd) {
2483          const glsl_type *type = lod_type;
2484
2485          if (brw->gen >= 5) {
2486             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2487             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2488             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2489             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2490             inst->mlen++;
2491
2492             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2493                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2494                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2495                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2496                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2497                inst->mlen++;
2498
2499                if (ir->shadow_comparitor) {
2500                   emit(MOV(dst_reg(MRF, param_base + 2,
2501                                    ir->shadow_comparitor->type, WRITEMASK_Z),
2502                            shadow_comparitor));
2503                }
2504             }
2505          } else /* brw->gen == 4 */ {
2506             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2507             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2508             inst->mlen += 2;
2509          }
2510       } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2511          if (ir->shadow_comparitor) {
2512             emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2513                      shadow_comparitor));
2514          }
2515
2516          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2517                   offset_value));
2518          inst->mlen++;
2519       }
2520    }
2521
2522    emit(inst);
2523
2524    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2525     * spec requires layers.
2526     */
2527    if (ir->op == ir_txs) {
2528       glsl_type const *type = ir->sampler->type;
2529       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2530           type->sampler_array) {
2531          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2532                    writemask(inst->dst, WRITEMASK_Z),
2533                    src_reg(inst->dst), src_reg(6));
2534       }
2535    }
2536
2537    if (brw->gen == 6 && ir->op == ir_tg4) {
2538       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2539    }
2540
2541    swizzle_result(ir, src_reg(inst->dst), sampler);
2542 }
2543
2544 /**
2545  * Apply workarounds for Gen6 gather with UINT/SINT
2546  */
2547 void
2548 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2549 {
2550    if (!wa)
2551       return;
2552
2553    int width = (wa & WA_8BIT) ? 8 : 16;
2554    dst_reg dst_f = dst;
2555    dst_f.type = BRW_REGISTER_TYPE_F;
2556
2557    /* Convert from UNORM to UINT */
2558    emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2559    emit(MOV(dst, src_reg(dst_f)));
2560
2561    if (wa & WA_SIGN) {
2562       /* Reinterpret the UINT value as a signed INT value by
2563        * shifting the sign bit into place, then shifting back
2564        * preserving sign.
2565        */
2566       emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2567       emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2568    }
2569 }
2570
2571 /**
2572  * Set up the gather channel based on the swizzle, for gather4.
2573  */
2574 uint32_t
2575 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2576 {
2577    ir_constant *chan = ir->lod_info.component->as_constant();
2578    int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2579    switch (swiz) {
2580       case SWIZZLE_X: return 0;
2581       case SWIZZLE_Y:
2582          /* gather4 sampler is broken for green channel on RG32F --
2583           * we must ask for blue instead.
2584           */
2585          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2586             return 2;
2587          return 1;
2588       case SWIZZLE_Z: return 2;
2589       case SWIZZLE_W: return 3;
2590       default:
2591          assert(!"Not reached"); /* zero, one swizzles handled already */
2592          return 0;
2593    }
2594 }
2595
2596 void
2597 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2598 {
2599    int s = key->tex.swizzles[sampler];
2600
2601    this->result = src_reg(this, ir->type);
2602    dst_reg swizzled_result(this->result);
2603
2604    if (ir->op == ir_query_levels) {
2605       /* # levels is in .w */
2606       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2607       emit(MOV(swizzled_result, orig_val));
2608       return;
2609    }
2610
2611    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2612                         || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2613       emit(MOV(swizzled_result, orig_val));
2614       return;
2615    }
2616
2617
2618    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2619    int swizzle[4] = {0};
2620
2621    for (int i = 0; i < 4; i++) {
2622       switch (GET_SWZ(s, i)) {
2623       case SWIZZLE_ZERO:
2624          zero_mask |= (1 << i);
2625          break;
2626       case SWIZZLE_ONE:
2627          one_mask |= (1 << i);
2628          break;
2629       default:
2630          copy_mask |= (1 << i);
2631          swizzle[i] = GET_SWZ(s, i);
2632          break;
2633       }
2634    }
2635
2636    if (copy_mask) {
2637       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2638       swizzled_result.writemask = copy_mask;
2639       emit(MOV(swizzled_result, orig_val));
2640    }
2641
2642    if (zero_mask) {
2643       swizzled_result.writemask = zero_mask;
2644       emit(MOV(swizzled_result, src_reg(0.0f)));
2645    }
2646
2647    if (one_mask) {
2648       swizzled_result.writemask = one_mask;
2649       emit(MOV(swizzled_result, src_reg(1.0f)));
2650    }
2651 }
2652
2653 void
2654 vec4_visitor::visit(ir_return *ir)
2655 {
2656    assert(!"not reached");
2657 }
2658
2659 void
2660 vec4_visitor::visit(ir_discard *ir)
2661 {
2662    assert(!"not reached");
2663 }
2664
2665 void
2666 vec4_visitor::visit(ir_if *ir)
2667 {
2668    /* Don't point the annotation at the if statement, because then it plus
2669     * the then and else blocks get printed.
2670     */
2671    this->base_ir = ir->condition;
2672
2673    if (brw->gen == 6) {
2674       emit_if_gen6(ir);
2675    } else {
2676       uint32_t predicate;
2677       emit_bool_to_cond_code(ir->condition, &predicate);
2678       emit(IF(predicate));
2679    }
2680
2681    visit_instructions(&ir->then_instructions);
2682
2683    if (!ir->else_instructions.is_empty()) {
2684       this->base_ir = ir->condition;
2685       emit(BRW_OPCODE_ELSE);
2686
2687       visit_instructions(&ir->else_instructions);
2688    }
2689
2690    this->base_ir = ir->condition;
2691    emit(BRW_OPCODE_ENDIF);
2692 }
2693
2694 void
2695 vec4_visitor::visit(ir_emit_vertex *)
2696 {
2697    assert(!"not reached");
2698 }
2699
2700 void
2701 vec4_visitor::visit(ir_end_primitive *)
2702 {
2703    assert(!"not reached");
2704 }
2705
2706 void
2707 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2708                                   dst_reg dst, src_reg offset,
2709                                   src_reg src0, src_reg src1)
2710 {
2711    unsigned mlen = 0;
2712
2713    /* Set the atomic operation offset. */
2714    emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2715    mlen++;
2716
2717    /* Set the atomic operation arguments. */
2718    if (src0.file != BAD_FILE) {
2719       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2720       mlen++;
2721    }
2722
2723    if (src1.file != BAD_FILE) {
2724       emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2725       mlen++;
2726    }
2727
2728    /* Emit the instruction.  Note that this maps to the normal SIMD8
2729     * untyped atomic message on Ivy Bridge, but that's OK because
2730     * unused channels will be masked out.
2731     */
2732    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2733                                  src_reg(atomic_op), src_reg(surf_index));
2734    inst->base_mrf = 0;
2735    inst->mlen = mlen;
2736 }
2737
2738 void
2739 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2740                                         src_reg offset)
2741 {
2742    /* Set the surface read offset. */
2743    emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2744
2745    /* Emit the instruction.  Note that this maps to the normal SIMD8
2746     * untyped surface read message, but that's OK because unused
2747     * channels will be masked out.
2748     */
2749    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2750                                  dst, src_reg(surf_index));
2751    inst->base_mrf = 0;
2752    inst->mlen = 1;
2753 }
2754
2755 void
2756 vec4_visitor::emit_ndc_computation()
2757 {
2758    /* Get the position */
2759    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2760
2761    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2762    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2763    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2764
2765    current_annotation = "NDC";
2766    dst_reg ndc_w = ndc;
2767    ndc_w.writemask = WRITEMASK_W;
2768    src_reg pos_w = pos;
2769    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2770    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2771
2772    dst_reg ndc_xyz = ndc;
2773    ndc_xyz.writemask = WRITEMASK_XYZ;
2774
2775    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2776 }
2777
2778 void
2779 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2780 {
2781    if (brw->gen < 6 &&
2782        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2783         key->userclip_active || brw->has_negative_rhw_bug)) {
2784       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2785       dst_reg header1_w = header1;
2786       header1_w.writemask = WRITEMASK_W;
2787
2788       emit(MOV(header1, 0u));
2789
2790       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2791          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2792
2793          current_annotation = "Point size";
2794          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2795          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2796       }
2797
2798       if (key->userclip_active) {
2799          current_annotation = "Clipping flags";
2800          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2801          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2802
2803          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2804          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2805          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2806
2807          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2808          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2809          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2810          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2811       }
2812
2813       /* i965 clipping workaround:
2814        * 1) Test for -ve rhw
2815        * 2) If set,
2816        *      set ndc = (0,0,0,0)
2817        *      set ucp[6] = 1
2818        *
2819        * Later, clipping will detect ucp[6] and ensure the primitive is
2820        * clipped against all fixed planes.
2821        */
2822       if (brw->has_negative_rhw_bug) {
2823          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2824          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2825          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2826          vec4_instruction *inst;
2827          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2828          inst->predicate = BRW_PREDICATE_NORMAL;
2829          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2830          inst->predicate = BRW_PREDICATE_NORMAL;
2831       }
2832
2833       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2834    } else if (brw->gen < 6) {
2835       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2836    } else {
2837       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2838       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2839          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2840                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2841       }
2842       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2843          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2844                   src_reg(output_reg[VARYING_SLOT_LAYER])));
2845       }
2846       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2847          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2848                   src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2849       }
2850    }
2851 }
2852
2853 void
2854 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2855 {
2856    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2857     *
2858     *     "If a linked set of shaders forming the vertex stage contains no
2859     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2860     *     application has requested clipping against user clip planes through
2861     *     the API, then the coordinate written to gl_Position is used for
2862     *     comparison against the user clip planes."
2863     *
2864     * This function is only called if the shader didn't write to
2865     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2866     * if the user wrote to it; otherwise we use gl_Position.
2867     */
2868    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2869    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2870       clip_vertex = VARYING_SLOT_POS;
2871    }
2872
2873    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2874         ++i) {
2875       reg.writemask = 1 << i;
2876       emit(DP4(reg,
2877                src_reg(output_reg[clip_vertex]),
2878                src_reg(this->userplane[i + offset])));
2879    }
2880 }
2881
2882 void
2883 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2884 {
2885    assert (varying < VARYING_SLOT_MAX);
2886    reg.type = output_reg[varying].type;
2887    current_annotation = output_reg_annotation[varying];
2888    /* Copy the register, saturating if necessary */
2889    vec4_instruction *inst = emit(MOV(reg,
2890                                      src_reg(output_reg[varying])));
2891    if ((varying == VARYING_SLOT_COL0 ||
2892         varying == VARYING_SLOT_COL1 ||
2893         varying == VARYING_SLOT_BFC0 ||
2894         varying == VARYING_SLOT_BFC1) &&
2895        key->clamp_vertex_color) {
2896       inst->saturate = true;
2897    }
2898 }
2899
2900 void
2901 vec4_visitor::emit_urb_slot(int mrf, int varying)
2902 {
2903    struct brw_reg hw_reg = brw_message_reg(mrf);
2904    dst_reg reg = dst_reg(MRF, mrf);
2905    reg.type = BRW_REGISTER_TYPE_F;
2906
2907    switch (varying) {
2908    case VARYING_SLOT_PSIZ:
2909       /* PSIZ is always in slot 0, and is coupled with other flags. */
2910       current_annotation = "indices, point width, clip flags";
2911       emit_psiz_and_flags(hw_reg);
2912       break;
2913    case BRW_VARYING_SLOT_NDC:
2914       current_annotation = "NDC";
2915       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2916       break;
2917    case VARYING_SLOT_POS:
2918       current_annotation = "gl_Position";
2919       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2920       break;
2921    case VARYING_SLOT_EDGE:
2922       /* This is present when doing unfilled polygons.  We're supposed to copy
2923        * the edge flag from the user-provided vertex array
2924        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2925        * of that attribute (starts as 1.0f).  This is then used in clipping to
2926        * determine which edges should be drawn as wireframe.
2927        */
2928       current_annotation = "edge flag";
2929       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2930                                     glsl_type::float_type, WRITEMASK_XYZW))));
2931       break;
2932    case BRW_VARYING_SLOT_PAD:
2933       /* No need to write to this slot */
2934       break;
2935    default:
2936       emit_generic_urb_slot(reg, varying);
2937       break;
2938    }
2939 }
2940
2941 static int
2942 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2943 {
2944    if (brw->gen >= 6) {
2945       /* URB data written (does not include the message header reg) must
2946        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2947        * section 5.4.3.2.2: URB_INTERLEAVED.
2948        *
2949        * URB entries are allocated on a multiple of 1024 bits, so an
2950        * extra 128 bits written here to make the end align to 256 is
2951        * no problem.
2952        */
2953       if ((mlen % 2) != 1)
2954          mlen++;
2955    }
2956
2957    return mlen;
2958 }
2959
2960
2961 /**
2962  * Generates the VUE payload plus the necessary URB write instructions to
2963  * output it.
2964  *
2965  * The VUE layout is documented in Volume 2a.
2966  */
2967 void
2968 vec4_visitor::emit_vertex()
2969 {
2970    /* MRF 0 is reserved for the debugger, so start with message header
2971     * in MRF 1.
2972     */
2973    int base_mrf = 1;
2974    int mrf = base_mrf;
2975    /* In the process of generating our URB write message contents, we
2976     * may need to unspill a register or load from an array.  Those
2977     * reads would use MRFs 14-15.
2978     */
2979    int max_usable_mrf = 13;
2980
2981    /* The following assertion verifies that max_usable_mrf causes an
2982     * even-numbered amount of URB write data, which will meet gen6's
2983     * requirements for length alignment.
2984     */
2985    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2986
2987    /* First mrf is the g0-based message header containing URB handles and
2988     * such.
2989     */
2990    emit_urb_write_header(mrf++);
2991
2992    if (brw->gen < 6) {
2993       emit_ndc_computation();
2994    }
2995
2996    /* Lower legacy ff and ClipVertex clipping to clip distances */
2997    if (key->userclip_active && !prog->UsesClipDistanceOut) {
2998       current_annotation = "user clip distances";
2999
3000       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3001       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3002
3003       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3004       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3005    }
3006
3007    /* We may need to split this up into several URB writes, so do them in a
3008     * loop.
3009     */
3010    int slot = 0;
3011    bool complete = false;
3012    do {
3013       /* URB offset is in URB row increments, and each of our MRFs is half of
3014        * one of those, since we're doing interleaved writes.
3015        */
3016       int offset = slot / 2;
3017
3018       mrf = base_mrf + 1;
3019       for (; slot < prog_data->vue_map.num_slots; ++slot) {
3020          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3021
3022          /* If this was max_usable_mrf, we can't fit anything more into this
3023           * URB WRITE.
3024           */
3025          if (mrf > max_usable_mrf) {
3026             slot++;
3027             break;
3028          }
3029       }
3030
3031       complete = slot >= prog_data->vue_map.num_slots;
3032       current_annotation = "URB write";
3033       vec4_instruction *inst = emit_urb_write_opcode(complete);
3034       inst->base_mrf = base_mrf;
3035       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3036       inst->offset += offset;
3037    } while(!complete);
3038 }
3039
3040
3041 src_reg
3042 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3043                                  src_reg *reladdr, int reg_offset)
3044 {
3045    /* Because we store the values to scratch interleaved like our
3046     * vertex data, we need to scale the vec4 index by 2.
3047     */
3048    int message_header_scale = 2;
3049
3050    /* Pre-gen6, the message header uses byte offsets instead of vec4
3051     * (16-byte) offset units.
3052     */
3053    if (brw->gen < 6)
3054       message_header_scale *= 16;
3055
3056    if (reladdr) {
3057       src_reg index = src_reg(this, glsl_type::int_type);
3058
3059       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3060       emit_before(inst, MUL(dst_reg(index),
3061                             index, src_reg(message_header_scale)));
3062
3063       return index;
3064    } else {
3065       return src_reg(reg_offset * message_header_scale);
3066    }
3067 }
3068
3069 src_reg
3070 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3071                                        src_reg *reladdr, int reg_offset)
3072 {
3073    if (reladdr) {
3074       src_reg index = src_reg(this, glsl_type::int_type);
3075
3076       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3077
3078       /* Pre-gen6, the message header uses byte offsets instead of vec4
3079        * (16-byte) offset units.
3080        */
3081       if (brw->gen < 6) {
3082          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3083       }
3084
3085       return index;
3086    } else if (brw->gen >= 8) {
3087       /* Store the offset in a GRF so we can send-from-GRF. */
3088       src_reg offset = src_reg(this, glsl_type::int_type);
3089       emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3090       return offset;
3091    } else {
3092       int message_header_scale = brw->gen < 6 ? 16 : 1;
3093       return src_reg(reg_offset * message_header_scale);
3094    }
3095 }
3096
3097 /**
3098  * Emits an instruction before @inst to load the value named by @orig_src
3099  * from scratch space at @base_offset to @temp.
3100  *
3101  * @base_offset is measured in 32-byte units (the size of a register).
3102  */
3103 void
3104 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3105                                 dst_reg temp, src_reg orig_src,
3106                                 int base_offset)
3107 {
3108    int reg_offset = base_offset + orig_src.reg_offset;
3109    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3110
3111    emit_before(inst, SCRATCH_READ(temp, index));
3112 }
3113
3114 /**
3115  * Emits an instruction after @inst to store the value to be written
3116  * to @orig_dst to scratch space at @base_offset, from @temp.
3117  *
3118  * @base_offset is measured in 32-byte units (the size of a register).
3119  */
3120 void
3121 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3122 {
3123    int reg_offset = base_offset + inst->dst.reg_offset;
3124    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3125
3126    /* Create a temporary register to store *inst's result in.
3127     *
3128     * We have to be careful in MOVing from our temporary result register in
3129     * the scratch write.  If we swizzle from channels of the temporary that
3130     * weren't initialized, it will confuse live interval analysis, which will
3131     * make spilling fail to make progress.
3132     */
3133    src_reg temp = src_reg(this, glsl_type::vec4_type);
3134    temp.type = inst->dst.type;
3135    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3136    int swizzles[4];
3137    for (int i = 0; i < 4; i++)
3138       if (inst->dst.writemask & (1 << i))
3139          swizzles[i] = i;
3140       else
3141          swizzles[i] = first_writemask_chan;
3142    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3143                                swizzles[2], swizzles[3]);
3144
3145    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3146                                        inst->dst.writemask));
3147    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3148    write->predicate = inst->predicate;
3149    write->ir = inst->ir;
3150    write->annotation = inst->annotation;
3151    inst->insert_after(write);
3152
3153    inst->dst.file = temp.file;
3154    inst->dst.reg = temp.reg;
3155    inst->dst.reg_offset = temp.reg_offset;
3156    inst->dst.reladdr = NULL;
3157 }
3158
3159 /**
3160  * We can't generally support array access in GRF space, because a
3161  * single instruction's destination can only span 2 contiguous
3162  * registers.  So, we send all GRF arrays that get variable index
3163  * access to scratch space.
3164  */
3165 void
3166 vec4_visitor::move_grf_array_access_to_scratch()
3167 {
3168    int scratch_loc[this->virtual_grf_count];
3169
3170    for (int i = 0; i < this->virtual_grf_count; i++) {
3171       scratch_loc[i] = -1;
3172    }
3173
3174    /* First, calculate the set of virtual GRFs that need to be punted
3175     * to scratch due to having any array access on them, and where in
3176     * scratch.
3177     */
3178    foreach_list(node, &this->instructions) {
3179       vec4_instruction *inst = (vec4_instruction *)node;
3180
3181       if (inst->dst.file == GRF && inst->dst.reladdr &&
3182           scratch_loc[inst->dst.reg] == -1) {
3183          scratch_loc[inst->dst.reg] = c->last_scratch;
3184          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3185       }
3186
3187       for (int i = 0 ; i < 3; i++) {
3188          src_reg *src = &inst->src[i];
3189
3190          if (src->file == GRF && src->reladdr &&
3191              scratch_loc[src->reg] == -1) {
3192             scratch_loc[src->reg] = c->last_scratch;
3193             c->last_scratch += this->virtual_grf_sizes[src->reg];
3194          }
3195       }
3196    }
3197
3198    /* Now, for anything that will be accessed through scratch, rewrite
3199     * it to load/store.  Note that this is a _safe list walk, because
3200     * we may generate a new scratch_write instruction after the one
3201     * we're processing.
3202     */
3203    foreach_list_safe(node, &this->instructions) {
3204       vec4_instruction *inst = (vec4_instruction *)node;
3205
3206       /* Set up the annotation tracking for new generated instructions. */
3207       base_ir = inst->ir;
3208       current_annotation = inst->annotation;
3209
3210       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3211          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3212       }
3213
3214       for (int i = 0 ; i < 3; i++) {
3215          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3216             continue;
3217
3218          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3219
3220          emit_scratch_read(inst, temp, inst->src[i],
3221                            scratch_loc[inst->src[i].reg]);
3222
3223          inst->src[i].file = temp.file;
3224          inst->src[i].reg = temp.reg;
3225          inst->src[i].reg_offset = temp.reg_offset;
3226          inst->src[i].reladdr = NULL;
3227       }
3228    }
3229 }
3230
3231 /**
3232  * Emits an instruction before @inst to load the value named by @orig_src
3233  * from the pull constant buffer (surface) at @base_offset to @temp.
3234  */
3235 void
3236 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3237                                       dst_reg temp, src_reg orig_src,
3238                                       int base_offset)
3239 {
3240    int reg_offset = base_offset + orig_src.reg_offset;
3241    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3242    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3243    vec4_instruction *load;
3244
3245    if (brw->gen >= 7) {
3246       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3247       grf_offset.type = offset.type;
3248       emit_before(inst, MOV(grf_offset, offset));
3249
3250       load = new(mem_ctx) vec4_instruction(this,
3251                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3252                                            temp, index, src_reg(grf_offset));
3253    } else {
3254       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3255                                            temp, index, offset);
3256       load->base_mrf = 14;
3257       load->mlen = 1;
3258    }
3259    emit_before(inst, load);
3260 }
3261
3262 /**
3263  * Implements array access of uniforms by inserting a
3264  * PULL_CONSTANT_LOAD instruction.
3265  *
3266  * Unlike temporary GRF array access (where we don't support it due to
3267  * the difficulty of doing relative addressing on instruction
3268  * destinations), we could potentially do array access of uniforms
3269  * that were loaded in GRF space as push constants.  In real-world
3270  * usage we've seen, though, the arrays being used are always larger
3271  * than we could load as push constants, so just always move all
3272  * uniform array access out to a pull constant buffer.
3273  */
3274 void
3275 vec4_visitor::move_uniform_array_access_to_pull_constants()
3276 {
3277    int pull_constant_loc[this->uniforms];
3278
3279    for (int i = 0; i < this->uniforms; i++) {
3280       pull_constant_loc[i] = -1;
3281    }
3282
3283    /* Walk through and find array access of uniforms.  Put a copy of that
3284     * uniform in the pull constant buffer.
3285     *
3286     * Note that we don't move constant-indexed accesses to arrays.  No
3287     * testing has been done of the performance impact of this choice.
3288     */
3289    foreach_list_safe(node, &this->instructions) {
3290       vec4_instruction *inst = (vec4_instruction *)node;
3291
3292       for (int i = 0 ; i < 3; i++) {
3293          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3294             continue;
3295
3296          int uniform = inst->src[i].reg;
3297
3298          /* If this array isn't already present in the pull constant buffer,
3299           * add it.
3300           */
3301          if (pull_constant_loc[uniform] == -1) {
3302             const float **values = &stage_prog_data->param[uniform * 4];
3303
3304             pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3305
3306             assert(uniform < uniform_array_size);
3307             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3308                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3309                   = values[j];
3310             }
3311          }
3312
3313          /* Set up the annotation tracking for new generated instructions. */
3314          base_ir = inst->ir;
3315          current_annotation = inst->annotation;
3316
3317          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3318
3319          emit_pull_constant_load(inst, temp, inst->src[i],
3320                                  pull_constant_loc[uniform]);
3321
3322          inst->src[i].file = temp.file;
3323          inst->src[i].reg = temp.reg;
3324          inst->src[i].reg_offset = temp.reg_offset;
3325          inst->src[i].reladdr = NULL;
3326       }
3327    }
3328
3329    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3330     * no need to track them as larger-than-vec4 objects.  This will be
3331     * relied on in cutting out unused uniform vectors from push
3332     * constants.
3333     */
3334    split_uniform_registers();
3335 }
3336
3337 void
3338 vec4_visitor::resolve_ud_negate(src_reg *reg)
3339 {
3340    if (reg->type != BRW_REGISTER_TYPE_UD ||
3341        !reg->negate)
3342       return;
3343
3344    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3345    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3346    *reg = temp;
3347 }
3348
3349 vec4_visitor::vec4_visitor(struct brw_context *brw,
3350                            struct brw_vec4_compile *c,
3351                            struct gl_program *prog,
3352                            const struct brw_vec4_prog_key *key,
3353                            struct brw_vec4_prog_data *prog_data,
3354                            struct gl_shader_program *shader_prog,
3355                            gl_shader_stage stage,
3356                            void *mem_ctx,
3357                            bool debug_flag,
3358                            bool no_spills,
3359                            shader_time_shader_type st_base,
3360                            shader_time_shader_type st_written,
3361                            shader_time_shader_type st_reset)
3362    : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3363      c(c),
3364      key(key),
3365      prog_data(prog_data),
3366      sanity_param_count(0),
3367      fail_msg(NULL),
3368      first_non_payload_grf(0),
3369      need_all_constants_in_pull_buffer(false),
3370      debug_flag(debug_flag),
3371      no_spills(no_spills),
3372      st_base(st_base),
3373      st_written(st_written),
3374      st_reset(st_reset)
3375 {
3376    this->mem_ctx = mem_ctx;
3377    this->failed = false;
3378
3379    this->base_ir = NULL;
3380    this->current_annotation = NULL;
3381    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3382
3383    this->variable_ht = hash_table_ctor(0,
3384                                        hash_table_pointer_hash,
3385                                        hash_table_pointer_compare);
3386
3387    this->virtual_grf_start = NULL;
3388    this->virtual_grf_end = NULL;
3389    this->virtual_grf_sizes = NULL;
3390    this->virtual_grf_count = 0;
3391    this->virtual_grf_reg_map = NULL;
3392    this->virtual_grf_reg_count = 0;
3393    this->virtual_grf_array_size = 0;
3394    this->live_intervals_valid = false;
3395
3396    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3397
3398    this->uniforms = 0;
3399
3400    /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3401     * at least one. See setup_uniforms() in brw_vec4.cpp.
3402     */
3403    this->uniform_array_size = 1;
3404    if (prog_data) {
3405       this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3406    }
3407
3408    this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3409    this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3410 }
3411
3412 vec4_visitor::~vec4_visitor()
3413 {
3414    hash_table_dtor(this->variable_ht);
3415 }
3416
3417
3418 void
3419 vec4_visitor::fail(const char *format, ...)
3420 {
3421    va_list va;
3422    char *msg;
3423
3424    if (failed)
3425       return;
3426
3427    failed = true;
3428
3429    va_start(va, format);
3430    msg = ralloc_vasprintf(mem_ctx, format, va);
3431    va_end(va);
3432    msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3433
3434    this->fail_msg = msg;
3435
3436    if (debug_flag) {
3437       fprintf(stderr, "%s",  msg);
3438    }
3439 }
3440
3441 } /* namespace brw */