i965/vec4: Don't fix_math_operand() on Gen >= 8.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except if immediate, which gen7 still
312 * can't use.
313 */
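/* Gen8 and later math instructions don't need this workaround at all, so
 * emit_math() bypasses fix_math_operand() entirely when brw->gen >= 8 (see
 * the gen >= 8 paths in the emit_math() variants below).
 */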
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 8) {
369 emit(opcode, dst, src);
370 } else if (brw->gen >= 6) {
371 emit_math1_gen6(opcode, dst, src);
372 } else {
373 emit_math1_gen4(opcode, dst, src);
374 }
375 }
376
377 void
378 vec4_visitor::emit_math2_gen6(enum opcode opcode,
379 dst_reg dst, src_reg src0, src_reg src1)
380 {
381 src0 = fix_math_operand(src0);
382 src1 = fix_math_operand(src1);
383
384 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
385 /* The gen6 math instruction must be align1, so we can't do
386 * writemasks.
387 */
388 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
389 temp_dst.type = dst.type;
390
391 emit(opcode, temp_dst, src0, src1);
392
393 emit(MOV(dst, src_reg(temp_dst)));
394 } else {
395 emit(opcode, dst, src0, src1);
396 }
397 }
398
399 void
400 vec4_visitor::emit_math2_gen4(enum opcode opcode,
401 dst_reg dst, src_reg src0, src_reg src1)
402 {
403 vec4_instruction *inst = emit(opcode, dst, src0, src1);
404 inst->base_mrf = 1;
405 inst->mlen = 2;
406 }
407
408 void
409 vec4_visitor::emit_math(enum opcode opcode,
410 dst_reg dst, src_reg src0, src_reg src1)
411 {
412 switch (opcode) {
413 case SHADER_OPCODE_POW:
414 case SHADER_OPCODE_INT_QUOTIENT:
415 case SHADER_OPCODE_INT_REMAINDER:
416 break;
417 default:
418 assert(!"not reached: unsupported binary math opcode");
419 return;
420 }
421
422 if (brw->gen >= 8) {
423 emit(opcode, dst, src0, src1);
424 } else if (brw->gen >= 6) {
425 emit_math2_gen6(opcode, dst, src0, src1);
426 } else {
427 emit_math2_gen4(opcode, dst, src0, src1);
428 }
429 }
430
431 void
432 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7)
435 assert(!"ir_unop_pack_half_2x16 should be lowered");
436
437 assert(dst.type == BRW_REGISTER_TYPE_UD);
438 assert(src0.type == BRW_REGISTER_TYPE_F);
439
440 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
441 *
442 * Because this instruction does not have a 16-bit floating-point type,
443 * the destination data type must be Word (W).
444 *
445 * The destination must be DWord-aligned and specify a horizontal stride
446 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
447 * each destination channel and the upper word is not modified.
448 *
449 * The above restriction implies that the f32to16 instruction must use
450 * align1 mode, because only in align1 mode is it possible to specify
451 * horizontal stride. We choose here to defy the hardware docs and emit
452 * align16 instructions.
453 *
454 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
455 * instructions. I was partially successful in that the code passed all
456 * tests. However, the code was dubiously correct and fragile, and the
457 * tests were not harsh enough to probe that frailty. Not trusting the
458 * code, I chose instead to remain in align16 mode in defiance of the hw
459 * docs).
460 *
461 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
462 * simulator, emitting a f32to16 in align16 mode with UD as destination
463 * data type is safe. The behavior differs from that specified in the PRM
464 * in that the upper word of each destination channel is cleared to 0.
465 */
466
467 dst_reg tmp_dst(this, glsl_type::uvec2_type);
468 src_reg tmp_src(tmp_dst);
469
470 #if 0
471 /* Verify the undocumented behavior on which the following instructions
472 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
473 * then the result of the bit-or instruction below will be incorrect.
474 *
475 * You should inspect the disasm output in order to verify that the MOV is
476 * not optimized away.
477 */
478 emit(MOV(tmp_dst, src_reg(0x12345678u)));
479 #endif
480
481 /* Give tmp the form below, where "." means untouched.
482 *
483 * w z y x w z y x
484 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
485 *
486 * The upper word of each write-channel must be 0 for the following
487 * bit-shift and bit-or instructions to work. Note that this
488 * relies on the undocumented hardware behavior mentioned above.
489 */
490 tmp_dst.writemask = WRITEMASK_XY;
491 emit(F32TO16(tmp_dst, src0));
492
493 /* Give the write-channels of dst the form:
494 * 0xhhhh0000
495 */
496 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
497 emit(SHL(dst, tmp_src, src_reg(16u)));
498
499 /* Finally, give the write-channels of dst the form of packHalf2x16's
500 * output:
501 * 0xhhhhllll
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
504 emit(OR(dst, src_reg(dst), tmp_src));
505 }
506
507 void
508 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
509 {
510 if (brw->gen < 7)
511 assert(!"ir_unop_unpack_half_2x16 should be lowered");
512
513 assert(dst.type == BRW_REGISTER_TYPE_F);
514 assert(src0.type == BRW_REGISTER_TYPE_UD);
515
516 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
517 *
518 * Because this instruction does not have a 16-bit floating-point type,
519 * the source data type must be Word (W). The destination type must be
520 * F (Float).
521 *
522 * To use W as the source data type, we must adjust horizontal strides,
523 * which is only possible in align1 mode. All my [chadv] attempts at
524 * emitting align1 instructions for unpackHalf2x16 failed to pass the
525 * Piglit tests, so I gave up.
526 *
527 * I've verified that, on gen7 hardware and the simulator, it is safe to
528 * emit f16to32 in align16 mode with UD as source data type.
529 */
530
531 dst_reg tmp_dst(this, glsl_type::uvec2_type);
532 src_reg tmp_src(tmp_dst);
533
534 tmp_dst.writemask = WRITEMASK_X;
535 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
536
537 tmp_dst.writemask = WRITEMASK_Y;
538 emit(SHR(tmp_dst, src0, src_reg(16u)));
539
540 dst.writemask = WRITEMASK_XY;
541 emit(F16TO32(dst, tmp_src));
542 }
543
544 void
545 vec4_visitor::visit_instructions(const exec_list *list)
546 {
547 foreach_list(node, list) {
548 ir_instruction *ir = (ir_instruction *)node;
549
550 base_ir = ir;
551 ir->accept(this);
552 }
553 }
554
555
556 static int
557 type_size(const struct glsl_type *type)
558 {
559 unsigned int i;
560 int size;
561
562 switch (type->base_type) {
563 case GLSL_TYPE_UINT:
564 case GLSL_TYPE_INT:
565 case GLSL_TYPE_FLOAT:
566 case GLSL_TYPE_BOOL:
567 if (type->is_matrix()) {
568 return type->matrix_columns;
569 } else {
570 /* Regardless of the size of the vector, it gets a vec4. This is bad
571 * packing for things like floats, but otherwise arrays become a
572 * mess. Hopefully a later pass over the code can pack scalars
573 * down if appropriate.
574 */
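/* For example, a lone float and a vec4 each occupy one slot here, while a
 * mat4 takes the is_matrix() branch above and occupies four slots, one per
 * column.
 */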
575 return 1;
576 }
577 case GLSL_TYPE_ARRAY:
578 assert(type->length > 0);
579 return type_size(type->fields.array) * type->length;
580 case GLSL_TYPE_STRUCT:
581 size = 0;
582 for (i = 0; i < type->length; i++) {
583 size += type_size(type->fields.structure[i].type);
584 }
585 return size;
586 case GLSL_TYPE_SAMPLER:
587 /* Samplers take up one slot in UNIFORMS[], but they're baked in
588 * at link time.
589 */
590 return 1;
591 case GLSL_TYPE_ATOMIC_UINT:
592 return 0;
593 case GLSL_TYPE_IMAGE:
594 case GLSL_TYPE_VOID:
595 case GLSL_TYPE_ERROR:
596 case GLSL_TYPE_INTERFACE:
597 assert(0);
598 break;
599 }
600
601 return 0;
602 }
603
604 int
605 vec4_visitor::virtual_grf_alloc(int size)
606 {
607 if (virtual_grf_array_size <= virtual_grf_count) {
608 if (virtual_grf_array_size == 0)
609 virtual_grf_array_size = 16;
610 else
611 virtual_grf_array_size *= 2;
612 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
613 virtual_grf_array_size);
614 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
615 virtual_grf_array_size);
616 }
617 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
618 virtual_grf_reg_count += size;
619 virtual_grf_sizes[virtual_grf_count] = size;
620 return virtual_grf_count++;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
640 {
641 init();
642
643 this->file = GRF;
644 this->reg = v->virtual_grf_alloc(type_size(type));
645
646 if (type->is_array() || type->is_record()) {
647 this->writemask = WRITEMASK_XYZW;
648 } else {
649 this->writemask = (1 << type->vector_elements) - 1;
650 }
651
652 this->type = brw_type_for_base_type(type);
653 }
654
655 /* Our support for uniforms is piggy-backed on the struct
656 * gl_fragment_program, because that's where the values actually
657 * get stored, rather than in some global gl_shader_program uniform
658 * store.
659 */
660 void
661 vec4_visitor::setup_uniform_values(ir_variable *ir)
662 {
663 int namelen = strlen(ir->name);
664
665 /* The data for our (non-builtin) uniforms is stored in a series of
666 * gl_uniform_driver_storage structs for each subcomponent that
667 * glGetUniformLocation() could name. We know it's been set up in the same
668 * order we'd walk the type, so walk the list of storage and find anything
669 * with our name, or the prefix of a component that starts with our name.
670 */
671 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
672 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
673
674 if (strncmp(ir->name, storage->name, namelen) != 0 ||
675 (storage->name[namelen] != 0 &&
676 storage->name[namelen] != '.' &&
677 storage->name[namelen] != '[')) {
678 continue;
679 }
680
681 gl_constant_value *components = storage->storage;
682 unsigned vector_count = (MAX2(storage->array_elements, 1) *
683 storage->type->matrix_columns);
684
685 for (unsigned s = 0; s < vector_count; s++) {
686 assert(uniforms < uniform_array_size);
687 uniform_vector_size[uniforms] = storage->type->vector_elements;
688
689 int i;
690 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
691 stage_prog_data->param[uniforms * 4 + i] = &components->f;
692 components++;
693 }
694 for (; i < 4; i++) {
695 static float zero = 0;
696 stage_prog_data->param[uniforms * 4 + i] = &zero;
697 }
698
699 uniforms++;
700 }
701 }
702 }
703
704 void
705 vec4_visitor::setup_uniform_clipplane_values()
706 {
707 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
708
709 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
710 assert(this->uniforms < uniform_array_size);
711 this->uniform_vector_size[this->uniforms] = 4;
712 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
713 this->userplane[i].type = BRW_REGISTER_TYPE_F;
714 for (int j = 0; j < 4; ++j) {
715 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
716 }
717 ++this->uniforms;
718 }
719 }
720
721 /* Our support for builtin uniforms is even scarier than non-builtin.
722 * It sits on top of the PROG_STATE_VAR parameters that are
723 * automatically updated from GL context state.
724 */
725 void
726 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
727 {
728 const ir_state_slot *const slots = ir->state_slots;
729 assert(ir->state_slots != NULL);
730
731 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
732 /* This state reference has already been set up by ir_to_mesa,
733 * but we'll get the same index back here. We can reference
734 * ParameterValues directly, since unlike brw_fs.cpp, we never
735 * add new state references during compile.
736 */
737 int index = _mesa_add_state_reference(this->prog->Parameters,
738 (gl_state_index *)slots[i].tokens);
739 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
740
741 assert(this->uniforms < uniform_array_size);
742 this->uniform_vector_size[this->uniforms] = 0;
743 /* Add each of the unique swizzled channels of the element.
744 * This will end up matching the size of the glsl_type of this field.
745 */
746 int last_swiz = -1;
747 for (unsigned int j = 0; j < 4; j++) {
748 int swiz = GET_SWZ(slots[i].swizzle, j);
749 last_swiz = swiz;
750
751 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
752 assert(this->uniforms < uniform_array_size);
753 if (swiz <= last_swiz)
754 this->uniform_vector_size[this->uniforms]++;
755 }
756 this->uniforms++;
757 }
758 }
759
760 dst_reg *
761 vec4_visitor::variable_storage(ir_variable *var)
762 {
763 return (dst_reg *)hash_table_find(this->variable_ht, var);
764 }
765
766 void
767 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
768 {
769 ir_expression *expr = ir->as_expression();
770
771 *predicate = BRW_PREDICATE_NORMAL;
772
773 if (expr) {
774 src_reg op[2];
775 vec4_instruction *inst;
776
777 assert(expr->get_num_operands() <= 2);
778 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
779 expr->operands[i]->accept(this);
780 op[i] = this->result;
781
782 resolve_ud_negate(&op[i]);
783 }
784
785 switch (expr->operation) {
786 case ir_unop_logic_not:
787 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
788 inst->conditional_mod = BRW_CONDITIONAL_Z;
789 break;
790
791 case ir_binop_logic_xor:
792 inst = emit(XOR(dst_null_d(), op[0], op[1]));
793 inst->conditional_mod = BRW_CONDITIONAL_NZ;
794 break;
795
796 case ir_binop_logic_or:
797 inst = emit(OR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_and:
802 inst = emit(AND(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_unop_f2b:
807 if (brw->gen >= 6) {
808 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
809 } else {
810 inst = emit(MOV(dst_null_f(), op[0]));
811 inst->conditional_mod = BRW_CONDITIONAL_NZ;
812 }
813 break;
814
815 case ir_unop_i2b:
816 if (brw->gen >= 6) {
817 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
818 } else {
819 inst = emit(MOV(dst_null_d(), op[0]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 }
822 break;
823
824 case ir_binop_all_equal:
825 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
826 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
827 break;
828
829 case ir_binop_any_nequal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
831 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
832 break;
833
834 case ir_unop_any:
835 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_binop_greater:
840 case ir_binop_gequal:
841 case ir_binop_less:
842 case ir_binop_lequal:
843 case ir_binop_equal:
844 case ir_binop_nequal:
845 emit(CMP(dst_null_d(), op[0], op[1],
846 brw_conditional_for_comparison(expr->operation)));
847 break;
848
849 default:
850 assert(!"not reached");
851 break;
852 }
853 return;
854 }
855
856 ir->accept(this);
857
858 resolve_ud_negate(&this->result);
859
860 if (brw->gen >= 6) {
861 vec4_instruction *inst = emit(AND(dst_null_d(),
862 this->result, src_reg(1)));
863 inst->conditional_mod = BRW_CONDITIONAL_NZ;
864 } else {
865 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 }
869
870 /**
871 * Emit a gen6 IF statement with the comparison folded into the IF
872 * instruction.
873 */
874 void
875 vec4_visitor::emit_if_gen6(ir_if *ir)
876 {
877 ir_expression *expr = ir->condition->as_expression();
878
879 if (expr) {
880 src_reg op[2];
881 dst_reg temp;
882
883 assert(expr->get_num_operands() <= 2);
884 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
885 expr->operands[i]->accept(this);
886 op[i] = this->result;
887 }
888
889 switch (expr->operation) {
890 case ir_unop_logic_not:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
892 return;
893
894 case ir_binop_logic_xor:
895 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_binop_logic_or:
899 temp = dst_reg(this, glsl_type::bool_type);
900 emit(OR(temp, op[0], op[1]));
901 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
902 return;
903
904 case ir_binop_logic_and:
905 temp = dst_reg(this, glsl_type::bool_type);
906 emit(AND(temp, op[0], op[1]));
907 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_f2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_unop_i2b:
915 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 return;
917
918 case ir_binop_greater:
919 case ir_binop_gequal:
920 case ir_binop_less:
921 case ir_binop_lequal:
922 case ir_binop_equal:
923 case ir_binop_nequal:
924 emit(IF(op[0], op[1],
925 brw_conditional_for_comparison(expr->operation)));
926 return;
927
928 case ir_binop_all_equal:
929 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
930 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
931 return;
932
933 case ir_binop_any_nequal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
935 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
936 return;
937
938 case ir_unop_any:
939 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 default:
944 assert(!"not reached");
945 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
946 return;
947 }
948 return;
949 }
950
951 ir->condition->accept(this);
952
953 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
954 }
955
956 void
957 vec4_visitor::visit(ir_variable *ir)
958 {
959 dst_reg *reg = NULL;
960
961 if (variable_storage(ir))
962 return;
963
964 switch (ir->data.mode) {
965 case ir_var_shader_in:
966 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
967 break;
968
969 case ir_var_shader_out:
970 reg = new(mem_ctx) dst_reg(this, ir->type);
971
972 for (int i = 0; i < type_size(ir->type); i++) {
973 output_reg[ir->data.location + i] = *reg;
974 output_reg[ir->data.location + i].reg_offset = i;
975 output_reg[ir->data.location + i].type =
976 brw_type_for_base_type(ir->type->get_scalar_type());
977 output_reg_annotation[ir->data.location + i] = ir->name;
978 }
979 break;
980
981 case ir_var_auto:
982 case ir_var_temporary:
983 reg = new(mem_ctx) dst_reg(this, ir->type);
984 break;
985
986 case ir_var_uniform:
987 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
988
989 /* Thanks to the lower_ubo_reference pass, we will see only
990 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
991 * variables, so no need for them to be in variable_ht.
992 *
993 * Atomic counters take no uniform storage, no need to do
994 * anything here.
995 */
996 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
997 return;
998
999 /* Track how big the whole uniform variable is, in case we need to put a
1000 * copy of its data into pull constants for array access.
1001 */
1002 assert(this->uniforms < uniform_array_size);
1003 this->uniform_size[this->uniforms] = type_size(ir->type);
1004
1005 if (!strncmp(ir->name, "gl_", 3)) {
1006 setup_builtin_uniform_values(ir);
1007 } else {
1008 setup_uniform_values(ir);
1009 }
1010 break;
1011
1012 case ir_var_system_value:
1013 reg = make_reg_for_system_value(ir);
1014 break;
1015
1016 default:
1017 assert(!"not reached");
1018 }
1019
1020 reg->type = brw_type_for_base_type(ir->type);
1021 hash_table_insert(this->variable_ht, reg, ir);
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_loop *ir)
1026 {
1027 /* We don't want debugging output to print the whole body of the
1028 * loop as the annotation.
1029 */
1030 this->base_ir = NULL;
1031
1032 emit(BRW_OPCODE_DO);
1033
1034 visit_instructions(&ir->body_instructions);
1035
1036 emit(BRW_OPCODE_WHILE);
1037 }
1038
1039 void
1040 vec4_visitor::visit(ir_loop_jump *ir)
1041 {
1042 switch (ir->mode) {
1043 case ir_loop_jump::jump_break:
1044 emit(BRW_OPCODE_BREAK);
1045 break;
1046 case ir_loop_jump::jump_continue:
1047 emit(BRW_OPCODE_CONTINUE);
1048 break;
1049 }
1050 }
1051
1052
1053 void
1054 vec4_visitor::visit(ir_function_signature *ir)
1055 {
1056 assert(0);
1057 (void)ir;
1058 }
1059
1060 void
1061 vec4_visitor::visit(ir_function *ir)
1062 {
1063 /* Ignore function bodies other than main() -- we shouldn't see calls to
1064 * them since they should all be inlined.
1065 */
1066 if (strcmp(ir->name, "main") == 0) {
1067 const ir_function_signature *sig;
1068 exec_list empty;
1069
1070 sig = ir->matching_signature(NULL, &empty);
1071
1072 assert(sig);
1073
1074 visit_instructions(&sig->body);
1075 }
1076 }
1077
1078 bool
1079 vec4_visitor::try_emit_sat(ir_expression *ir)
1080 {
1081 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1082 if (!sat_src)
1083 return false;
1084
1085 sat_src->accept(this);
1086 src_reg src = this->result;
1087
1088 this->result = src_reg(this, ir->type);
1089 vec4_instruction *inst;
1090 inst = emit(MOV(dst_reg(this->result), src));
1091 inst->saturate = true;
1092
1093 return true;
1094 }
1095
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 ir_expression *const cmp = ir->operands[0]->as_expression();
1137
1138 if (cmp == NULL)
1139 return false;
1140
1141 switch (cmp->operation) {
1142 case ir_binop_less:
1143 case ir_binop_greater:
1144 case ir_binop_lequal:
1145 case ir_binop_gequal:
1146 case ir_binop_equal:
1147 case ir_binop_nequal:
1148 break;
1149
1150 default:
1151 return false;
1152 }
1153
1154 cmp->operands[0]->accept(this);
1155 const src_reg cmp_src0 = this->result;
1156
1157 cmp->operands[1]->accept(this);
1158 const src_reg cmp_src1 = this->result;
1159
1160 this->result = src_reg(this, ir->type);
1161
1162 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1163 brw_conditional_for_comparison(cmp->operation)));
1164
1165 /* If the comparison is false, this->result will just happen to be zero.
1166 */
1167 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1168 this->result, src_reg(1.0f));
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = true;
1171
1172 return true;
1173 }
1174
1175 void
1176 vec4_visitor::emit_bool_comparison(unsigned int op,
1177 dst_reg dst, src_reg src0, src_reg src1)
1178 {
1179 /* original gen4 does destination conversion before comparison. */
1180 if (brw->gen < 5)
1181 dst.type = src0.type;
1182
1183 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1184
1185 dst.type = BRW_REGISTER_TYPE_D;
1186 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1187 }
1188
1189 void
1190 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1191 src_reg src0, src_reg src1)
1192 {
1193 vec4_instruction *inst;
1194
1195 if (brw->gen >= 6) {
1196 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1197 inst->conditional_mod = conditionalmod;
1198 } else {
1199 emit(CMP(dst, src0, src1, conditionalmod));
1200
1201 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1202 inst->predicate = BRW_PREDICATE_NORMAL;
1203 }
1204 }
1205
1206 void
1207 vec4_visitor::emit_lrp(const dst_reg &dst,
1208 const src_reg &x, const src_reg &y, const src_reg &a)
1209 {
1210 if (brw->gen >= 6) {
1211 /* Note that the instruction's argument order is reversed from GLSL
1212 * and the IR.
1213 */
1214 emit(LRP(dst,
1215 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1216 } else {
1217 /* Earlier generations don't support three source operations, so we
1218 * need to emit x*(1-a) + y*a.
1219 */
1220 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1221 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1222 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1223 y_times_a.writemask = dst.writemask;
1224 one_minus_a.writemask = dst.writemask;
1225 x_times_one_minus_a.writemask = dst.writemask;
1226
1227 emit(MUL(y_times_a, y, a));
1228 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1229 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1230 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1231 }
1232 }
1233
1234 void
1235 vec4_visitor::visit(ir_expression *ir)
1236 {
1237 unsigned int operand;
1238 src_reg op[Elements(ir->operands)];
1239 src_reg result_src;
1240 dst_reg result_dst;
1241 vec4_instruction *inst;
1242
1243 if (try_emit_sat(ir))
1244 return;
1245
1246 if (ir->operation == ir_binop_add) {
1247 if (try_emit_mad(ir))
1248 return;
1249 }
1250
1251 if (ir->operation == ir_unop_b2f) {
1252 if (try_emit_b2f_of_compare(ir))
1253 return;
1254 }
1255
1256 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1257 this->result.file = BAD_FILE;
1258 ir->operands[operand]->accept(this);
1259 if (this->result.file == BAD_FILE) {
1260 fprintf(stderr, "Failed to get tree for expression operand:\n");
1261 ir->operands[operand]->fprint(stderr);
1262 exit(1);
1263 }
1264 op[operand] = this->result;
1265
1266 /* Matrix expression operands should have been broken down to vector
1267 * operations already.
1268 */
1269 assert(!ir->operands[operand]->type->is_matrix());
1270 }
1271
1272 int vector_elements = ir->operands[0]->type->vector_elements;
1273 if (ir->operands[1]) {
1274 vector_elements = MAX2(vector_elements,
1275 ir->operands[1]->type->vector_elements);
1276 }
1277
1278 this->result.file = BAD_FILE;
1279
1280 /* Storage for our result. Ideally for an assignment we'd be using
1281 * the actual storage for the result here, instead.
1282 */
1283 result_src = src_reg(this, ir->type);
1284 /* convenience for the emit functions below. */
1285 result_dst = dst_reg(result_src);
1286 /* If nothing special happens, this is the result. */
1287 this->result = result_src;
1288 /* Limit writes to the channels that will be used by result_src later.
1289 * This does limit this temp's use as a temporary for multi-instruction
1290 * sequences.
1291 */
1292 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1293
1294 switch (ir->operation) {
1295 case ir_unop_logic_not:
1296 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1297 * the ones' complement of the whole register, not just bit 0.
1298 */
1299 emit(XOR(result_dst, op[0], src_reg(1)));
1300 break;
1301 case ir_unop_neg:
1302 op[0].negate = !op[0].negate;
1303 emit(MOV(result_dst, op[0]));
1304 break;
1305 case ir_unop_abs:
1306 op[0].abs = true;
1307 op[0].negate = false;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310
1311 case ir_unop_sign:
1312 if (ir->type->is_float()) {
1313 /* AND(val, 0x80000000) gives the sign bit.
1314 *
1315 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1316 * zero.
1317 */
1318 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1319
1320 op[0].type = BRW_REGISTER_TYPE_UD;
1321 result_dst.type = BRW_REGISTER_TYPE_UD;
1322 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1323
1324 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1325 inst->predicate = BRW_PREDICATE_NORMAL;
1326
1327 this->result.type = BRW_REGISTER_TYPE_F;
1328 } else {
1329 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1330 * -> non-negative val generates 0x00000000.
1331 * Predicated OR sets 1 if val is positive.
1332 */
1333 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1334
1335 emit(ASR(result_dst, op[0], src_reg(31)));
1336
1337 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1338 inst->predicate = BRW_PREDICATE_NORMAL;
1339 }
1340 break;
1341
1342 case ir_unop_rcp:
1343 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1344 break;
1345
1346 case ir_unop_exp2:
1347 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1348 break;
1349 case ir_unop_log2:
1350 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1351 break;
1352 case ir_unop_exp:
1353 case ir_unop_log:
1354 assert(!"not reached: should be handled by ir_explog_to_explog2");
1355 break;
1356 case ir_unop_sin:
1357 case ir_unop_sin_reduced:
1358 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1359 break;
1360 case ir_unop_cos:
1361 case ir_unop_cos_reduced:
1362 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1363 break;
1364
1365 case ir_unop_dFdx:
1366 case ir_unop_dFdy:
1367 assert(!"derivatives not valid in vertex shader");
1368 break;
1369
1370 case ir_unop_bitfield_reverse:
1371 emit(BFREV(result_dst, op[0]));
1372 break;
1373 case ir_unop_bit_count:
1374 emit(CBIT(result_dst, op[0]));
1375 break;
1376 case ir_unop_find_msb: {
1377 src_reg temp = src_reg(this, glsl_type::uint_type);
1378
1379 inst = emit(FBH(dst_reg(temp), op[0]));
1380 inst->dst.writemask = WRITEMASK_XYZW;
1381
1382 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1383 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1384 * subtract the result from 31 to convert the MSB count into an LSB count.
1385 */
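/* For example, with an input of 0x00000100 (bit 8 set) FBH reports 23 when
 * counting down from bit 31, and 31 - 23 = 8 is the value findMSB() expects.
 */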
1386
1387 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1388 temp.swizzle = BRW_SWIZZLE_NOOP;
1389 emit(MOV(result_dst, temp));
1390
1391 src_reg src_tmp = src_reg(result_dst);
1392 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1393
1394 src_tmp.negate = true;
1395 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 break;
1398 }
1399 case ir_unop_find_lsb:
1400 emit(FBL(result_dst, op[0]));
1401 break;
1402
1403 case ir_unop_noise:
1404 assert(!"not reached: should be handled by lower_noise");
1405 break;
1406
1407 case ir_binop_add:
1408 emit(ADD(result_dst, op[0], op[1]));
1409 break;
1410 case ir_binop_sub:
1411 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1412 break;
1413
1414 case ir_binop_mul:
1415 if (brw->gen < 8 && ir->type->is_integer()) {
1416 /* For integer multiplication, the MUL uses the low 16 bits of one of
1417 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1418 * accumulates in the contribution of the upper 16 bits of that
1419 * operand. If we can determine that one of the args is in the low
1420 * 16 bits, though, we can just emit a single MUL.
1421 */
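/* Note the operand ordering below: the 16-bit constant is placed in src0
 * for gen < 7 and in src1 for gen7+, matching which source the MUL reads
 * its low 16 bits from on each generation.
 */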
1422 if (ir->operands[0]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[0], op[1]));
1425 else
1426 emit(MUL(result_dst, op[1], op[0]));
1427 } else if (ir->operands[1]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[1], op[0]));
1430 else
1431 emit(MUL(result_dst, op[0], op[1]));
1432 } else {
1433 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1434
1435 emit(MUL(acc, op[0], op[1]));
1436 emit(MACH(dst_null_d(), op[0], op[1]));
1437 emit(MOV(result_dst, src_reg(acc)));
1438 }
1439 } else {
1440 emit(MUL(result_dst, op[0], op[1]));
1441 }
1442 break;
1443 case ir_binop_imul_high: {
1444 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1445
1446 emit(MUL(acc, op[0], op[1]));
1447 emit(MACH(result_dst, op[0], op[1]));
1448 break;
1449 }
1450 case ir_binop_div:
1451 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1452 assert(ir->type->is_integer());
1453 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1454 break;
1455 case ir_binop_carry: {
1456 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1457
1458 emit(ADDC(dst_null_ud(), op[0], op[1]));
1459 emit(MOV(result_dst, src_reg(acc)));
1460 break;
1461 }
1462 case ir_binop_borrow: {
1463 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1464
1465 emit(SUBB(dst_null_ud(), op[0], op[1]));
1466 emit(MOV(result_dst, src_reg(acc)));
1467 break;
1468 }
1469 case ir_binop_mod:
1470 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1471 assert(ir->type->is_integer());
1472 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1473 break;
1474
1475 case ir_binop_less:
1476 case ir_binop_greater:
1477 case ir_binop_lequal:
1478 case ir_binop_gequal:
1479 case ir_binop_equal:
1480 case ir_binop_nequal: {
1481 emit(CMP(result_dst, op[0], op[1],
1482 brw_conditional_for_comparison(ir->operation)));
1483 emit(AND(result_dst, result_src, src_reg(0x1)));
1484 break;
1485 }
1486
1487 case ir_binop_all_equal:
1488 /* "==" operator producing a scalar boolean. */
1489 if (ir->operands[0]->type->is_vector() ||
1490 ir->operands[1]->type->is_vector()) {
1491 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1492 emit(MOV(result_dst, src_reg(0)));
1493 inst = emit(MOV(result_dst, src_reg(1)));
1494 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1495 } else {
1496 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1497 emit(AND(result_dst, result_src, src_reg(0x1)));
1498 }
1499 break;
1500 case ir_binop_any_nequal:
1501 /* "!=" operator producing a scalar boolean. */
1502 if (ir->operands[0]->type->is_vector() ||
1503 ir->operands[1]->type->is_vector()) {
1504 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1505
1506 emit(MOV(result_dst, src_reg(0)));
1507 inst = emit(MOV(result_dst, src_reg(1)));
1508 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1509 } else {
1510 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1511 emit(AND(result_dst, result_src, src_reg(0x1)));
1512 }
1513 break;
1514
1515 case ir_unop_any:
1516 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1517 emit(MOV(result_dst, src_reg(0)));
1518
1519 inst = emit(MOV(result_dst, src_reg(1)));
1520 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1521 break;
1522
1523 case ir_binop_logic_xor:
1524 emit(XOR(result_dst, op[0], op[1]));
1525 break;
1526
1527 case ir_binop_logic_or:
1528 emit(OR(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_logic_and:
1532 emit(AND(result_dst, op[0], op[1]));
1533 break;
1534
1535 case ir_binop_dot:
1536 assert(ir->operands[0]->type->is_vector());
1537 assert(ir->operands[0]->type == ir->operands[1]->type);
1538 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1539 break;
1540
1541 case ir_unop_sqrt:
1542 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1543 break;
1544 case ir_unop_rsq:
1545 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1546 break;
1547
1548 case ir_unop_bitcast_i2f:
1549 case ir_unop_bitcast_u2f:
1550 this->result = op[0];
1551 this->result.type = BRW_REGISTER_TYPE_F;
1552 break;
1553
1554 case ir_unop_bitcast_f2i:
1555 this->result = op[0];
1556 this->result.type = BRW_REGISTER_TYPE_D;
1557 break;
1558
1559 case ir_unop_bitcast_f2u:
1560 this->result = op[0];
1561 this->result.type = BRW_REGISTER_TYPE_UD;
1562 break;
1563
1564 case ir_unop_i2f:
1565 case ir_unop_i2u:
1566 case ir_unop_u2i:
1567 case ir_unop_u2f:
1568 case ir_unop_b2f:
1569 case ir_unop_b2i:
1570 case ir_unop_f2i:
1571 case ir_unop_f2u:
1572 emit(MOV(result_dst, op[0]));
1573 break;
1574 case ir_unop_f2b:
1575 case ir_unop_i2b: {
1576 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1577 emit(AND(result_dst, result_src, src_reg(1)));
1578 break;
1579 }
1580
1581 case ir_unop_trunc:
1582 emit(RNDZ(result_dst, op[0]));
1583 break;
1584 case ir_unop_ceil:
1585 op[0].negate = !op[0].negate;
1586 inst = emit(RNDD(result_dst, op[0]));
1587 this->result.negate = true;
1588 break;
1589 case ir_unop_floor:
1590 inst = emit(RNDD(result_dst, op[0]));
1591 break;
1592 case ir_unop_fract:
1593 inst = emit(FRC(result_dst, op[0]));
1594 break;
1595 case ir_unop_round_even:
1596 emit(RNDE(result_dst, op[0]));
1597 break;
1598
1599 case ir_binop_min:
1600 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1601 break;
1602 case ir_binop_max:
1603 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1604 break;
1605
1606 case ir_binop_pow:
1607 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1608 break;
1609
1610 case ir_unop_bit_not:
1611 inst = emit(NOT(result_dst, op[0]));
1612 break;
1613 case ir_binop_bit_and:
1614 inst = emit(AND(result_dst, op[0], op[1]));
1615 break;
1616 case ir_binop_bit_xor:
1617 inst = emit(XOR(result_dst, op[0], op[1]));
1618 break;
1619 case ir_binop_bit_or:
1620 inst = emit(OR(result_dst, op[0], op[1]));
1621 break;
1622
1623 case ir_binop_lshift:
1624 inst = emit(SHL(result_dst, op[0], op[1]));
1625 break;
1626
1627 case ir_binop_rshift:
1628 if (ir->type->base_type == GLSL_TYPE_INT)
1629 inst = emit(ASR(result_dst, op[0], op[1]));
1630 else
1631 inst = emit(SHR(result_dst, op[0], op[1]));
1632 break;
1633
1634 case ir_binop_bfm:
1635 emit(BFI1(result_dst, op[0], op[1]));
1636 break;
1637
1638 case ir_binop_ubo_load: {
1639 ir_constant *uniform_block = ir->operands[0]->as_constant();
1640 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1641 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1642 src_reg offset;
1643
1644 /* Now, load the vector from that offset. */
1645 assert(ir->type->is_vector() || ir->type->is_scalar());
1646
1647 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1648 packed_consts.type = result.type;
1649 src_reg surf_index =
1650 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1651 if (const_offset_ir) {
1652 if (brw->gen >= 8) {
1653 /* Store the offset in a GRF so we can send-from-GRF. */
1654 offset = src_reg(this, glsl_type::int_type);
1655 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1656 } else {
1657 /* Immediates are fine on older generations since they'll be moved
1658 * to a (potentially fake) MRF at the generator level.
1659 */
1660 offset = src_reg(const_offset / 16);
1661 }
1662 } else {
1663 offset = src_reg(this, glsl_type::uint_type);
1664 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1665 }
1666
1667 if (brw->gen >= 7) {
1668 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1669 grf_offset.type = offset.type;
1670
1671 emit(MOV(grf_offset, offset));
1672
1673 emit(new(mem_ctx) vec4_instruction(this,
1674 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1675 dst_reg(packed_consts),
1676 surf_index,
1677 src_reg(grf_offset)));
1678 } else {
1679 vec4_instruction *pull =
1680 emit(new(mem_ctx) vec4_instruction(this,
1681 VS_OPCODE_PULL_CONSTANT_LOAD,
1682 dst_reg(packed_consts),
1683 surf_index,
1684 offset));
1685 pull->base_mrf = 14;
1686 pull->mlen = 1;
1687 }
1688
1689 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1690 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1691 const_offset % 16 / 4,
1692 const_offset % 16 / 4,
1693 const_offset % 16 / 4);
1694
1695 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1696 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1697 emit(CMP(result_dst, packed_consts, src_reg(0u),
1698 BRW_CONDITIONAL_NZ));
1699 emit(AND(result_dst, result, src_reg(0x1)));
1700 } else {
1701 emit(MOV(result_dst, packed_consts));
1702 }
1703 break;
1704 }
1705
1706 case ir_binop_vector_extract:
1707 assert(!"should have been lowered by vec_index_to_cond_assign");
1708 break;
1709
1710 case ir_triop_fma:
1711 op[0] = fix_3src_operand(op[0]);
1712 op[1] = fix_3src_operand(op[1]);
1713 op[2] = fix_3src_operand(op[2]);
1714 /* Note that the instruction's argument order is reversed from GLSL
1715 * and the IR.
1716 */
1717 emit(MAD(result_dst, op[2], op[1], op[0]));
1718 break;
1719
1720 case ir_triop_lrp:
1721 emit_lrp(result_dst, op[0], op[1], op[2]);
1722 break;
1723
1724 case ir_triop_csel:
1725 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1726 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1727 inst->predicate = BRW_PREDICATE_NORMAL;
1728 break;
1729
1730 case ir_triop_bfi:
1731 op[0] = fix_3src_operand(op[0]);
1732 op[1] = fix_3src_operand(op[1]);
1733 op[2] = fix_3src_operand(op[2]);
1734 emit(BFI2(result_dst, op[0], op[1], op[2]));
1735 break;
1736
1737 case ir_triop_bitfield_extract:
1738 op[0] = fix_3src_operand(op[0]);
1739 op[1] = fix_3src_operand(op[1]);
1740 op[2] = fix_3src_operand(op[2]);
1741 /* Note that the instruction's argument order is reversed from GLSL
1742 * and the IR.
1743 */
1744 emit(BFE(result_dst, op[2], op[1], op[0]));
1745 break;
1746
1747 case ir_triop_vector_insert:
1748 assert(!"should have been lowered by lower_vector_insert");
1749 break;
1750
1751 case ir_quadop_bitfield_insert:
1752 assert(!"not reached: should be handled by "
1753 "bitfield_insert_to_bfm_bfi\n");
1754 break;
1755
1756 case ir_quadop_vector:
1757 assert(!"not reached: should be handled by lower_quadop_vector");
1758 break;
1759
1760 case ir_unop_pack_half_2x16:
1761 emit_pack_half_2x16(result_dst, op[0]);
1762 break;
1763 case ir_unop_unpack_half_2x16:
1764 emit_unpack_half_2x16(result_dst, op[0]);
1765 break;
1766 case ir_unop_pack_snorm_2x16:
1767 case ir_unop_pack_snorm_4x8:
1768 case ir_unop_pack_unorm_2x16:
1769 case ir_unop_pack_unorm_4x8:
1770 case ir_unop_unpack_snorm_2x16:
1771 case ir_unop_unpack_snorm_4x8:
1772 case ir_unop_unpack_unorm_2x16:
1773 case ir_unop_unpack_unorm_4x8:
1774 assert(!"not reached: should be handled by lower_packing_builtins");
1775 break;
1776 case ir_unop_unpack_half_2x16_split_x:
1777 case ir_unop_unpack_half_2x16_split_y:
1778 case ir_binop_pack_half_2x16_split:
1779 assert(!"not reached: should not occur in vertex shader");
1780 break;
1781 case ir_binop_ldexp:
1782 assert(!"not reached: should be handled by ldexp_to_arith()");
1783 break;
1784 }
1785 }
1786
1787
1788 void
1789 vec4_visitor::visit(ir_swizzle *ir)
1790 {
1791 src_reg src;
1792 int i = 0;
1793 int swizzle[4];
1794
1795 /* Note that this is only swizzles in expressions, not those on the left
1796 * hand side of an assignment, which do write masking. See ir_assignment
1797 * for that.
1798 */
1799
1800 ir->val->accept(this);
1801 src = this->result;
1802 assert(src.file != BAD_FILE);
1803
1804 for (i = 0; i < ir->type->vector_elements; i++) {
1805 switch (i) {
1806 case 0:
1807 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1808 break;
1809 case 1:
1810 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1811 break;
1812 case 2:
1813 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1814 break;
1815 case 3:
1816 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1817 break;
1818 }
1819 }
1820 for (; i < 4; i++) {
1821 /* Replicate the last channel out. */
1822 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1823 }
1824
1825 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1826
1827 this->result = src;
1828 }
1829
1830 void
1831 vec4_visitor::visit(ir_dereference_variable *ir)
1832 {
1833 const struct glsl_type *type = ir->type;
1834 dst_reg *reg = variable_storage(ir->var);
1835
1836 if (!reg) {
1837 fail("Failed to find variable storage for %s\n", ir->var->name);
1838 this->result = src_reg(brw_null_reg());
1839 return;
1840 }
1841
1842 this->result = src_reg(*reg);
1843
1844 /* System values get their swizzle from the dst_reg writemask */
1845 if (ir->var->data.mode == ir_var_system_value)
1846 return;
1847
1848 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1849 this->result.swizzle = swizzle_for_size(type->vector_elements);
1850 }
1851
1852
1853 int
1854 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1855 {
1856 /* Under normal circumstances array elements are stored consecutively, so
1857 * the stride is equal to the size of the array element.
1858 */
1859 return type_size(ir->type);
1860 }
1861
1862
1863 void
1864 vec4_visitor::visit(ir_dereference_array *ir)
1865 {
1866 ir_constant *constant_index;
1867 src_reg src;
1868 int array_stride = compute_array_stride(ir);
1869
1870 constant_index = ir->array_index->constant_expression_value();
1871
1872 ir->array->accept(this);
1873 src = this->result;
1874
1875 if (constant_index) {
1876 src.reg_offset += constant_index->value.i[0] * array_stride;
1877 } else {
1878 /* Variable index array dereference. It eats the "vec4" of the
1879 * base of the array and an index that offsets the Mesa register
1880 * index.
1881 */
1882 ir->array_index->accept(this);
1883
1884 src_reg index_reg;
1885
1886 if (array_stride == 1) {
1887 index_reg = this->result;
1888 } else {
1889 index_reg = src_reg(this, glsl_type::int_type);
1890
1891 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1892 }
1893
1894 if (src.reladdr) {
1895 src_reg temp = src_reg(this, glsl_type::int_type);
1896
1897 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1898
1899 index_reg = temp;
1900 }
1901
1902 src.reladdr = ralloc(mem_ctx, src_reg);
1903 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1904 }
1905
1906 /* If the type is smaller than a vec4, replicate the last channel out. */
1907 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1908 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1909 else
1910 src.swizzle = BRW_SWIZZLE_NOOP;
1911 src.type = brw_type_for_base_type(ir->type);
1912
1913 this->result = src;
1914 }
1915
1916 void
1917 vec4_visitor::visit(ir_dereference_record *ir)
1918 {
1919 unsigned int i;
1920 const glsl_type *struct_type = ir->record->type;
1921 int offset = 0;
1922
1923 ir->record->accept(this);
1924
1925 for (i = 0; i < struct_type->length; i++) {
1926 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1927 break;
1928 offset += type_size(struct_type->fields.structure[i].type);
1929 }
1930
1931 /* If the type is smaller than a vec4, replicate the last channel out. */
1932 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1933 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1934 else
1935 this->result.swizzle = BRW_SWIZZLE_NOOP;
1936 this->result.type = brw_type_for_base_type(ir->type);
1937
1938 this->result.reg_offset += offset;
1939 }
1940
1941 /**
1942 * We want to be careful in assignment setup to hit the actual storage
1943 * instead of potentially using a temporary like we might with the
1944 * ir_dereference handler.
1945 */
1946 static dst_reg
1947 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1948 {
1949 /* The LHS must be a dereference. If the LHS is a variable indexed array
1950     * access of a vector, it must be separated into a series of conditional moves
1951 * before reaching this point (see ir_vec_index_to_cond_assign).
1952 */
1953 assert(ir->as_dereference());
1954 ir_dereference_array *deref_array = ir->as_dereference_array();
1955 if (deref_array) {
1956 assert(!deref_array->array->type->is_vector());
1957 }
1958
1959 /* Use the rvalue deref handler for the most part. We'll ignore
1960 * swizzles in it and write swizzles using writemask, though.
1961 */
1962 ir->accept(v);
1963 return dst_reg(v->result);
1964 }
1965
1966 void
1967 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1968 const struct glsl_type *type, uint32_t predicate)
1969 {
1970 if (type->base_type == GLSL_TYPE_STRUCT) {
1971 for (unsigned int i = 0; i < type->length; i++) {
1972 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1973 }
1974 return;
1975 }
1976
1977 if (type->is_array()) {
1978 for (unsigned int i = 0; i < type->length; i++) {
1979 emit_block_move(dst, src, type->fields.array, predicate);
1980 }
1981 return;
1982 }
1983
1984 if (type->is_matrix()) {
1985 const struct glsl_type *vec_type;
1986
1987 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1988 type->vector_elements, 1);
1989
1990 for (int i = 0; i < type->matrix_columns; i++) {
1991 emit_block_move(dst, src, vec_type, predicate);
1992 }
1993 return;
1994 }
1995
1996 assert(type->is_scalar() || type->is_vector());
1997
1998 dst->type = brw_type_for_base_type(type);
1999 src->type = dst->type;
2000
2001 dst->writemask = (1 << type->vector_elements) - 1;
2002
2003 src->swizzle = swizzle_for_size(type->vector_elements);
2004
2005 vec4_instruction *inst = emit(MOV(*dst, *src));
2006 inst->predicate = predicate;
2007
2008 dst->reg_offset++;
2009 src->reg_offset++;
2010 }
2011
2012
2013 /* If the RHS processing resulted in an instruction generating a
2014 * temporary value, and it would be easy to rewrite the instruction to
2015 * generate its result right into the LHS instead, do so. This ends
2016 * up reliably removing instructions where it can be tricky to do so
2017 * later without real UD chain information.
2018 */
2019 bool
2020 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2021 dst_reg dst,
2022 src_reg src,
2023 vec4_instruction *pre_rhs_inst,
2024 vec4_instruction *last_rhs_inst)
2025 {
2026 /* This could be supported, but it would take more smarts. */
2027 if (ir->condition)
2028 return false;
2029
2030 if (pre_rhs_inst == last_rhs_inst)
2031 return false; /* No instructions generated to work with. */
2032
2033 /* Make sure the last instruction generated our source reg. */
2034 if (src.file != GRF ||
2035 src.file != last_rhs_inst->dst.file ||
2036 src.reg != last_rhs_inst->dst.reg ||
2037 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2038 src.reladdr ||
2039 src.abs ||
2040 src.negate ||
2041 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2042 return false;
2043
2044    /* Check that the last instruction fully initialized the channels
2045 * we want to use, in the order we want to use them. We could
2046 * potentially reswizzle the operands of many instructions so that
2047 * we could handle out of order channels, but don't yet.
2048 */
2049
2050 for (unsigned i = 0; i < 4; i++) {
2051 if (dst.writemask & (1 << i)) {
2052 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2053 return false;
2054
2055 if (BRW_GET_SWZ(src.swizzle, i) != i)
2056 return false;
2057 }
2058 }
2059
2060 /* Success! Rewrite the instruction. */
2061 last_rhs_inst->dst.file = dst.file;
2062 last_rhs_inst->dst.reg = dst.reg;
2063 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2064 last_rhs_inst->dst.reladdr = dst.reladdr;
2065 last_rhs_inst->dst.writemask &= dst.writemask;
2066
2067 return true;
2068 }
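/* A minimal standalone sketch of the check above (the helper name and the
 * plain-int swizzle representation are illustrative, not driver API): the
 * rewrite is only legal when every channel we are about to write was produced
 * by the last RHS instruction and is read without reordering.
 */
static bool
example_channels_match_in_order(int dst_writemask, int last_writemask,
                                const int src_swizzle[4])
{
   for (int i = 0; i < 4; i++) {
      if (dst_writemask & (1 << i)) {
         if (!(last_writemask & (1 << i)))
            return false;      /* channel was never written by the producer */
         if (src_swizzle[i] != i)
            return false;      /* channel would be read out of order */
      }
   }
   return true;
}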
2069
2070 void
2071 vec4_visitor::visit(ir_assignment *ir)
2072 {
2073 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2074 uint32_t predicate = BRW_PREDICATE_NONE;
2075
2076 if (!ir->lhs->type->is_scalar() &&
2077 !ir->lhs->type->is_vector()) {
2078 ir->rhs->accept(this);
2079 src_reg src = this->result;
2080
2081 if (ir->condition) {
2082 emit_bool_to_cond_code(ir->condition, &predicate);
2083 }
2084
2085 /* emit_block_move doesn't account for swizzles in the source register.
2086 * This should be ok, since the source register is a structure or an
2087 * array, and those can't be swizzled. But double-check to be sure.
2088 */
2089 assert(src.swizzle ==
2090 (ir->rhs->type->is_matrix()
2091 ? swizzle_for_size(ir->rhs->type->vector_elements)
2092 : BRW_SWIZZLE_NOOP));
2093
2094 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2095 return;
2096 }
2097
2098 /* Now we're down to just a scalar/vector with writemasks. */
2099 int i;
2100
2101 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2102 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2103
2104 ir->rhs->accept(this);
2105
2106 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2107
2108 src_reg src = this->result;
2109
2110 int swizzles[4];
2111 int first_enabled_chan = 0;
2112 int src_chan = 0;
2113
2114 assert(ir->lhs->type->is_vector() ||
2115 ir->lhs->type->is_scalar());
2116 dst.writemask = ir->write_mask;
2117
2118 for (int i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i)) {
2120 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2121 break;
2122 }
2123 }
2124
2125 /* Swizzle a small RHS vector into the channels being written.
2126 *
2127     * GLSL IR treats write_mask as dictating how many channels are
2128     * present on the RHS, while in our instructions we need to make
2129 * those channels appear in the slots of the vec4 they're written to.
2130 */
2131 for (int i = 0; i < 4; i++) {
2132 if (dst.writemask & (1 << i))
2133 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2134 else
2135 swizzles[i] = first_enabled_chan;
2136 }
2137 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2138 swizzles[2], swizzles[3]);
2139
2140 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2141 return;
2142 }
2143
2144 if (ir->condition) {
2145 emit_bool_to_cond_code(ir->condition, &predicate);
2146 }
2147
2148 for (i = 0; i < type_size(ir->lhs->type); i++) {
2149 vec4_instruction *inst = emit(MOV(dst, src));
2150 inst->predicate = predicate;
2151
2152 dst.reg_offset++;
2153 src.reg_offset++;
2154 }
2155 }
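/* A minimal standalone sketch of the remapping above, using plain ints in
 * place of the BRW swizzle macros (the helper name is illustrative, not
 * driver API).  Enabled channels consume RHS components in order; disabled
 * channels just repeat the first enabled channel's selection.
 */
static void
example_remap_rhs_swizzle(int writemask, const int rhs_swizzle[4],
                          int out_swizzle[4])
{
   int first_enabled_chan = 0;
   for (int i = 0; i < 4; i++) {
      if (writemask & (1 << i)) {
         first_enabled_chan = rhs_swizzle[i];
         break;
      }
   }

   int src_chan = 0;
   for (int i = 0; i < 4; i++) {
      if (writemask & (1 << i))
         out_swizzle[i] = rhs_swizzle[src_chan++];  /* pack RHS channels into place */
      else
         out_swizzle[i] = first_enabled_chan;       /* harmless filler selection */
   }
   /* e.g. a vec2 RHS with selection {0, 1, 1, 1} written through writemask
    * .xy yields {0, 1, 0, 0}: .x reads src.x and .y reads src.y.
    */
}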
2156
2157 void
2158 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2159 {
2160 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2161 foreach_list(node, &ir->components) {
2162 ir_constant *field_value = (ir_constant *)node;
2163
2164 emit_constant_values(dst, field_value);
2165 }
2166 return;
2167 }
2168
2169 if (ir->type->is_array()) {
2170 for (unsigned int i = 0; i < ir->type->length; i++) {
2171 emit_constant_values(dst, ir->array_elements[i]);
2172 }
2173 return;
2174 }
2175
2176 if (ir->type->is_matrix()) {
2177 for (int i = 0; i < ir->type->matrix_columns; i++) {
2178 float *vec = &ir->value.f[i * ir->type->vector_elements];
2179
2180 for (int j = 0; j < ir->type->vector_elements; j++) {
2181 dst->writemask = 1 << j;
2182 dst->type = BRW_REGISTER_TYPE_F;
2183
2184 emit(MOV(*dst, src_reg(vec[j])));
2185 }
2186 dst->reg_offset++;
2187 }
2188 return;
2189 }
2190
2191 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2192
2193 for (int i = 0; i < ir->type->vector_elements; i++) {
2194 if (!(remaining_writemask & (1 << i)))
2195 continue;
2196
2197 dst->writemask = 1 << i;
2198 dst->type = brw_type_for_base_type(ir->type);
2199
2200 /* Find other components that match the one we're about to
2201 * write. Emits fewer instructions for things like vec4(0.5,
2202 * 1.5, 1.5, 1.5).
2203 */
2204 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2205 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2206 if (ir->value.b[i] == ir->value.b[j])
2207 dst->writemask |= (1 << j);
2208 } else {
2209 /* u, i, and f storage all line up, so no need for a
2210 * switch case for comparing each type.
2211 */
2212 if (ir->value.u[i] == ir->value.u[j])
2213 dst->writemask |= (1 << j);
2214 }
2215 }
2216
2217 switch (ir->type->base_type) {
2218 case GLSL_TYPE_FLOAT:
2219 emit(MOV(*dst, src_reg(ir->value.f[i])));
2220 break;
2221 case GLSL_TYPE_INT:
2222 emit(MOV(*dst, src_reg(ir->value.i[i])));
2223 break;
2224 case GLSL_TYPE_UINT:
2225 emit(MOV(*dst, src_reg(ir->value.u[i])));
2226 break;
2227 case GLSL_TYPE_BOOL:
2228 emit(MOV(*dst, src_reg(ir->value.b[i])));
2229 break;
2230 default:
2231 assert(!"Non-float/uint/int/bool constant");
2232 break;
2233 }
2234
2235 remaining_writemask &= ~dst->writemask;
2236 }
2237 dst->reg_offset++;
2238 }
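/* A minimal standalone sketch of the coalescing loop above, with plain floats
 * and ints in place of ir_constant and the register types (names are
 * illustrative, not driver API).  Identical components are folded into one
 * writemask so a constant like vec4(0.5, 1.5, 1.5, 1.5) needs two MOVs
 * (masks 0x1 and 0xe) instead of four.
 */
static int
example_constant_writemask_groups(const float values[4], int num_components,
                                  int group_masks[4], float group_values[4])
{
   int num_groups = 0;
   int remaining = (1 << num_components) - 1;

   for (int i = 0; i < num_components; i++) {
      if (!(remaining & (1 << i)))
         continue;

      int mask = 1 << i;
      for (int j = i + 1; j < num_components; j++) {
         if (values[i] == values[j])
            mask |= 1 << j;            /* fold matching components together */
      }

      group_masks[num_groups] = mask;
      group_values[num_groups] = values[i];
      num_groups++;                    /* one MOV per group */

      remaining &= ~mask;
   }

   return num_groups;
}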
2239
2240 void
2241 vec4_visitor::visit(ir_constant *ir)
2242 {
2243 dst_reg dst = dst_reg(this, ir->type);
2244 this->result = src_reg(dst);
2245
2246 emit_constant_values(&dst, ir);
2247 }
2248
2249 void
2250 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2251 {
2252 ir_dereference *deref = static_cast<ir_dereference *>(
2253 ir->actual_parameters.get_head());
2254 ir_variable *location = deref->variable_referenced();
2255 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2256 location->data.atomic.buffer_index);
2257
2258 /* Calculate the surface offset */
2259 src_reg offset(this, glsl_type::uint_type);
2260 ir_dereference_array *deref_array = deref->as_dereference_array();
2261 if (deref_array) {
2262 deref_array->array_index->accept(this);
2263
2264 src_reg tmp(this, glsl_type::uint_type);
2265 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2266 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2267 } else {
2268 offset = location->data.atomic.offset;
2269 }
2270
2271 /* Emit the appropriate machine instruction */
2272 const char *callee = ir->callee->function_name();
2273 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2274
2275 if (!strcmp("__intrinsic_atomic_read", callee)) {
2276 emit_untyped_surface_read(surf_index, dst, offset);
2277
2278 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2279 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2280 src_reg(), src_reg());
2281
2282 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2283 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2284 src_reg(), src_reg());
2285 }
2286 }
2287
2288 void
2289 vec4_visitor::visit(ir_call *ir)
2290 {
2291 const char *callee = ir->callee->function_name();
2292
2293 if (!strcmp("__intrinsic_atomic_read", callee) ||
2294 !strcmp("__intrinsic_atomic_increment", callee) ||
2295 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2296 visit_atomic_counter_intrinsic(ir);
2297 } else {
2298 assert(!"Unsupported intrinsic.");
2299 }
2300 }
2301
2302 src_reg
2303 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2304 {
2305 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2306 inst->base_mrf = 2;
2307 inst->mlen = 1;
2308 inst->sampler = sampler;
2309 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2310 inst->dst.writemask = WRITEMASK_XYZW;
2311
2312    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2313 int param_base = inst->base_mrf;
2314 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2315 int zero_mask = 0xf & ~coord_mask;
2316
2317 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2318 coordinate));
2319
2320 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2321 src_reg(0)));
2322
2323 emit(inst);
2324 return src_reg(inst->dst);
2325 }
2326
2327 void
2328 vec4_visitor::visit(ir_texture *ir)
2329 {
2330 int sampler =
2331 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2332
2333 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2334 * emitting anything other than setting up the constant result.
2335 */
2336 if (ir->op == ir_tg4) {
2337 ir_constant *chan = ir->lod_info.component->as_constant();
2338 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2339 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2340 dst_reg result(this, ir->type);
2341 this->result = src_reg(result);
2342 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2343 return;
2344 }
2345 }
2346
2347 /* Should be lowered by do_lower_texture_projection */
2348 assert(!ir->projector);
2349
2350 /* Should be lowered */
2351 assert(!ir->offset || !ir->offset->type->is_array());
2352
2353 /* Generate code to compute all the subexpression trees. This has to be
2354 * done before loading any values into MRFs for the sampler message since
2355 * generating these values may involve SEND messages that need the MRFs.
2356 */
2357 src_reg coordinate;
2358 if (ir->coordinate) {
2359 ir->coordinate->accept(this);
2360 coordinate = this->result;
2361 }
2362
2363 src_reg shadow_comparitor;
2364 if (ir->shadow_comparitor) {
2365 ir->shadow_comparitor->accept(this);
2366 shadow_comparitor = this->result;
2367 }
2368
2369 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2370 src_reg offset_value;
2371 if (has_nonconstant_offset) {
2372 ir->offset->accept(this);
2373 offset_value = src_reg(this->result);
2374 }
2375
2376 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2377 src_reg lod, dPdx, dPdy, sample_index, mcs;
2378 switch (ir->op) {
2379 case ir_tex:
2380 lod = src_reg(0.0f);
2381 lod_type = glsl_type::float_type;
2382 break;
2383 case ir_txf:
2384 case ir_txl:
2385 case ir_txs:
2386 ir->lod_info.lod->accept(this);
2387 lod = this->result;
2388 lod_type = ir->lod_info.lod->type;
2389 break;
2390 case ir_query_levels:
2391 lod = src_reg(0);
2392 lod_type = glsl_type::int_type;
2393 break;
2394 case ir_txf_ms:
2395 ir->lod_info.sample_index->accept(this);
2396 sample_index = this->result;
2397 sample_index_type = ir->lod_info.sample_index->type;
2398
2399 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2400 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2401 else
2402 mcs = src_reg(0u);
2403 break;
2404 case ir_txd:
2405 ir->lod_info.grad.dPdx->accept(this);
2406 dPdx = this->result;
2407
2408 ir->lod_info.grad.dPdy->accept(this);
2409 dPdy = this->result;
2410
2411 lod_type = ir->lod_info.grad.dPdx->type;
2412 break;
2413 case ir_txb:
2414 case ir_lod:
2415 case ir_tg4:
2416 break;
2417 }
2418
2419 vec4_instruction *inst = NULL;
2420 switch (ir->op) {
2421 case ir_tex:
2422 case ir_txl:
2423 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2424 break;
2425 case ir_txd:
2426 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2427 break;
2428 case ir_txf:
2429 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2430 break;
2431 case ir_txf_ms:
2432 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2433 break;
2434 case ir_txs:
2435 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2436 break;
2437 case ir_tg4:
2438 if (has_nonconstant_offset)
2439 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2440 else
2441 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2442 break;
2443 case ir_query_levels:
2444 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2445 break;
2446 case ir_txb:
2447 assert(!"TXB is not valid for vertex shaders.");
2448 break;
2449 case ir_lod:
2450 assert(!"LOD is not valid for vertex shaders.");
2451 break;
2452 default:
2453 assert(!"Unrecognized tex op");
2454 }
2455
2456 if (ir->offset != NULL && ir->op != ir_txf)
2457 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2458
2459 /* Stuff the channel select bits in the top of the texture offset */
2460 if (ir->op == ir_tg4)
2461 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2462
2463 /* The message header is necessary for:
2464 * - Gen4 (always)
2465 * - Texel offsets
2466 * - Gather channel selection
2467 * - Sampler indices too large to fit in a 4-bit value.
2468 */
2469 inst->header_present =
2470 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2471 sampler >= 16;
2472 inst->base_mrf = 2;
2473 inst->mlen = inst->header_present + 1; /* always at least one */
2474 inst->sampler = sampler;
2475 inst->dst = dst_reg(this, ir->type);
2476 inst->dst.writemask = WRITEMASK_XYZW;
2477 inst->shadow_compare = ir->shadow_comparitor != NULL;
2478
2479 /* MRF for the first parameter */
2480 int param_base = inst->base_mrf + inst->header_present;
2481
2482 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2483 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2484 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2485 } else {
2486 /* Load the coordinate */
2487 /* FINISHME: gl_clamp_mask and saturate */
2488 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2489 int zero_mask = 0xf & ~coord_mask;
2490
2491 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2492 coordinate));
2493
2494 if (zero_mask != 0) {
2495 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2496 src_reg(0)));
2497 }
2498 /* Load the shadow comparitor */
2499 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2500 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2501 WRITEMASK_X),
2502 shadow_comparitor));
2503 inst->mlen++;
2504 }
2505
2506 /* Load the LOD info */
2507 if (ir->op == ir_tex || ir->op == ir_txl) {
2508 int mrf, writemask;
2509 if (brw->gen >= 5) {
2510 mrf = param_base + 1;
2511 if (ir->shadow_comparitor) {
2512 writemask = WRITEMASK_Y;
2513 /* mlen already incremented */
2514 } else {
2515 writemask = WRITEMASK_X;
2516 inst->mlen++;
2517 }
2518 } else /* brw->gen == 4 */ {
2519 mrf = param_base;
2520 writemask = WRITEMASK_W;
2521 }
2522 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2523 } else if (ir->op == ir_txf) {
2524 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2525 } else if (ir->op == ir_txf_ms) {
2526 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2527 sample_index));
2528 if (brw->gen >= 7)
2529 /* MCS data is in the first channel of `mcs`, but we need to get it into
2530 * the .y channel of the second vec4 of params, so replicate .x across
2531 * the whole vec4 and then mask off everything except .y
2532 */
2533 mcs.swizzle = BRW_SWIZZLE_XXXX;
2534 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2535 mcs));
2536 inst->mlen++;
2537 } else if (ir->op == ir_txd) {
2538 const glsl_type *type = lod_type;
2539
2540 if (brw->gen >= 5) {
2541 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2542 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2543 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2544 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2545 inst->mlen++;
2546
2547 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2548 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2549 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2550 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2551 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2552 inst->mlen++;
2553
2554 if (ir->shadow_comparitor) {
2555 emit(MOV(dst_reg(MRF, param_base + 2,
2556 ir->shadow_comparitor->type, WRITEMASK_Z),
2557 shadow_comparitor));
2558 }
2559 }
2560 } else /* brw->gen == 4 */ {
2561 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2562 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2563 inst->mlen += 2;
2564 }
2565 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2566 if (ir->shadow_comparitor) {
2567 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2568 shadow_comparitor));
2569 }
2570
2571 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2572 offset_value));
2573 inst->mlen++;
2574 }
2575 }
2576
2577 emit(inst);
2578
2579    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2580     * faces * layers, but the spec requires just layers.
2581 */
2582 if (ir->op == ir_txs) {
2583 glsl_type const *type = ir->sampler->type;
2584 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2585 type->sampler_array) {
2586 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2587 writemask(inst->dst, WRITEMASK_Z),
2588 src_reg(inst->dst), src_reg(6));
2589 }
2590 }
2591
2592 if (brw->gen == 6 && ir->op == ir_tg4) {
2593 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2594 }
2595
2596 swizzle_result(ir, src_reg(inst->dst), sampler);
2597 }
2598
2599 /**
2600 * Apply workarounds for Gen6 gather with UINT/SINT
2601 */
2602 void
2603 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2604 {
2605 if (!wa)
2606 return;
2607
2608 int width = (wa & WA_8BIT) ? 8 : 16;
2609 dst_reg dst_f = dst;
2610 dst_f.type = BRW_REGISTER_TYPE_F;
2611
2612 /* Convert from UNORM to UINT */
2613 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2614 emit(MOV(dst, src_reg(dst_f)));
2615
2616 if (wa & WA_SIGN) {
2617 /* Reinterpret the UINT value as a signed INT value by
2618 * shifting the sign bit into place, then shifting back
2619 * preserving sign.
2620 */
2621 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2622 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2623 }
2624 }
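/* A minimal standalone sketch of the per-channel arithmetic above (the helper
 * name and the bool flags are illustrative stand-ins for the WA_8BIT/WA_SIGN
 * tests, not driver API).  The gathered UNORM value is rescaled to its
 * integer range, and for signed formats the shift pair sign-extends it from
 * `width` bits to 32 bits, relying on an arithmetic right shift just as the
 * ASR above does.
 */
static int
example_gen6_gather_wa(float unorm, bool is_8bit, bool is_signed)
{
   int width = is_8bit ? 8 : 16;

   /* MUL + MOV: back from UNORM to an integer in [0, 2^width - 1]. */
   int value = (int)(unorm * (float)((1 << width) - 1));

   if (is_signed) {
      unsigned shifted = (unsigned)value << (32 - width); /* sign bit at bit 31 */
      value = (int)shifted >> (32 - width);               /* shift back, keeping sign */
   }

   return value;   /* e.g. an 8-bit 0xff comes back as -1 */
}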
2625
2626 /**
2627 * Set up the gather channel based on the swizzle, for gather4.
2628 */
2629 uint32_t
2630 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2631 {
2632 ir_constant *chan = ir->lod_info.component->as_constant();
2633 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2634 switch (swiz) {
2635 case SWIZZLE_X: return 0;
2636 case SWIZZLE_Y:
2637 /* gather4 sampler is broken for green channel on RG32F --
2638 * we must ask for blue instead.
2639 */
2640 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2641 return 2;
2642 return 1;
2643 case SWIZZLE_Z: return 2;
2644 case SWIZZLE_W: return 3;
2645 default:
2646 assert(!"Not reached"); /* zero, one swizzles handled already */
2647 return 0;
2648 }
2649 }
2650
2651 void
2652 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2653 {
2654 int s = key->tex.swizzles[sampler];
2655
2656 this->result = src_reg(this, ir->type);
2657 dst_reg swizzled_result(this->result);
2658
2659 if (ir->op == ir_query_levels) {
2660 /* # levels is in .w */
2661 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2662 emit(MOV(swizzled_result, orig_val));
2663 return;
2664 }
2665
2666 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2667 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2668 emit(MOV(swizzled_result, orig_val));
2669 return;
2670 }
2671
2672
2673 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2674 int swizzle[4] = {0};
2675
2676 for (int i = 0; i < 4; i++) {
2677 switch (GET_SWZ(s, i)) {
2678 case SWIZZLE_ZERO:
2679 zero_mask |= (1 << i);
2680 break;
2681 case SWIZZLE_ONE:
2682 one_mask |= (1 << i);
2683 break;
2684 default:
2685 copy_mask |= (1 << i);
2686 swizzle[i] = GET_SWZ(s, i);
2687 break;
2688 }
2689 }
2690
2691 if (copy_mask) {
2692 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2693 swizzled_result.writemask = copy_mask;
2694 emit(MOV(swizzled_result, orig_val));
2695 }
2696
2697 if (zero_mask) {
2698 swizzled_result.writemask = zero_mask;
2699 emit(MOV(swizzled_result, src_reg(0.0f)));
2700 }
2701
2702 if (one_mask) {
2703 swizzled_result.writemask = one_mask;
2704 emit(MOV(swizzled_result, src_reg(1.0f)));
2705 }
2706 }
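/* A minimal standalone sketch of the mask classification above, using plain
 * ints for the per-channel selectors (the helper name is illustrative, not
 * driver API; 4 and 5 stand for the zero and one selectors as in
 * SWIZZLE_ZERO/SWIZZLE_ONE).  Each destination channel ends up in exactly one
 * of the three MOVs emitted above.
 */
static void
example_split_texture_swizzle(const int selector[4],
                              int *copy_mask, int *zero_mask, int *one_mask)
{
   *copy_mask = *zero_mask = *one_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (selector[i] == 4)
         *zero_mask |= 1 << i;        /* written with 0.0f */
      else if (selector[i] == 5)
         *one_mask |= 1 << i;         /* written with 1.0f */
      else
         *copy_mask |= 1 << i;        /* copied from the sampled result */
   }
}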
2707
2708 void
2709 vec4_visitor::visit(ir_return *)
2710 {
2711 assert(!"not reached");
2712 }
2713
2714 void
2715 vec4_visitor::visit(ir_discard *)
2716 {
2717 assert(!"not reached");
2718 }
2719
2720 void
2721 vec4_visitor::visit(ir_if *ir)
2722 {
2723 /* Don't point the annotation at the if statement, because then it plus
2724 * the then and else blocks get printed.
2725 */
2726 this->base_ir = ir->condition;
2727
2728 if (brw->gen == 6) {
2729 emit_if_gen6(ir);
2730 } else {
2731 uint32_t predicate;
2732 emit_bool_to_cond_code(ir->condition, &predicate);
2733 emit(IF(predicate));
2734 }
2735
2736 visit_instructions(&ir->then_instructions);
2737
2738 if (!ir->else_instructions.is_empty()) {
2739 this->base_ir = ir->condition;
2740 emit(BRW_OPCODE_ELSE);
2741
2742 visit_instructions(&ir->else_instructions);
2743 }
2744
2745 this->base_ir = ir->condition;
2746 emit(BRW_OPCODE_ENDIF);
2747 }
2748
2749 void
2750 vec4_visitor::visit(ir_emit_vertex *)
2751 {
2752 assert(!"not reached");
2753 }
2754
2755 void
2756 vec4_visitor::visit(ir_end_primitive *)
2757 {
2758 assert(!"not reached");
2759 }
2760
2761 void
2762 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2763 dst_reg dst, src_reg offset,
2764 src_reg src0, src_reg src1)
2765 {
2766 unsigned mlen = 0;
2767
2768 /* Set the atomic operation offset. */
2769 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2770 mlen++;
2771
2772 /* Set the atomic operation arguments. */
2773 if (src0.file != BAD_FILE) {
2774 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2775 mlen++;
2776 }
2777
2778 if (src1.file != BAD_FILE) {
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2780 mlen++;
2781 }
2782
2783 /* Emit the instruction. Note that this maps to the normal SIMD8
2784 * untyped atomic message on Ivy Bridge, but that's OK because
2785 * unused channels will be masked out.
2786 */
2787 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2788 src_reg(atomic_op), src_reg(surf_index));
2789 inst->base_mrf = 0;
2790 inst->mlen = mlen;
2791 }
2792
2793 void
2794 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2795 src_reg offset)
2796 {
2797 /* Set the surface read offset. */
2798 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2799
2800 /* Emit the instruction. Note that this maps to the normal SIMD8
2801 * untyped surface read message, but that's OK because unused
2802 * channels will be masked out.
2803 */
2804 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2805 dst, src_reg(surf_index));
2806 inst->base_mrf = 0;
2807 inst->mlen = 1;
2808 }
2809
2810 void
2811 vec4_visitor::emit_ndc_computation()
2812 {
2813 /* Get the position */
2814 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2815
2816 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2817 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2818 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2819
2820 current_annotation = "NDC";
2821 dst_reg ndc_w = ndc;
2822 ndc_w.writemask = WRITEMASK_W;
2823 src_reg pos_w = pos;
2824 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2825 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2826
2827 dst_reg ndc_xyz = ndc;
2828 ndc_xyz.writemask = WRITEMASK_XYZ;
2829
2830 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2831 }
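/* A minimal standalone sketch of the NDC arithmetic above (the helper name is
 * illustrative, not driver API): an RCP of .w followed by a MUL of .xyz by
 * that reciprocal, with the reciprocal itself landing in .w.
 */
static void
example_ndc_from_clip_position(const float pos[4], float ndc[4])
{
   float inv_w = 1.0f / pos[3];   /* SHADER_OPCODE_RCP on pos.w */

   ndc[0] = pos[0] * inv_w;       /* MUL of .xyz by 1/w */
   ndc[1] = pos[1] * inv_w;
   ndc[2] = pos[2] * inv_w;
   ndc[3] = inv_w;                /* 1/w stays in the .w channel */
}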
2832
2833 void
2834 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2835 {
2836 if (brw->gen < 6 &&
2837 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2838 key->userclip_active || brw->has_negative_rhw_bug)) {
2839 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2840 dst_reg header1_w = header1;
2841 header1_w.writemask = WRITEMASK_W;
2842
2843 emit(MOV(header1, 0u));
2844
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2846 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2847
2848 current_annotation = "Point size";
2849 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2850 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2851 }
2852
2853 if (key->userclip_active) {
2854 current_annotation = "Clipping flags";
2855 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2856 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2857
2858 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2859 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2860 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2861
2862 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2863 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2864 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2865 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2866 }
2867
2868 /* i965 clipping workaround:
2869 * 1) Test for -ve rhw
2870 * 2) If set,
2871 * set ndc = (0,0,0,0)
2872 * set ucp[6] = 1
2873 *
2874 * Later, clipping will detect ucp[6] and ensure the primitive is
2875 * clipped against all fixed planes.
2876 */
2877 if (brw->has_negative_rhw_bug) {
2878 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2879 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2880 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2881 vec4_instruction *inst;
2882 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2883 inst->predicate = BRW_PREDICATE_NORMAL;
2884 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2885 inst->predicate = BRW_PREDICATE_NORMAL;
2886 }
2887
2888 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2889 } else if (brw->gen < 6) {
2890 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2891 } else {
2892 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2893 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2894 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2895 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2896 }
2897 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2898 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2899 src_reg(output_reg[VARYING_SLOT_LAYER])));
2900 }
2901 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2902 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2903 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2904 }
2905 }
2906 }
2907
2908 void
2909 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2910 {
2911 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2912 *
2913 * "If a linked set of shaders forming the vertex stage contains no
2914 * static write to gl_ClipVertex or gl_ClipDistance, but the
2915 * application has requested clipping against user clip planes through
2916 * the API, then the coordinate written to gl_Position is used for
2917 * comparison against the user clip planes."
2918 *
2919 * This function is only called if the shader didn't write to
2920 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2921 * if the user wrote to it; otherwise we use gl_Position.
2922 */
2923 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2924 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2925 clip_vertex = VARYING_SLOT_POS;
2926 }
2927
2928 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2929 ++i) {
2930 reg.writemask = 1 << i;
2931 emit(DP4(reg,
2932 src_reg(output_reg[clip_vertex]),
2933 src_reg(this->userplane[i + offset])));
2934 }
2935 }
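/* A minimal standalone sketch of what each DP4 above computes (the helper
 * name is illustrative, not driver API): the signed distance of the clip
 * vertex, or gl_Position, from one user clip plane.  A negative result means
 * the vertex lies outside that plane.
 */
static float
example_user_clip_distance(const float clip_vertex[4], const float plane[4])
{
   return clip_vertex[0] * plane[0] +
          clip_vertex[1] * plane[1] +
          clip_vertex[2] * plane[2] +
          clip_vertex[3] * plane[3];
}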
2936
2937 void
2938 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2939 {
2940 assert (varying < VARYING_SLOT_MAX);
2941 reg.type = output_reg[varying].type;
2942 current_annotation = output_reg_annotation[varying];
2943 /* Copy the register, saturating if necessary */
2944 vec4_instruction *inst = emit(MOV(reg,
2945 src_reg(output_reg[varying])));
2946 if ((varying == VARYING_SLOT_COL0 ||
2947 varying == VARYING_SLOT_COL1 ||
2948 varying == VARYING_SLOT_BFC0 ||
2949 varying == VARYING_SLOT_BFC1) &&
2950 key->clamp_vertex_color) {
2951 inst->saturate = true;
2952 }
2953 }
2954
2955 void
2956 vec4_visitor::emit_urb_slot(int mrf, int varying)
2957 {
2958 struct brw_reg hw_reg = brw_message_reg(mrf);
2959 dst_reg reg = dst_reg(MRF, mrf);
2960 reg.type = BRW_REGISTER_TYPE_F;
2961
2962 switch (varying) {
2963 case VARYING_SLOT_PSIZ:
2964 /* PSIZ is always in slot 0, and is coupled with other flags. */
2965 current_annotation = "indices, point width, clip flags";
2966 emit_psiz_and_flags(hw_reg);
2967 break;
2968 case BRW_VARYING_SLOT_NDC:
2969 current_annotation = "NDC";
2970 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2971 break;
2972 case VARYING_SLOT_POS:
2973 current_annotation = "gl_Position";
2974 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2975 break;
2976 case VARYING_SLOT_EDGE:
2977 /* This is present when doing unfilled polygons. We're supposed to copy
2978 * the edge flag from the user-provided vertex array
2979 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2980 * of that attribute (starts as 1.0f). This is then used in clipping to
2981 * determine which edges should be drawn as wireframe.
2982 */
2983 current_annotation = "edge flag";
2984 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2985 glsl_type::float_type, WRITEMASK_XYZW))));
2986 break;
2987 case BRW_VARYING_SLOT_PAD:
2988 /* No need to write to this slot */
2989 break;
2990 default:
2991 emit_generic_urb_slot(reg, varying);
2992 break;
2993 }
2994 }
2995
2996 static int
2997 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2998 {
2999 if (brw->gen >= 6) {
3000 /* URB data written (does not include the message header reg) must
3001 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3002 * section 5.4.3.2.2: URB_INTERLEAVED.
3003 *
3004 * URB entries are allocated on a multiple of 1024 bits, so an
3005 * extra 128 bits written here to make the end align to 256 is
3006 * no problem.
3007 */
3008 if ((mlen % 2) != 1)
3009 mlen++;
3010 }
3011
3012 return mlen;
3013 }
3014
3015
3016 /**
3017 * Generates the VUE payload plus the necessary URB write instructions to
3018 * output it.
3019 *
3020 * The VUE layout is documented in Volume 2a.
3021 */
3022 void
3023 vec4_visitor::emit_vertex()
3024 {
3025 /* MRF 0 is reserved for the debugger, so start with message header
3026 * in MRF 1.
3027 */
3028 int base_mrf = 1;
3029 int mrf = base_mrf;
3030 /* In the process of generating our URB write message contents, we
3031 * may need to unspill a register or load from an array. Those
3032 * reads would use MRFs 14-15.
3033 */
3034 int max_usable_mrf = 13;
3035
3036 /* The following assertion verifies that max_usable_mrf causes an
3037 * even-numbered amount of URB write data, which will meet gen6's
3038 * requirements for length alignment.
3039 */
3040 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3041
3042 /* First mrf is the g0-based message header containing URB handles and
3043 * such.
3044 */
3045 emit_urb_write_header(mrf++);
3046
3047 if (brw->gen < 6) {
3048 emit_ndc_computation();
3049 }
3050
3051    /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3052 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3053 current_annotation = "user clip distances";
3054
3055 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3056 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3057
3058 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3059 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3060 }
3061
3062 /* We may need to split this up into several URB writes, so do them in a
3063 * loop.
3064 */
3065 int slot = 0;
3066 bool complete = false;
3067 do {
3068 /* URB offset is in URB row increments, and each of our MRFs is half of
3069 * one of those, since we're doing interleaved writes.
3070 */
3071 int offset = slot / 2;
3072
3073 mrf = base_mrf + 1;
3074 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3075 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3076
3077 /* If this was max_usable_mrf, we can't fit anything more into this
3078 * URB WRITE.
3079 */
3080 if (mrf > max_usable_mrf) {
3081 slot++;
3082 break;
3083 }
3084 }
3085
3086 complete = slot >= prog_data->vue_map.num_slots;
3087 current_annotation = "URB write";
3088 vec4_instruction *inst = emit_urb_write_opcode(complete);
3089 inst->base_mrf = base_mrf;
3090 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3091 inst->offset += offset;
3092 } while(!complete);
3093 }
3094
3095
3096 src_reg
3097 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3098 src_reg *reladdr, int reg_offset)
3099 {
3100 /* Because we store the values to scratch interleaved like our
3101 * vertex data, we need to scale the vec4 index by 2.
3102 */
3103 int message_header_scale = 2;
3104
3105 /* Pre-gen6, the message header uses byte offsets instead of vec4
3106 * (16-byte) offset units.
3107 */
3108 if (brw->gen < 6)
3109 message_header_scale *= 16;
3110
3111 if (reladdr) {
3112 src_reg index = src_reg(this, glsl_type::int_type);
3113
3114 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3115 emit_before(inst, MUL(dst_reg(index),
3116 index, src_reg(message_header_scale)));
3117
3118 return index;
3119 } else {
3120 return src_reg(reg_offset * message_header_scale);
3121 }
3122 }
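/* A minimal standalone sketch of the constant-offset case above (the helper
 * name is illustrative, not driver API).  Values are stored to scratch
 * interleaved like vertex data, so the vec4 index is scaled by 2, and before
 * Gen6 the message header expects byte offsets, adding a further factor of 16.
 */
static int
example_constant_scratch_offset(int reg_offset, int gen)
{
   int message_header_scale = 2;    /* interleaved storage: scale vec4 index by 2 */

   if (gen < 6)
      message_header_scale *= 16;   /* 16 bytes per vec4, so byte units */

   return reg_offset * message_header_scale;
}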
3123
3124 src_reg
3125 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3126 src_reg *reladdr, int reg_offset)
3127 {
3128 if (reladdr) {
3129 src_reg index = src_reg(this, glsl_type::int_type);
3130
3131 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3132
3133 /* Pre-gen6, the message header uses byte offsets instead of vec4
3134 * (16-byte) offset units.
3135 */
3136 if (brw->gen < 6) {
3137 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3138 }
3139
3140 return index;
3141 } else if (brw->gen >= 8) {
3142 /* Store the offset in a GRF so we can send-from-GRF. */
3143 src_reg offset = src_reg(this, glsl_type::int_type);
3144 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3145 return offset;
3146 } else {
3147 int message_header_scale = brw->gen < 6 ? 16 : 1;
3148 return src_reg(reg_offset * message_header_scale);
3149 }
3150 }
3151
3152 /**
3153 * Emits an instruction before @inst to load the value named by @orig_src
3154 * from scratch space at @base_offset to @temp.
3155 *
3156 * @base_offset is measured in 32-byte units (the size of a register).
3157 */
3158 void
3159 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3160 dst_reg temp, src_reg orig_src,
3161 int base_offset)
3162 {
3163 int reg_offset = base_offset + orig_src.reg_offset;
3164 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3165
3166 emit_before(inst, SCRATCH_READ(temp, index));
3167 }
3168
3169 /**
3170 * Emits an instruction after @inst to store the value to be written
3171 * to @orig_dst to scratch space at @base_offset, from @temp.
3172 *
3173 * @base_offset is measured in 32-byte units (the size of a register).
3174 */
3175 void
3176 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3177 {
3178 int reg_offset = base_offset + inst->dst.reg_offset;
3179 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3180
3181 /* Create a temporary register to store *inst's result in.
3182 *
3183 * We have to be careful in MOVing from our temporary result register in
3184 * the scratch write. If we swizzle from channels of the temporary that
3185 * weren't initialized, it will confuse live interval analysis, which will
3186 * make spilling fail to make progress.
3187 */
3188 src_reg temp = src_reg(this, glsl_type::vec4_type);
3189 temp.type = inst->dst.type;
3190 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3191 int swizzles[4];
3192 for (int i = 0; i < 4; i++)
3193 if (inst->dst.writemask & (1 << i))
3194 swizzles[i] = i;
3195 else
3196 swizzles[i] = first_writemask_chan;
3197 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3198 swizzles[2], swizzles[3]);
3199
3200 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3201 inst->dst.writemask));
3202 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3203 write->predicate = inst->predicate;
3204 write->ir = inst->ir;
3205 write->annotation = inst->annotation;
3206 inst->insert_after(write);
3207
3208 inst->dst.file = temp.file;
3209 inst->dst.reg = temp.reg;
3210 inst->dst.reg_offset = temp.reg_offset;
3211 inst->dst.reladdr = NULL;
3212 }
3213
3214 /**
3215 * We can't generally support array access in GRF space, because a
3216 * single instruction's destination can only span 2 contiguous
3217 * registers. So, we send all GRF arrays that get variable index
3218 * access to scratch space.
3219 */
3220 void
3221 vec4_visitor::move_grf_array_access_to_scratch()
3222 {
3223 int scratch_loc[this->virtual_grf_count];
3224
3225 for (int i = 0; i < this->virtual_grf_count; i++) {
3226 scratch_loc[i] = -1;
3227 }
3228
3229 /* First, calculate the set of virtual GRFs that need to be punted
3230     * to scratch due to having any array access on them, and where in
3231     * scratch each one will live.
3232 */
3233 foreach_list(node, &this->instructions) {
3234 vec4_instruction *inst = (vec4_instruction *)node;
3235
3236 if (inst->dst.file == GRF && inst->dst.reladdr &&
3237 scratch_loc[inst->dst.reg] == -1) {
3238 scratch_loc[inst->dst.reg] = c->last_scratch;
3239 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3240 }
3241
3242 for (int i = 0 ; i < 3; i++) {
3243 src_reg *src = &inst->src[i];
3244
3245 if (src->file == GRF && src->reladdr &&
3246 scratch_loc[src->reg] == -1) {
3247 scratch_loc[src->reg] = c->last_scratch;
3248 c->last_scratch += this->virtual_grf_sizes[src->reg];
3249 }
3250 }
3251 }
3252
3253 /* Now, for anything that will be accessed through scratch, rewrite
3254 * it to load/store. Note that this is a _safe list walk, because
3255 * we may generate a new scratch_write instruction after the one
3256 * we're processing.
3257 */
3258 foreach_list_safe(node, &this->instructions) {
3259 vec4_instruction *inst = (vec4_instruction *)node;
3260
3261       /* Set up the annotation tracking for newly generated instructions. */
3262 base_ir = inst->ir;
3263 current_annotation = inst->annotation;
3264
3265 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3266 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3267 }
3268
3269 for (int i = 0 ; i < 3; i++) {
3270 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3271 continue;
3272
3273 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3274
3275 emit_scratch_read(inst, temp, inst->src[i],
3276 scratch_loc[inst->src[i].reg]);
3277
3278 inst->src[i].file = temp.file;
3279 inst->src[i].reg = temp.reg;
3280 inst->src[i].reg_offset = temp.reg_offset;
3281 inst->src[i].reladdr = NULL;
3282 }
3283 }
3284 }
3285
3286 /**
3287 * Emits an instruction before @inst to load the value named by @orig_src
3288 * from the pull constant buffer (surface) at @base_offset to @temp.
3289 */
3290 void
3291 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3292 dst_reg temp, src_reg orig_src,
3293 int base_offset)
3294 {
3295 int reg_offset = base_offset + orig_src.reg_offset;
3296 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3297 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3298 vec4_instruction *load;
3299
3300 if (brw->gen >= 7) {
3301 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3302 grf_offset.type = offset.type;
3303 emit_before(inst, MOV(grf_offset, offset));
3304
3305 load = new(mem_ctx) vec4_instruction(this,
3306 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3307 temp, index, src_reg(grf_offset));
3308 } else {
3309 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3310 temp, index, offset);
3311 load->base_mrf = 14;
3312 load->mlen = 1;
3313 }
3314 emit_before(inst, load);
3315 }
3316
3317 /**
3318 * Implements array access of uniforms by inserting a
3319 * PULL_CONSTANT_LOAD instruction.
3320 *
3321 * Unlike temporary GRF array access (where we don't support it due to
3322 * the difficulty of doing relative addressing on instruction
3323 * destinations), we could potentially do array access of uniforms
3324 * that were loaded in GRF space as push constants. In real-world
3325 * usage we've seen, though, the arrays being used are always larger
3326 * than we could load as push constants, so just always move all
3327 * uniform array access out to a pull constant buffer.
3328 */
3329 void
3330 vec4_visitor::move_uniform_array_access_to_pull_constants()
3331 {
3332 int pull_constant_loc[this->uniforms];
3333
3334 for (int i = 0; i < this->uniforms; i++) {
3335 pull_constant_loc[i] = -1;
3336 }
3337
3338 /* Walk through and find array access of uniforms. Put a copy of that
3339 * uniform in the pull constant buffer.
3340 *
3341 * Note that we don't move constant-indexed accesses to arrays. No
3342 * testing has been done of the performance impact of this choice.
3343 */
3344 foreach_list_safe(node, &this->instructions) {
3345 vec4_instruction *inst = (vec4_instruction *)node;
3346
3347 for (int i = 0 ; i < 3; i++) {
3348 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3349 continue;
3350
3351 int uniform = inst->src[i].reg;
3352
3353 /* If this array isn't already present in the pull constant buffer,
3354 * add it.
3355 */
3356 if (pull_constant_loc[uniform] == -1) {
3357 const float **values = &stage_prog_data->param[uniform * 4];
3358
3359 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3360
3361 assert(uniform < uniform_array_size);
3362 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3363 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3364 = values[j];
3365 }
3366 }
3367
3368          /* Set up the annotation tracking for newly generated instructions. */
3369 base_ir = inst->ir;
3370 current_annotation = inst->annotation;
3371
3372 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3373
3374 emit_pull_constant_load(inst, temp, inst->src[i],
3375 pull_constant_loc[uniform]);
3376
3377 inst->src[i].file = temp.file;
3378 inst->src[i].reg = temp.reg;
3379 inst->src[i].reg_offset = temp.reg_offset;
3380 inst->src[i].reladdr = NULL;
3381 }
3382 }
3383
3384 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3385 * no need to track them as larger-than-vec4 objects. This will be
3386 * relied on in cutting out unused uniform vectors from push
3387 * constants.
3388 */
3389 split_uniform_registers();
3390 }
3391
3392 void
3393 vec4_visitor::resolve_ud_negate(src_reg *reg)
3394 {
3395 if (reg->type != BRW_REGISTER_TYPE_UD ||
3396 !reg->negate)
3397 return;
3398
3399 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3400 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3401 *reg = temp;
3402 }
3403
3404 vec4_visitor::vec4_visitor(struct brw_context *brw,
3405 struct brw_vec4_compile *c,
3406 struct gl_program *prog,
3407 const struct brw_vec4_prog_key *key,
3408 struct brw_vec4_prog_data *prog_data,
3409 struct gl_shader_program *shader_prog,
3410 gl_shader_stage stage,
3411 void *mem_ctx,
3412 bool debug_flag,
3413 bool no_spills,
3414 shader_time_shader_type st_base,
3415 shader_time_shader_type st_written,
3416 shader_time_shader_type st_reset)
3417 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3418 c(c),
3419 key(key),
3420 prog_data(prog_data),
3421 sanity_param_count(0),
3422 fail_msg(NULL),
3423 first_non_payload_grf(0),
3424 need_all_constants_in_pull_buffer(false),
3425 debug_flag(debug_flag),
3426 no_spills(no_spills),
3427 st_base(st_base),
3428 st_written(st_written),
3429 st_reset(st_reset)
3430 {
3431 this->mem_ctx = mem_ctx;
3432 this->failed = false;
3433
3434 this->base_ir = NULL;
3435 this->current_annotation = NULL;
3436 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3437
3438 this->variable_ht = hash_table_ctor(0,
3439 hash_table_pointer_hash,
3440 hash_table_pointer_compare);
3441
3442 this->virtual_grf_start = NULL;
3443 this->virtual_grf_end = NULL;
3444 this->virtual_grf_sizes = NULL;
3445 this->virtual_grf_count = 0;
3446 this->virtual_grf_reg_map = NULL;
3447 this->virtual_grf_reg_count = 0;
3448 this->virtual_grf_array_size = 0;
3449 this->live_intervals_valid = false;
3450
3451 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3452
3453 this->uniforms = 0;
3454
3455 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3456 * at least one. See setup_uniforms() in brw_vec4.cpp.
3457 */
3458 this->uniform_array_size = 1;
3459 if (prog_data) {
3460 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3461 }
3462
3463 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3464 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3465 }
3466
3467 vec4_visitor::~vec4_visitor()
3468 {
3469 hash_table_dtor(this->variable_ht);
3470 }
3471
3472
3473 void
3474 vec4_visitor::fail(const char *format, ...)
3475 {
3476 va_list va;
3477 char *msg;
3478
3479 if (failed)
3480 return;
3481
3482 failed = true;
3483
3484 va_start(va, format);
3485 msg = ralloc_vasprintf(mem_ctx, format, va);
3486 va_end(va);
3487 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3488
3489 this->fail_msg = msg;
3490
3491 if (debug_flag) {
3492 fprintf(stderr, "%s", msg);
3493 }
3494 }
3495
3496 } /* namespace brw */