i965/vec4: Change vec4_visitor::emit_lrp to use MAC for gen<6
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
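
/* Typical use (sketch): CMP pairs with a predicated instruction that
 * consumes the flag result, e.g. the pre-gen6 min/max lowering in
 * emit_minmax() further down:
 *
 *    emit(CMP(dst, src0, src1, conditionalmod));
 *    inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 */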
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
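
/* Usage sketch: every operand of a three-source instruction is passed
 * through this helper first, as try_emit_mad() and the ir_triop_* cases
 * below do, e.g.
 *
 *    src_reg s0 = fix_3src_operand(op[0]);
 *    src_reg s1 = fix_3src_operand(op[1]);
 *    src_reg s2 = fix_3src_operand(op[2]);
 *    emit(BFI2(result_dst, s0, s1, s2));
 *
 * so that uniform and immediate operands are first copied into a GRF the
 * three-source instruction can actually read.
 */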
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except if immediate, which gen7 still
312 * can't use.
313 */
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 6) {
369 return emit_math1_gen6(opcode, dst, src);
370 } else {
371 return emit_math1_gen4(opcode, dst, src);
372 }
373 }
374
375 void
376 vec4_visitor::emit_math2_gen6(enum opcode opcode,
377 dst_reg dst, src_reg src0, src_reg src1)
378 {
379 src0 = fix_math_operand(src0);
380 src1 = fix_math_operand(src1);
381
382 if (dst.writemask != WRITEMASK_XYZW) {
383 /* The gen6 math instruction must be align1, so we can't do
384 * writemasks.
385 */
386 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
387 temp_dst.type = dst.type;
388
389 emit(opcode, temp_dst, src0, src1);
390
391 emit(MOV(dst, src_reg(temp_dst)));
392 } else {
393 emit(opcode, dst, src0, src1);
394 }
395 }
396
397 void
398 vec4_visitor::emit_math2_gen4(enum opcode opcode,
399 dst_reg dst, src_reg src0, src_reg src1)
400 {
401 vec4_instruction *inst = emit(opcode, dst, src0, src1);
402 inst->base_mrf = 1;
403 inst->mlen = 2;
404 }
405
406 void
407 vec4_visitor::emit_math(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 switch (opcode) {
411 case SHADER_OPCODE_POW:
412 case SHADER_OPCODE_INT_QUOTIENT:
413 case SHADER_OPCODE_INT_REMAINDER:
414 break;
415 default:
416 assert(!"not reached: unsupported binary math opcode");
417 return;
418 }
419
420 if (brw->gen >= 6) {
421 return emit_math2_gen6(opcode, dst, src0, src1);
422 } else {
423 return emit_math2_gen4(opcode, dst, src0, src1);
424 }
425 }
426
427 void
428 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
429 {
430 if (brw->gen < 7)
431 assert(!"ir_unop_pack_half_2x16 should be lowered");
432
433 assert(dst.type == BRW_REGISTER_TYPE_UD);
434 assert(src0.type == BRW_REGISTER_TYPE_F);
435
436 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
437 *
438 * Because this instruction does not have a 16-bit floating-point type,
439 * the destination data type must be Word (W).
440 *
441 * The destination must be DWord-aligned and specify a horizontal stride
442 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
443 * each destination channel and the upper word is not modified.
444 *
445 * The above restriction implies that the f32to16 instruction must use
446 * align1 mode, because only in align1 mode is it possible to specify
447 * horizontal stride. We choose here to defy the hardware docs and emit
448 * align16 instructions.
449 *
450 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
451 * instructions. I was partially successful in that the code passed all
452 * tests. However, the code was dubiously correct and fragile, and the
453 * tests were not harsh enough to probe that frailty. Not trusting the
454 * code, I chose instead to remain in align16 mode in defiance of the hw
455 * docs).
456 *
457 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
458 * simulator, emitting a f32to16 in align16 mode with UD as destination
459 * data type is safe. The behavior differs from that specified in the PRM
460 * in that the upper word of each destination channel is cleared to 0.
461 */
462
463 dst_reg tmp_dst(this, glsl_type::uvec2_type);
464 src_reg tmp_src(tmp_dst);
465
466 #if 0
467 /* Verify the undocumented behavior on which the following instructions
468 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
469 * then the result of the bit-or instruction below will be incorrect.
470 *
471 * You should inspect the disasm output in order to verify that the MOV is
472 * not optimized away.
473 */
474 emit(MOV(tmp_dst, src_reg(0x12345678u)));
475 #endif
476
477 /* Give tmp the form below, where "." means untouched.
478 *
479 * w z y x w z y x
480 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
481 *
482 * That the upper word of each write-channel be 0 is required for the
483 * following bit-shift and bit-or instructions to work. Note that this
484 * relies on the undocumented hardware behavior mentioned above.
485 */
486 tmp_dst.writemask = WRITEMASK_XY;
487 emit(F32TO16(tmp_dst, src0));
488
489 /* Give the write-channels of dst the form:
490 * 0xhhhh0000
491 */
492 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
493 emit(SHL(dst, tmp_src, src_reg(16u)));
494
495 /* Finally, give the write-channels of dst the form of packHalf2x16's
496 * output:
497 * 0xhhhhllll
498 */
499 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
500 emit(OR(dst, src_reg(dst), tmp_src));
501 }
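
/* Worked example (sketch) for the sequence above: packing vec2(1.0, -2.0),
 * f32to16 writes 0x00003c00 to tmp.x and 0x0000c000 to tmp.y (upper words
 * cleared per the undocumented behavior noted above).  SHL of tmp.yyyy by
 * 16 gives 0xc0000000, and the final OR with tmp.xxxx yields 0xc0003c00,
 * i.e. the Y half in the high word and the X half in the low word.
 */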
502
503 void
504 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
505 {
506 if (brw->gen < 7)
507 assert(!"ir_unop_unpack_half_2x16 should be lowered");
508
509 assert(dst.type == BRW_REGISTER_TYPE_F);
510 assert(src0.type == BRW_REGISTER_TYPE_UD);
511
512 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
513 *
514 * Because this instruction does not have a 16-bit floating-point type,
515 * the source data type must be Word (W). The destination type must be
516 * F (Float).
517 *
518 * To use W as the source data type, we must adjust horizontal strides,
519 * which is only possible in align1 mode. All my [chadv] attempts at
520 * emitting align1 instructions for unpackHalf2x16 failed to pass the
521 * Piglit tests, so I gave up.
522 *
523 * I've verified that, on gen7 hardware and the simulator, it is safe to
524 * emit f16to32 in align16 mode with UD as source data type.
525 */
526
527 dst_reg tmp_dst(this, glsl_type::uvec2_type);
528 src_reg tmp_src(tmp_dst);
529
530 tmp_dst.writemask = WRITEMASK_X;
531 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
532
533 tmp_dst.writemask = WRITEMASK_Y;
534 emit(SHR(tmp_dst, src0, src_reg(16u)));
535
536 dst.writemask = WRITEMASK_XY;
537 emit(F16TO32(dst, tmp_src));
538 }
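
/* Worked example (sketch) for the sequence above: unpacking 0xc0003c00,
 * the AND leaves 0x00003c00 in tmp.x, the SHR leaves 0x0000c000 in tmp.y,
 * and f16to32 then produces (1.0, -2.0) in dst.xy -- the inverse of the
 * packHalf2x16 example above.
 */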
539
540 void
541 vec4_visitor::visit_instructions(const exec_list *list)
542 {
543 foreach_list(node, list) {
544 ir_instruction *ir = (ir_instruction *)node;
545
546 base_ir = ir;
547 ir->accept(this);
548 }
549 }
550
551
552 static int
553 type_size(const struct glsl_type *type)
554 {
555 unsigned int i;
556 int size;
557
558 switch (type->base_type) {
559 case GLSL_TYPE_UINT:
560 case GLSL_TYPE_INT:
561 case GLSL_TYPE_FLOAT:
562 case GLSL_TYPE_BOOL:
563 if (type->is_matrix()) {
564 return type->matrix_columns;
565 } else {
566 /* Regardless of the size of the vector, it gets a vec4. This is bad
567 * packing for things like floats, but otherwise arrays become a
568 * mess. Hopefully a later pass over the code can pack scalars
569 * down if appropriate.
570 */
571 return 1;
572 }
573 case GLSL_TYPE_ARRAY:
574 assert(type->length > 0);
575 return type_size(type->fields.array) * type->length;
576 case GLSL_TYPE_STRUCT:
577 size = 0;
578 for (i = 0; i < type->length; i++) {
579 size += type_size(type->fields.structure[i].type);
580 }
581 return size;
582 case GLSL_TYPE_SAMPLER:
583 /* Samplers take up one slot in UNIFORMS[], but they're baked in
584 * at link time.
585 */
586 return 1;
587 case GLSL_TYPE_ATOMIC_UINT:
588 return 0;
589 case GLSL_TYPE_IMAGE:
590 case GLSL_TYPE_VOID:
591 case GLSL_TYPE_ERROR:
592 case GLSL_TYPE_INTERFACE:
593 assert(0);
594 break;
595 }
596
597 return 0;
598 }
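
/* A few illustrative results of type_size() under the vec4 packing rules
 * above (each scalar or vector still occupies one full vec4 slot):
 *
 *    float    -> 1        vec3                        -> 1
 *    mat3     -> 3        float[4]                    -> 4
 *                         struct { vec3 a; float b; } -> 2
 */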
599
600 int
601 vec4_visitor::virtual_grf_alloc(int size)
602 {
603 if (virtual_grf_array_size <= virtual_grf_count) {
604 if (virtual_grf_array_size == 0)
605 virtual_grf_array_size = 16;
606 else
607 virtual_grf_array_size *= 2;
608 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
609 virtual_grf_array_size);
610 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
611 virtual_grf_array_size);
612 }
613 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
614 virtual_grf_reg_count += size;
615 virtual_grf_sizes[virtual_grf_count] = size;
616 return virtual_grf_count++;
617 }
618
619 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->swizzle = BRW_SWIZZLE_NOOP;
628 } else {
629 this->swizzle = swizzle_for_size(type->vector_elements);
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
636 {
637 init();
638
639 this->file = GRF;
640 this->reg = v->virtual_grf_alloc(type_size(type));
641
642 if (type->is_array() || type->is_record()) {
643 this->writemask = WRITEMASK_XYZW;
644 } else {
645 this->writemask = (1 << type->vector_elements) - 1;
646 }
647
648 this->type = brw_type_for_base_type(type);
649 }
650
651 /* Our support for uniforms is piggy-backed on the struct
652 * gl_fragment_program, because that's where the values actually
653 * get stored, rather than in some global gl_shader_program uniform
654 * store.
655 */
656 void
657 vec4_visitor::setup_uniform_values(ir_variable *ir)
658 {
659 int namelen = strlen(ir->name);
660
661 /* The data for our (non-builtin) uniforms is stored in a series of
662 * gl_uniform_driver_storage structs for each subcomponent that
663 * glGetUniformLocation() could name. We know it's been set up in the same
664 * order we'd walk the type, so walk the list of storage and find anything
665 * with our name, or the prefix of a component that starts with our name.
666 */
667 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
668 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
669
670 if (strncmp(ir->name, storage->name, namelen) != 0 ||
671 (storage->name[namelen] != 0 &&
672 storage->name[namelen] != '.' &&
673 storage->name[namelen] != '[')) {
674 continue;
675 }
676
677 gl_constant_value *components = storage->storage;
678 unsigned vector_count = (MAX2(storage->array_elements, 1) *
679 storage->type->matrix_columns);
680
681 for (unsigned s = 0; s < vector_count; s++) {
682 assert(uniforms < uniform_array_size);
683 uniform_vector_size[uniforms] = storage->type->vector_elements;
684
685 int i;
686 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
687 stage_prog_data->param[uniforms * 4 + i] = &components->f;
688 components++;
689 }
690 for (; i < 4; i++) {
691 static float zero = 0;
692 stage_prog_data->param[uniforms * 4 + i] = &zero;
693 }
694
695 uniforms++;
696 }
697 }
698 }
699
700 void
701 vec4_visitor::setup_uniform_clipplane_values()
702 {
703 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
704
705 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
706 assert(this->uniforms < uniform_array_size);
707 this->uniform_vector_size[this->uniforms] = 4;
708 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
709 this->userplane[i].type = BRW_REGISTER_TYPE_F;
710 for (int j = 0; j < 4; ++j) {
711 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
712 }
713 ++this->uniforms;
714 }
715 }
716
717 /* Our support for builtin uniforms is even scarier than non-builtin.
718 * It sits on top of the PROG_STATE_VAR parameters that are
719 * automatically updated from GL context state.
720 */
721 void
722 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
723 {
724 const ir_state_slot *const slots = ir->state_slots;
725 assert(ir->state_slots != NULL);
726
727 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
728 /* This state reference has already been setup by ir_to_mesa,
729 * but we'll get the same index back here. We can reference
730 * ParameterValues directly, since unlike brw_fs.cpp, we never
731 * add new state references during compile.
732 */
733 int index = _mesa_add_state_reference(this->prog->Parameters,
734 (gl_state_index *)slots[i].tokens);
735 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
736
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 0;
739 /* Add each of the unique swizzled channels of the element.
740 * This will end up matching the size of the glsl_type of this field.
741 */
742 int last_swiz = -1;
743 for (unsigned int j = 0; j < 4; j++) {
744 int swiz = GET_SWZ(slots[i].swizzle, j);
745 last_swiz = swiz;
746
747 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
748 assert(this->uniforms < uniform_array_size);
749 if (swiz <= last_swiz)
750 this->uniform_vector_size[this->uniforms]++;
751 }
752 this->uniforms++;
753 }
754 }
755
756 dst_reg *
757 vec4_visitor::variable_storage(ir_variable *var)
758 {
759 return (dst_reg *)hash_table_find(this->variable_ht, var);
760 }
761
762 void
763 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
764 {
765 ir_expression *expr = ir->as_expression();
766
767 *predicate = BRW_PREDICATE_NORMAL;
768
769 if (expr) {
770 src_reg op[2];
771 vec4_instruction *inst;
772
773 assert(expr->get_num_operands() <= 2);
774 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
775 expr->operands[i]->accept(this);
776 op[i] = this->result;
777
778 resolve_ud_negate(&op[i]);
779 }
780
781 switch (expr->operation) {
782 case ir_unop_logic_not:
783 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
784 inst->conditional_mod = BRW_CONDITIONAL_Z;
785 break;
786
787 case ir_binop_logic_xor:
788 inst = emit(XOR(dst_null_d(), op[0], op[1]));
789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
790 break;
791
792 case ir_binop_logic_or:
793 inst = emit(OR(dst_null_d(), op[0], op[1]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 break;
796
797 case ir_binop_logic_and:
798 inst = emit(AND(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_unop_f2b:
803 if (brw->gen >= 6) {
804 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
805 } else {
806 inst = emit(MOV(dst_null_f(), op[0]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 }
809 break;
810
811 case ir_unop_i2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_d(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_binop_all_equal:
821 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
822 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
823 break;
824
825 case ir_binop_any_nequal:
826 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
827 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
828 break;
829
830 case ir_unop_any:
831 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
832 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
833 break;
834
835 case ir_binop_greater:
836 case ir_binop_gequal:
837 case ir_binop_less:
838 case ir_binop_lequal:
839 case ir_binop_equal:
840 case ir_binop_nequal:
841 emit(CMP(dst_null_d(), op[0], op[1],
842 brw_conditional_for_comparison(expr->operation)));
843 break;
844
845 default:
846 assert(!"not reached");
847 break;
848 }
849 return;
850 }
851
852 ir->accept(this);
853
854 resolve_ud_negate(&this->result);
855
856 if (brw->gen >= 6) {
857 vec4_instruction *inst = emit(AND(dst_null_d(),
858 this->result, src_reg(1)));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 } else {
861 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
862 inst->conditional_mod = BRW_CONDITIONAL_NZ;
863 }
864 }
865
866 /**
867 * Emit a gen6 IF statement with the comparison folded into the IF
868 * instruction.
869 */
870 void
871 vec4_visitor::emit_if_gen6(ir_if *ir)
872 {
873 ir_expression *expr = ir->condition->as_expression();
874
875 if (expr) {
876 src_reg op[2];
877 dst_reg temp;
878
879 assert(expr->get_num_operands() <= 2);
880 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
881 expr->operands[i]->accept(this);
882 op[i] = this->result;
883 }
884
885 switch (expr->operation) {
886 case ir_unop_logic_not:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
888 return;
889
890 case ir_binop_logic_xor:
891 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_logic_or:
895 temp = dst_reg(this, glsl_type::bool_type);
896 emit(OR(temp, op[0], op[1]));
897 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
898 return;
899
900 case ir_binop_logic_and:
901 temp = dst_reg(this, glsl_type::bool_type);
902 emit(AND(temp, op[0], op[1]));
903 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
904 return;
905
906 case ir_unop_f2b:
907 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_i2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_binop_greater:
915 case ir_binop_gequal:
916 case ir_binop_less:
917 case ir_binop_lequal:
918 case ir_binop_equal:
919 case ir_binop_nequal:
920 emit(IF(op[0], op[1],
921 brw_conditional_for_comparison(expr->operation)));
922 return;
923
924 case ir_binop_all_equal:
925 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
926 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
927 return;
928
929 case ir_binop_any_nequal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
931 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
932 return;
933
934 case ir_unop_any:
935 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 default:
940 assert(!"not reached");
941 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
942 return;
943 }
944 return;
945 }
946
947 ir->condition->accept(this);
948
949 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
950 }
951
952 void
953 vec4_visitor::visit(ir_variable *ir)
954 {
955 dst_reg *reg = NULL;
956
957 if (variable_storage(ir))
958 return;
959
960 switch (ir->data.mode) {
961 case ir_var_shader_in:
962 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
963 break;
964
965 case ir_var_shader_out:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967
968 for (int i = 0; i < type_size(ir->type); i++) {
969 output_reg[ir->data.location + i] = *reg;
970 output_reg[ir->data.location + i].reg_offset = i;
971 output_reg[ir->data.location + i].type =
972 brw_type_for_base_type(ir->type->get_scalar_type());
973 output_reg_annotation[ir->data.location + i] = ir->name;
974 }
975 break;
976
977 case ir_var_auto:
978 case ir_var_temporary:
979 reg = new(mem_ctx) dst_reg(this, ir->type);
980 break;
981
982 case ir_var_uniform:
983 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
984
985 /* Thanks to the lower_ubo_reference pass, we will see only
986 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
987 * variables, so no need for them to be in variable_ht.
988 *
989 * Atomic counters take no uniform storage, no need to do
990 * anything here.
991 */
992 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
993 return;
994
995 /* Track how big the whole uniform variable is, in case we need to put a
996 * copy of its data into pull constants for array access.
997 */
998 assert(this->uniforms < uniform_array_size);
999 this->uniform_size[this->uniforms] = type_size(ir->type);
1000
1001 if (!strncmp(ir->name, "gl_", 3)) {
1002 setup_builtin_uniform_values(ir);
1003 } else {
1004 setup_uniform_values(ir);
1005 }
1006 break;
1007
1008 case ir_var_system_value:
1009 reg = make_reg_for_system_value(ir);
1010 break;
1011
1012 default:
1013 assert(!"not reached");
1014 }
1015
1016 reg->type = brw_type_for_base_type(ir->type);
1017 hash_table_insert(this->variable_ht, reg, ir);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop *ir)
1022 {
1023 /* We don't want debugging output to print the whole body of the
1024 * loop as the annotation.
1025 */
1026 this->base_ir = NULL;
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 visit_instructions(&ir->body_instructions);
1031
1032 emit(BRW_OPCODE_WHILE);
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_loop_jump *ir)
1037 {
1038 switch (ir->mode) {
1039 case ir_loop_jump::jump_break:
1040 emit(BRW_OPCODE_BREAK);
1041 break;
1042 case ir_loop_jump::jump_continue:
1043 emit(BRW_OPCODE_CONTINUE);
1044 break;
1045 }
1046 }
1047
1048
1049 void
1050 vec4_visitor::visit(ir_function_signature *ir)
1051 {
1052 assert(0);
1053 (void)ir;
1054 }
1055
1056 void
1057 vec4_visitor::visit(ir_function *ir)
1058 {
1059 /* Ignore function bodies other than main() -- we shouldn't see calls to
1060 * them since they should all be inlined.
1061 */
1062 if (strcmp(ir->name, "main") == 0) {
1063 const ir_function_signature *sig;
1064 exec_list empty;
1065
1066 sig = ir->matching_signature(NULL, &empty);
1067
1068 assert(sig);
1069
1070 visit_instructions(&sig->body);
1071 }
1072 }
1073
1074 bool
1075 vec4_visitor::try_emit_sat(ir_expression *ir)
1076 {
1077 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1078 if (!sat_src)
1079 return false;
1080
1081 sat_src->accept(this);
1082 src_reg src = this->result;
1083
1084 this->result = src_reg(this, ir->type);
1085 vec4_instruction *inst;
1086 inst = emit(MOV(dst_reg(this->result), src));
1087 inst->saturate = true;
1088
1089 return true;
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1104 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul)
1107 return false;
1108
1109 nonmul->accept(this);
1110 src_reg src0 = fix_3src_operand(this->result);
1111
1112 mul->operands[0]->accept(this);
1113 src_reg src1 = fix_3src_operand(this->result);
1114
1115 mul->operands[1]->accept(this);
1116 src_reg src2 = fix_3src_operand(this->result);
1117
1118 this->result = src_reg(this, ir->type);
1119 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1120
1121 return true;
1122 }
1123
1124 void
1125 vec4_visitor::emit_bool_comparison(unsigned int op,
1126 dst_reg dst, src_reg src0, src_reg src1)
1127 {
1128 /* original gen4 does destination conversion before comparison. */
1129 if (brw->gen < 5)
1130 dst.type = src0.type;
1131
1132 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1133
1134 dst.type = BRW_REGISTER_TYPE_D;
1135 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1136 }
1137
1138 void
1139 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1140 src_reg src0, src_reg src1)
1141 {
1142 vec4_instruction *inst;
1143
1144 if (brw->gen >= 6) {
1145 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1146 inst->conditional_mod = conditionalmod;
1147 } else {
1148 emit(CMP(dst, src0, src1, conditionalmod));
1149
1150 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1151 inst->predicate = BRW_PREDICATE_NORMAL;
1152 }
1153 }
1154
1155 void
1156 vec4_visitor::emit_lrp(const dst_reg &dst,
1157 const src_reg &x, const src_reg &y, const src_reg &a)
1158 {
1159 if (brw->gen >= 6) {
1160 /* Note that the instruction's argument order is reversed from GLSL
1161 * and the IR.
1162 */
1163 emit(LRP(dst,
1164 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1165 } else {
1166 /* Earlier generations don't support three source operations, so we
1167 * need to emit x*(1-a) + y*a.
1168 */
1169 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1170 one_minus_a.writemask = dst.writemask;
1171
1172 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1173 vec4_instruction *mul = emit(MUL(dst_null_f(), y, a));
1174 mul->writes_accumulator = true;
1175 emit(MAC(dst, x, src_reg(one_minus_a)));
1176 }
1177 }
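
/* Illustrative sketch of the gen4/5 path above: for lerp(x, y, a) the
 * three emitted instructions compute x * (1 - a) + y * a roughly as
 *
 *    ADD  tmp,  -a,  1.0f     ; tmp = 1 - a            (one_minus_a)
 *    MUL  null,  y,  a        ; acc = y * a            (accumulator write)
 *    MAC  dst,   x,  tmp      ; dst = x * tmp + acc
 *
 * relying on the MUL's implicit accumulator write being picked up by MAC.
 */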
1178
1179 void
1180 vec4_visitor::visit(ir_expression *ir)
1181 {
1182 unsigned int operand;
1183 src_reg op[Elements(ir->operands)];
1184 src_reg result_src;
1185 dst_reg result_dst;
1186 vec4_instruction *inst;
1187
1188 if (try_emit_sat(ir))
1189 return;
1190
1191 if (ir->operation == ir_binop_add) {
1192 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1193 return;
1194 }
1195
1196 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1197 this->result.file = BAD_FILE;
1198 ir->operands[operand]->accept(this);
1199 if (this->result.file == BAD_FILE) {
1200 fprintf(stderr, "Failed to get tree for expression operand:\n");
1201 ir->operands[operand]->fprint(stderr);
1202 exit(1);
1203 }
1204 op[operand] = this->result;
1205
1206 /* Matrix expression operands should have been broken down to vector
1207 * operations already.
1208 */
1209 assert(!ir->operands[operand]->type->is_matrix());
1210 }
1211
1212 int vector_elements = ir->operands[0]->type->vector_elements;
1213 if (ir->operands[1]) {
1214 vector_elements = MAX2(vector_elements,
1215 ir->operands[1]->type->vector_elements);
1216 }
1217
1218 this->result.file = BAD_FILE;
1219
1220 /* Storage for our result. Ideally for an assignment we'd be using
1221 * the actual storage for the result here, instead.
1222 */
1223 result_src = src_reg(this, ir->type);
1224 /* convenience for the emit functions below. */
1225 result_dst = dst_reg(result_src);
1226 /* If nothing special happens, this is the result. */
1227 this->result = result_src;
1228 /* Limit writes to the channels that will be used by result_src later.
1229 * This does limit this temp's use as a temporary for multi-instruction
1230 * sequences.
1231 */
1232 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1233
1234 switch (ir->operation) {
1235 case ir_unop_logic_not:
1236 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1237 * one's complement of the whole register, not just bit 0.
1238 */
1239 emit(XOR(result_dst, op[0], src_reg(1)));
1240 break;
1241 case ir_unop_neg:
1242 op[0].negate = !op[0].negate;
1243 emit(MOV(result_dst, op[0]));
1244 break;
1245 case ir_unop_abs:
1246 op[0].abs = true;
1247 op[0].negate = false;
1248 emit(MOV(result_dst, op[0]));
1249 break;
1250
1251 case ir_unop_sign:
1252 if (ir->type->is_float()) {
1253 /* AND(val, 0x80000000) gives the sign bit.
1254 *
1255 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1256 * zero.
1257 */
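
/* Worked example (sketch): for val == -3.5f (0xc0600000) the CMP sets the
 * predicate, the AND keeps 0x80000000, and the predicated OR yields
 * 0xbf800000 == -1.0f.  For val == 0.0f the predicate is false, so the
 * AND result 0x00000000 == 0.0f is left as the answer.
 */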
1258 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1259
1260 op[0].type = BRW_REGISTER_TYPE_UD;
1261 result_dst.type = BRW_REGISTER_TYPE_UD;
1262 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1263
1264 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1265 inst->predicate = BRW_PREDICATE_NORMAL;
1266
1267 this->result.type = BRW_REGISTER_TYPE_F;
1268 } else {
1269 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1270 * -> non-negative val generates 0x00000000.
1271 * Predicated OR sets 1 if val is positive.
1272 */
1273 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1274
1275 emit(ASR(result_dst, op[0], src_reg(31)));
1276
1277 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 }
1280 break;
1281
1282 case ir_unop_rcp:
1283 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1284 break;
1285
1286 case ir_unop_exp2:
1287 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1288 break;
1289 case ir_unop_log2:
1290 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1291 break;
1292 case ir_unop_exp:
1293 case ir_unop_log:
1294 assert(!"not reached: should be handled by ir_explog_to_explog2");
1295 break;
1296 case ir_unop_sin:
1297 case ir_unop_sin_reduced:
1298 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1299 break;
1300 case ir_unop_cos:
1301 case ir_unop_cos_reduced:
1302 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1303 break;
1304
1305 case ir_unop_dFdx:
1306 case ir_unop_dFdy:
1307 assert(!"derivatives not valid in vertex shader");
1308 break;
1309
1310 case ir_unop_bitfield_reverse:
1311 emit(BFREV(result_dst, op[0]));
1312 break;
1313 case ir_unop_bit_count:
1314 emit(CBIT(result_dst, op[0]));
1315 break;
1316 case ir_unop_find_msb: {
1317 src_reg temp = src_reg(this, glsl_type::uint_type);
1318
1319 inst = emit(FBH(dst_reg(temp), op[0]));
1320 inst->dst.writemask = WRITEMASK_XYZW;
1321
1322 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1323 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1324 * subtract the result from 31 to convert the MSB count into an LSB count.
1325 */
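
/* Worked example (sketch): for an input of 0x40000000, FBH returns 1
 * (counting from the MSB side) and the predicated ADD below computes
 * 31 - 1 = 30, the LSB-side index findMSB() expects.  For an input of 0,
 * FBH returns 0xFFFFFFFF, the predicate is false, and -1 is left in the
 * result as required.
 */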
1326
1327 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1328 temp.swizzle = BRW_SWIZZLE_NOOP;
1329 emit(MOV(result_dst, temp));
1330
1331 src_reg src_tmp = src_reg(result_dst);
1332 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1333
1334 src_tmp.negate = true;
1335 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 break;
1338 }
1339 case ir_unop_find_lsb:
1340 emit(FBL(result_dst, op[0]));
1341 break;
1342
1343 case ir_unop_noise:
1344 assert(!"not reached: should be handled by lower_noise");
1345 break;
1346
1347 case ir_binop_add:
1348 emit(ADD(result_dst, op[0], op[1]));
1349 break;
1350 case ir_binop_sub:
1351 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1352 break;
1353
1354 case ir_binop_mul:
1355 if (brw->gen < 8 && ir->type->is_integer()) {
1356 /* For integer multiplication, the MUL uses the low 16 bits of one of
1357 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1358 * accumulates the contribution of the upper 16 bits of that
1359 * operand. If we can determine that one of the args is in the low
1360 * 16 bits, though, we can just emit a single MUL.
1361 */
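
/* Illustrative sketch of the full 32-bit multiply emitted by the final
 * else branch below when neither operand fits in 16 bits:
 *
 *    MUL  acc0, op0, op1    ; partial product using the low 16 bits
 *    MACH null, op0, op1    ; folds in the upper 16 bits, result in acc0
 *    MOV  dst,  acc0        ; copy the accumulated product to the GRF
 */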
1362 if (ir->operands[0]->is_uint16_constant()) {
1363 if (brw->gen < 7)
1364 emit(MUL(result_dst, op[0], op[1]));
1365 else
1366 emit(MUL(result_dst, op[1], op[0]));
1367 } else if (ir->operands[1]->is_uint16_constant()) {
1368 if (brw->gen < 7)
1369 emit(MUL(result_dst, op[1], op[0]));
1370 else
1371 emit(MUL(result_dst, op[0], op[1]));
1372 } else {
1373 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1374
1375 emit(MUL(acc, op[0], op[1]));
1376 emit(MACH(dst_null_d(), op[0], op[1]));
1377 emit(MOV(result_dst, src_reg(acc)));
1378 }
1379 } else {
1380 emit(MUL(result_dst, op[0], op[1]));
1381 }
1382 break;
1383 case ir_binop_imul_high: {
1384 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1385
1386 emit(MUL(acc, op[0], op[1]));
1387 emit(MACH(result_dst, op[0], op[1]));
1388 break;
1389 }
1390 case ir_binop_div:
1391 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1392 assert(ir->type->is_integer());
1393 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1394 break;
1395 case ir_binop_carry: {
1396 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1397
1398 emit(ADDC(dst_null_ud(), op[0], op[1]));
1399 emit(MOV(result_dst, src_reg(acc)));
1400 break;
1401 }
1402 case ir_binop_borrow: {
1403 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1404
1405 emit(SUBB(dst_null_ud(), op[0], op[1]));
1406 emit(MOV(result_dst, src_reg(acc)));
1407 break;
1408 }
1409 case ir_binop_mod:
1410 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1411 assert(ir->type->is_integer());
1412 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1413 break;
1414
1415 case ir_binop_less:
1416 case ir_binop_greater:
1417 case ir_binop_lequal:
1418 case ir_binop_gequal:
1419 case ir_binop_equal:
1420 case ir_binop_nequal: {
1421 emit(CMP(result_dst, op[0], op[1],
1422 brw_conditional_for_comparison(ir->operation)));
1423 emit(AND(result_dst, result_src, src_reg(0x1)));
1424 break;
1425 }
1426
1427 case ir_binop_all_equal:
1428 /* "==" operator producing a scalar boolean. */
1429 if (ir->operands[0]->type->is_vector() ||
1430 ir->operands[1]->type->is_vector()) {
1431 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1432 emit(MOV(result_dst, src_reg(0)));
1433 inst = emit(MOV(result_dst, src_reg(1)));
1434 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1435 } else {
1436 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1437 emit(AND(result_dst, result_src, src_reg(0x1)));
1438 }
1439 break;
1440 case ir_binop_any_nequal:
1441 /* "!=" operator producing a scalar boolean. */
1442 if (ir->operands[0]->type->is_vector() ||
1443 ir->operands[1]->type->is_vector()) {
1444 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1445
1446 emit(MOV(result_dst, src_reg(0)));
1447 inst = emit(MOV(result_dst, src_reg(1)));
1448 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1449 } else {
1450 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1451 emit(AND(result_dst, result_src, src_reg(0x1)));
1452 }
1453 break;
1454
1455 case ir_unop_any:
1456 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1457 emit(MOV(result_dst, src_reg(0)));
1458
1459 inst = emit(MOV(result_dst, src_reg(1)));
1460 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1461 break;
1462
1463 case ir_binop_logic_xor:
1464 emit(XOR(result_dst, op[0], op[1]));
1465 break;
1466
1467 case ir_binop_logic_or:
1468 emit(OR(result_dst, op[0], op[1]));
1469 break;
1470
1471 case ir_binop_logic_and:
1472 emit(AND(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_dot:
1476 assert(ir->operands[0]->type->is_vector());
1477 assert(ir->operands[0]->type == ir->operands[1]->type);
1478 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1479 break;
1480
1481 case ir_unop_sqrt:
1482 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1483 break;
1484 case ir_unop_rsq:
1485 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1486 break;
1487
1488 case ir_unop_bitcast_i2f:
1489 case ir_unop_bitcast_u2f:
1490 this->result = op[0];
1491 this->result.type = BRW_REGISTER_TYPE_F;
1492 break;
1493
1494 case ir_unop_bitcast_f2i:
1495 this->result = op[0];
1496 this->result.type = BRW_REGISTER_TYPE_D;
1497 break;
1498
1499 case ir_unop_bitcast_f2u:
1500 this->result = op[0];
1501 this->result.type = BRW_REGISTER_TYPE_UD;
1502 break;
1503
1504 case ir_unop_i2f:
1505 case ir_unop_i2u:
1506 case ir_unop_u2i:
1507 case ir_unop_u2f:
1508 case ir_unop_b2f:
1509 case ir_unop_b2i:
1510 case ir_unop_f2i:
1511 case ir_unop_f2u:
1512 emit(MOV(result_dst, op[0]));
1513 break;
1514 case ir_unop_f2b:
1515 case ir_unop_i2b: {
1516 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1517 emit(AND(result_dst, result_src, src_reg(1)));
1518 break;
1519 }
1520
1521 case ir_unop_trunc:
1522 emit(RNDZ(result_dst, op[0]));
1523 break;
1524 case ir_unop_ceil:
1525 op[0].negate = !op[0].negate;
1526 inst = emit(RNDD(result_dst, op[0]));
1527 this->result.negate = true;
1528 break;
1529 case ir_unop_floor:
1530 inst = emit(RNDD(result_dst, op[0]));
1531 break;
1532 case ir_unop_fract:
1533 inst = emit(FRC(result_dst, op[0]));
1534 break;
1535 case ir_unop_round_even:
1536 emit(RNDE(result_dst, op[0]));
1537 break;
1538
1539 case ir_binop_min:
1540 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1541 break;
1542 case ir_binop_max:
1543 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1544 break;
1545
1546 case ir_binop_pow:
1547 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1548 break;
1549
1550 case ir_unop_bit_not:
1551 inst = emit(NOT(result_dst, op[0]));
1552 break;
1553 case ir_binop_bit_and:
1554 inst = emit(AND(result_dst, op[0], op[1]));
1555 break;
1556 case ir_binop_bit_xor:
1557 inst = emit(XOR(result_dst, op[0], op[1]));
1558 break;
1559 case ir_binop_bit_or:
1560 inst = emit(OR(result_dst, op[0], op[1]));
1561 break;
1562
1563 case ir_binop_lshift:
1564 inst = emit(SHL(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_rshift:
1568 if (ir->type->base_type == GLSL_TYPE_INT)
1569 inst = emit(ASR(result_dst, op[0], op[1]));
1570 else
1571 inst = emit(SHR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_bfm:
1575 emit(BFI1(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_ubo_load: {
1579 ir_constant *uniform_block = ir->operands[0]->as_constant();
1580 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1581 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1582 src_reg offset;
1583
1584 /* Now, load the vector from that offset. */
1585 assert(ir->type->is_vector() || ir->type->is_scalar());
1586
1587 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1588 packed_consts.type = result.type;
1589 src_reg surf_index =
1590 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1591 if (const_offset_ir) {
1592 if (brw->gen >= 8) {
1593 /* Store the offset in a GRF so we can send-from-GRF. */
1594 offset = src_reg(this, glsl_type::int_type);
1595 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1596 } else {
1597 /* Immediates are fine on older generations since they'll be moved
1598 * to a (potentially fake) MRF at the generator level.
1599 */
1600 offset = src_reg(const_offset / 16);
1601 }
1602 } else {
1603 offset = src_reg(this, glsl_type::uint_type);
1604 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1605 }
1606
1607 if (brw->gen >= 7) {
1608 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1609 grf_offset.type = offset.type;
1610
1611 emit(MOV(grf_offset, offset));
1612
1613 emit(new(mem_ctx) vec4_instruction(this,
1614 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1615 dst_reg(packed_consts),
1616 surf_index,
1617 src_reg(grf_offset)));
1618 } else {
1619 vec4_instruction *pull =
1620 emit(new(mem_ctx) vec4_instruction(this,
1621 VS_OPCODE_PULL_CONSTANT_LOAD,
1622 dst_reg(packed_consts),
1623 surf_index,
1624 offset));
1625 pull->base_mrf = 14;
1626 pull->mlen = 1;
1627 }
1628
1629 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1630 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1631 const_offset % 16 / 4,
1632 const_offset % 16 / 4,
1633 const_offset % 16 / 4);
1634
1635 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1636 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1637 emit(CMP(result_dst, packed_consts, src_reg(0u),
1638 BRW_CONDITIONAL_NZ));
1639 emit(AND(result_dst, result, src_reg(0x1)));
1640 } else {
1641 emit(MOV(result_dst, packed_consts));
1642 }
1643 break;
1644 }
1645
1646 case ir_binop_vector_extract:
1647 assert(!"should have been lowered by vec_index_to_cond_assign");
1648 break;
1649
1650 case ir_triop_fma:
1651 op[0] = fix_3src_operand(op[0]);
1652 op[1] = fix_3src_operand(op[1]);
1653 op[2] = fix_3src_operand(op[2]);
1654 /* Note that the instruction's argument order is reversed from GLSL
1655 * and the IR.
1656 */
1657 emit(MAD(result_dst, op[2], op[1], op[0]));
1658 break;
1659
1660 case ir_triop_lrp:
1661 emit_lrp(result_dst, op[0], op[1], op[2]);
1662 break;
1663
1664 case ir_triop_csel:
1665 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1666 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1667 inst->predicate = BRW_PREDICATE_NORMAL;
1668 break;
1669
1670 case ir_triop_bfi:
1671 op[0] = fix_3src_operand(op[0]);
1672 op[1] = fix_3src_operand(op[1]);
1673 op[2] = fix_3src_operand(op[2]);
1674 emit(BFI2(result_dst, op[0], op[1], op[2]));
1675 break;
1676
1677 case ir_triop_bitfield_extract:
1678 op[0] = fix_3src_operand(op[0]);
1679 op[1] = fix_3src_operand(op[1]);
1680 op[2] = fix_3src_operand(op[2]);
1681 /* Note that the instruction's argument order is reversed from GLSL
1682 * and the IR.
1683 */
1684 emit(BFE(result_dst, op[2], op[1], op[0]));
1685 break;
1686
1687 case ir_triop_vector_insert:
1688 assert(!"should have been lowered by lower_vector_insert");
1689 break;
1690
1691 case ir_quadop_bitfield_insert:
1692 assert(!"not reached: should be handled by "
1693 "bitfield_insert_to_bfm_bfi\n");
1694 break;
1695
1696 case ir_quadop_vector:
1697 assert(!"not reached: should be handled by lower_quadop_vector");
1698 break;
1699
1700 case ir_unop_pack_half_2x16:
1701 emit_pack_half_2x16(result_dst, op[0]);
1702 break;
1703 case ir_unop_unpack_half_2x16:
1704 emit_unpack_half_2x16(result_dst, op[0]);
1705 break;
1706 case ir_unop_pack_snorm_2x16:
1707 case ir_unop_pack_snorm_4x8:
1708 case ir_unop_pack_unorm_2x16:
1709 case ir_unop_pack_unorm_4x8:
1710 case ir_unop_unpack_snorm_2x16:
1711 case ir_unop_unpack_snorm_4x8:
1712 case ir_unop_unpack_unorm_2x16:
1713 case ir_unop_unpack_unorm_4x8:
1714 assert(!"not reached: should be handled by lower_packing_builtins");
1715 break;
1716 case ir_unop_unpack_half_2x16_split_x:
1717 case ir_unop_unpack_half_2x16_split_y:
1718 case ir_binop_pack_half_2x16_split:
1719 assert(!"not reached: should not occur in vertex shader");
1720 break;
1721 case ir_binop_ldexp:
1722 assert(!"not reached: should be handled by ldexp_to_arith()");
1723 break;
1724 }
1725 }
1726
1727
1728 void
1729 vec4_visitor::visit(ir_swizzle *ir)
1730 {
1731 src_reg src;
1732 int i = 0;
1733 int swizzle[4];
1734
1735 /* Note that this is only swizzles in expressions, not those on the left
1736 * hand side of an assignment, which do write masking. See ir_assignment
1737 * for that.
1738 */
1739
1740 ir->val->accept(this);
1741 src = this->result;
1742 assert(src.file != BAD_FILE);
1743
1744 for (i = 0; i < ir->type->vector_elements; i++) {
1745 switch (i) {
1746 case 0:
1747 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1748 break;
1749 case 1:
1750 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1751 break;
1752 case 2:
1753 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1754 break;
1755 case 3:
1756 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1757 break;
1758 }
1759 }
1760 for (; i < 4; i++) {
1761 /* Replicate the last channel out. */
1762 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1763 }
1764
1765 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1766
1767 this->result = src;
1768 }
1769
1770 void
1771 vec4_visitor::visit(ir_dereference_variable *ir)
1772 {
1773 const struct glsl_type *type = ir->type;
1774 dst_reg *reg = variable_storage(ir->var);
1775
1776 if (!reg) {
1777 fail("Failed to find variable storage for %s\n", ir->var->name);
1778 this->result = src_reg(brw_null_reg());
1779 return;
1780 }
1781
1782 this->result = src_reg(*reg);
1783
1784 /* System values get their swizzle from the dst_reg writemask */
1785 if (ir->var->data.mode == ir_var_system_value)
1786 return;
1787
1788 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1789 this->result.swizzle = swizzle_for_size(type->vector_elements);
1790 }
1791
1792
1793 int
1794 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1795 {
1796 /* Under normal circumstances array elements are stored consecutively, so
1797 * the stride is equal to the size of the array element.
1798 */
1799 return type_size(ir->type);
1800 }
1801
1802
1803 void
1804 vec4_visitor::visit(ir_dereference_array *ir)
1805 {
1806 ir_constant *constant_index;
1807 src_reg src;
1808 int array_stride = compute_array_stride(ir);
1809
1810 constant_index = ir->array_index->constant_expression_value();
1811
1812 ir->array->accept(this);
1813 src = this->result;
1814
1815 if (constant_index) {
1816 src.reg_offset += constant_index->value.i[0] * array_stride;
1817 } else {
1818 /* Variable index array dereference. It eats the "vec4" of the
1819 * base of the array and an index that offsets the Mesa register
1820 * index.
1821 */
1822 ir->array_index->accept(this);
1823
1824 src_reg index_reg;
1825
1826 if (array_stride == 1) {
1827 index_reg = this->result;
1828 } else {
1829 index_reg = src_reg(this, glsl_type::int_type);
1830
1831 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1832 }
1833
1834 if (src.reladdr) {
1835 src_reg temp = src_reg(this, glsl_type::int_type);
1836
1837 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1838
1839 index_reg = temp;
1840 }
1841
1842 src.reladdr = ralloc(mem_ctx, src_reg);
1843 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1844 }
1845
1846 /* If the type is smaller than a vec4, replicate the last channel out. */
1847 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1848 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1849 else
1850 src.swizzle = BRW_SWIZZLE_NOOP;
1851 src.type = brw_type_for_base_type(ir->type);
1852
1853 this->result = src;
1854 }
1855
1856 void
1857 vec4_visitor::visit(ir_dereference_record *ir)
1858 {
1859 unsigned int i;
1860 const glsl_type *struct_type = ir->record->type;
1861 int offset = 0;
1862
1863 ir->record->accept(this);
1864
1865 for (i = 0; i < struct_type->length; i++) {
1866 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1867 break;
1868 offset += type_size(struct_type->fields.structure[i].type);
1869 }
1870
1871 /* If the type is smaller than a vec4, replicate the last channel out. */
1872 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1873 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1874 else
1875 this->result.swizzle = BRW_SWIZZLE_NOOP;
1876 this->result.type = brw_type_for_base_type(ir->type);
1877
1878 this->result.reg_offset += offset;
1879 }
1880
1881 /**
1882 * We want to be careful in assignment setup to hit the actual storage
1883 * instead of potentially using a temporary like we might with the
1884 * ir_dereference handler.
1885 */
1886 static dst_reg
1887 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1888 {
1889 /* The LHS must be a dereference. If the LHS is a variable indexed array
1890 * access of a vector, it must be separated into a series conditional moves
1891 * before reaching this point (see ir_vec_index_to_cond_assign).
1892 */
1893 assert(ir->as_dereference());
1894 ir_dereference_array *deref_array = ir->as_dereference_array();
1895 if (deref_array) {
1896 assert(!deref_array->array->type->is_vector());
1897 }
1898
1899 /* Use the rvalue deref handler for the most part. We'll ignore
1900 * swizzles in it and write swizzles using writemask, though.
1901 */
1902 ir->accept(v);
1903 return dst_reg(v->result);
1904 }
1905
1906 void
1907 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1908 const struct glsl_type *type, uint32_t predicate)
1909 {
1910 if (type->base_type == GLSL_TYPE_STRUCT) {
1911 for (unsigned int i = 0; i < type->length; i++) {
1912 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1913 }
1914 return;
1915 }
1916
1917 if (type->is_array()) {
1918 for (unsigned int i = 0; i < type->length; i++) {
1919 emit_block_move(dst, src, type->fields.array, predicate);
1920 }
1921 return;
1922 }
1923
1924 if (type->is_matrix()) {
1925 const struct glsl_type *vec_type;
1926
1927 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1928 type->vector_elements, 1);
1929
1930 for (int i = 0; i < type->matrix_columns; i++) {
1931 emit_block_move(dst, src, vec_type, predicate);
1932 }
1933 return;
1934 }
1935
1936 assert(type->is_scalar() || type->is_vector());
1937
1938 dst->type = brw_type_for_base_type(type);
1939 src->type = dst->type;
1940
1941 dst->writemask = (1 << type->vector_elements) - 1;
1942
1943 src->swizzle = swizzle_for_size(type->vector_elements);
1944
1945 vec4_instruction *inst = emit(MOV(*dst, *src));
1946 inst->predicate = predicate;
1947
1948 dst->reg_offset++;
1949 src->reg_offset++;
1950 }
1951
1952
1953 /* If the RHS processing resulted in an instruction generating a
1954 * temporary value, and it would be easy to rewrite the instruction to
1955 * generate its result right into the LHS instead, do so. This ends
1956 * up reliably removing instructions where it can be tricky to do so
1957 * later without real UD chain information.
1958 */
1959 bool
1960 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1961 dst_reg dst,
1962 src_reg src,
1963 vec4_instruction *pre_rhs_inst,
1964 vec4_instruction *last_rhs_inst)
1965 {
1966 /* This could be supported, but it would take more smarts. */
1967 if (ir->condition)
1968 return false;
1969
1970 if (pre_rhs_inst == last_rhs_inst)
1971 return false; /* No instructions generated to work with. */
1972
1973 /* Make sure the last instruction generated our source reg. */
1974 if (src.file != GRF ||
1975 src.file != last_rhs_inst->dst.file ||
1976 src.reg != last_rhs_inst->dst.reg ||
1977 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1978 src.reladdr ||
1979 src.abs ||
1980 src.negate ||
1981 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1982 return false;
1983
1984 /* Check that the last instruction fully initialized the channels
1985 * we want to use, in the order we want to use them. We could
1986 * potentially reswizzle the operands of many instructions so that
1987 * we could handle out of order channels, but don't yet.
1988 */
1989
1990 for (unsigned i = 0; i < 4; i++) {
1991 if (dst.writemask & (1 << i)) {
1992 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1993 return false;
1994
1995 if (BRW_GET_SWZ(src.swizzle, i) != i)
1996 return false;
1997 }
1998 }
1999
2000 /* Success! Rewrite the instruction. */
2001 last_rhs_inst->dst.file = dst.file;
2002 last_rhs_inst->dst.reg = dst.reg;
2003 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2004 last_rhs_inst->dst.reladdr = dst.reladdr;
2005 last_rhs_inst->dst.writemask &= dst.writemask;
2006
2007 return true;
2008 }
2009
2010 void
2011 vec4_visitor::visit(ir_assignment *ir)
2012 {
2013 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2014 uint32_t predicate = BRW_PREDICATE_NONE;
2015
2016 if (!ir->lhs->type->is_scalar() &&
2017 !ir->lhs->type->is_vector()) {
2018 ir->rhs->accept(this);
2019 src_reg src = this->result;
2020
2021 if (ir->condition) {
2022 emit_bool_to_cond_code(ir->condition, &predicate);
2023 }
2024
2025 /* emit_block_move doesn't account for swizzles in the source register.
2026 * This should be ok, since the source register is a structure or an
2027 * array, and those can't be swizzled. But double-check to be sure.
2028 */
2029 assert(src.swizzle ==
2030 (ir->rhs->type->is_matrix()
2031 ? swizzle_for_size(ir->rhs->type->vector_elements)
2032 : BRW_SWIZZLE_NOOP));
2033
2034 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2035 return;
2036 }
2037
2038 /* Now we're down to just a scalar/vector with writemasks. */
2039 int i;
2040
2041 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2042 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2043
2044 ir->rhs->accept(this);
2045
2046 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2047
2048 src_reg src = this->result;
2049
2050 int swizzles[4];
2051 int first_enabled_chan = 0;
2052 int src_chan = 0;
2053
2054 assert(ir->lhs->type->is_vector() ||
2055 ir->lhs->type->is_scalar());
2056 dst.writemask = ir->write_mask;
2057
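/* Find the source channel feeding the first enabled writemask channel;
 * it is reused below to fill the swizzle slots that are not written.
 */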
2058 for (int i = 0; i < 4; i++) {
2059 if (dst.writemask & (1 << i)) {
2060 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2061 break;
2062 }
2063 }
2064
2065 /* Swizzle a small RHS vector into the channels being written.
2066 *
2067 * glsl ir treats write_mask as dictating how many channels are
2068 * present on the RHS, while in our instructions we need to make
2069 * those channels appear in the slots of the vec4 they're written to.
2070 */
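/* For example, a vec2 RHS written with write_mask .xz becomes swizzle
 * (x, x, y, x): RHS.x feeds dst.x, RHS.y feeds dst.z, and the unwritten
 * slots simply repeat the first enabled channel.
 */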
2071 for (int i = 0; i < 4; i++) {
2072 if (dst.writemask & (1 << i))
2073 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2074 else
2075 swizzles[i] = first_enabled_chan;
2076 }
2077 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2078 swizzles[2], swizzles[3]);
2079
2080 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2081 return;
2082 }
2083
2084 if (ir->condition) {
2085 emit_bool_to_cond_code(ir->condition, &predicate);
2086 }
2087
2088 for (i = 0; i < type_size(ir->lhs->type); i++) {
2089 vec4_instruction *inst = emit(MOV(dst, src));
2090 inst->predicate = predicate;
2091
2092 dst.reg_offset++;
2093 src.reg_offset++;
2094 }
2095 }
2096
2097 void
2098 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2099 {
2100 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2101 foreach_list(node, &ir->components) {
2102 ir_constant *field_value = (ir_constant *)node;
2103
2104 emit_constant_values(dst, field_value);
2105 }
2106 return;
2107 }
2108
2109 if (ir->type->is_array()) {
2110 for (unsigned int i = 0; i < ir->type->length; i++) {
2111 emit_constant_values(dst, ir->array_elements[i]);
2112 }
2113 return;
2114 }
2115
2116 if (ir->type->is_matrix()) {
2117 for (int i = 0; i < ir->type->matrix_columns; i++) {
2118 float *vec = &ir->value.f[i * ir->type->vector_elements];
2119
2120 for (int j = 0; j < ir->type->vector_elements; j++) {
2121 dst->writemask = 1 << j;
2122 dst->type = BRW_REGISTER_TYPE_F;
2123
2124 emit(MOV(*dst, src_reg(vec[j])));
2125 }
2126 dst->reg_offset++;
2127 }
2128 return;
2129 }
2130
2131 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2132
2133 for (int i = 0; i < ir->type->vector_elements; i++) {
2134 if (!(remaining_writemask & (1 << i)))
2135 continue;
2136
2137 dst->writemask = 1 << i;
2138 dst->type = brw_type_for_base_type(ir->type);
2139
2140 /* Find other components that match the one we're about to
2141 * write. Emits fewer instructions for things like vec4(0.5,
2142 * 1.5, 1.5, 1.5).
2143 */
2144 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2145 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2146 if (ir->value.b[i] == ir->value.b[j])
2147 dst->writemask |= (1 << j);
2148 } else {
2149 /* u, i, and f storage all line up, so no need for a
2150 * switch case for comparing each type.
2151 */
2152 if (ir->value.u[i] == ir->value.u[j])
2153 dst->writemask |= (1 << j);
2154 }
2155 }
2156
2157 switch (ir->type->base_type) {
2158 case GLSL_TYPE_FLOAT:
2159 emit(MOV(*dst, src_reg(ir->value.f[i])));
2160 break;
2161 case GLSL_TYPE_INT:
2162 emit(MOV(*dst, src_reg(ir->value.i[i])));
2163 break;
2164 case GLSL_TYPE_UINT:
2165 emit(MOV(*dst, src_reg(ir->value.u[i])));
2166 break;
2167 case GLSL_TYPE_BOOL:
2168 emit(MOV(*dst, src_reg(ir->value.b[i])));
2169 break;
2170 default:
2171 assert(!"Non-float/uint/int/bool constant");
2172 break;
2173 }
2174
2175 remaining_writemask &= ~dst->writemask;
2176 }
2177 dst->reg_offset++;
2178 }
2179
2180 void
2181 vec4_visitor::visit(ir_constant *ir)
2182 {
2183 dst_reg dst = dst_reg(this, ir->type);
2184 this->result = src_reg(dst);
2185
2186 emit_constant_values(&dst, ir);
2187 }
2188
2189 void
2190 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2191 {
2192 ir_dereference *deref = static_cast<ir_dereference *>(
2193 ir->actual_parameters.get_head());
2194 ir_variable *location = deref->variable_referenced();
2195 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2196 location->data.atomic.buffer_index);
2197
2198 /* Calculate the surface offset */
2199 src_reg offset(this, glsl_type::uint_type);
2200 ir_dereference_array *deref_array = deref->as_dereference_array();
2201 if (deref_array) {
2202 deref_array->array_index->accept(this);
2203
2204 src_reg tmp(this, glsl_type::uint_type);
2205 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2206 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2207 } else {
2208 offset = location->data.atomic.offset;
2209 }
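/* e.g. for counters[i] this yields offset = i * ATOMIC_COUNTER_SIZE plus
 * the counter's declared offset; a non-array counter just uses the
 * declared offset directly.
 */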
2210
2211 /* Emit the appropriate machine instruction */
2212 const char *callee = ir->callee->function_name();
2213 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2214
2215 if (!strcmp("__intrinsic_atomic_read", callee)) {
2216 emit_untyped_surface_read(surf_index, dst, offset);
2217
2218 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2219 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2220 src_reg(), src_reg());
2221
2222 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2223 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2224 src_reg(), src_reg());
2225 }
2226 }
2227
2228 void
2229 vec4_visitor::visit(ir_call *ir)
2230 {
2231 const char *callee = ir->callee->function_name();
2232
2233 if (!strcmp("__intrinsic_atomic_read", callee) ||
2234 !strcmp("__intrinsic_atomic_increment", callee) ||
2235 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2236 visit_atomic_counter_intrinsic(ir);
2237 } else {
2238 assert(!"Unsupported intrinsic.");
2239 }
2240 }
2241
2242 src_reg
2243 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2244 {
2245 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2246 inst->base_mrf = 2;
2247 inst->mlen = 1;
2248 inst->sampler = sampler;
2249 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2250 inst->dst.writemask = WRITEMASK_XYZW;
2251
2252 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2253 int param_base = inst->base_mrf;
2254 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2255 int zero_mask = 0xf & ~coord_mask;
2256
2257 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2258 coordinate));
2259
2260 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2261 src_reg(0)));
2262
2263 emit(inst);
2264 return src_reg(inst->dst);
2265 }
2266
2267 void
2268 vec4_visitor::visit(ir_texture *ir)
2269 {
2270 int sampler =
2271 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2272
2273 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2274 * emitting anything other than setting up the constant result.
2275 */
2276 if (ir->op == ir_tg4) {
2277 ir_constant *chan = ir->lod_info.component->as_constant();
2278 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2279 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2280 dst_reg result(this, ir->type);
2281 this->result = src_reg(result);
2282 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2283 return;
2284 }
2285 }
2286
2287 /* Should be lowered by do_lower_texture_projection */
2288 assert(!ir->projector);
2289
2290 /* Should be lowered */
2291 assert(!ir->offset || !ir->offset->type->is_array());
2292
2293 /* Generate code to compute all the subexpression trees. This has to be
2294 * done before loading any values into MRFs for the sampler message since
2295 * generating these values may involve SEND messages that need the MRFs.
2296 */
2297 src_reg coordinate;
2298 if (ir->coordinate) {
2299 ir->coordinate->accept(this);
2300 coordinate = this->result;
2301 }
2302
2303 src_reg shadow_comparitor;
2304 if (ir->shadow_comparitor) {
2305 ir->shadow_comparitor->accept(this);
2306 shadow_comparitor = this->result;
2307 }
2308
2309 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2310 src_reg offset_value;
2311 if (has_nonconstant_offset) {
2312 ir->offset->accept(this);
2313 offset_value = src_reg(this->result);
2314 }
2315
2316 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2317 src_reg lod, dPdx, dPdy, sample_index, mcs;
2318 switch (ir->op) {
2319 case ir_tex:
2320 lod = src_reg(0.0f);
2321 lod_type = glsl_type::float_type;
2322 break;
2323 case ir_txf:
2324 case ir_txl:
2325 case ir_txs:
2326 ir->lod_info.lod->accept(this);
2327 lod = this->result;
2328 lod_type = ir->lod_info.lod->type;
2329 break;
2330 case ir_query_levels:
2331 lod = src_reg(0);
2332 lod_type = glsl_type::int_type;
2333 break;
2334 case ir_txf_ms:
2335 ir->lod_info.sample_index->accept(this);
2336 sample_index = this->result;
2337 sample_index_type = ir->lod_info.sample_index->type;
2338
2339 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2340 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2341 else
2342 mcs = src_reg(0u);
2343 break;
2344 case ir_txd:
2345 ir->lod_info.grad.dPdx->accept(this);
2346 dPdx = this->result;
2347
2348 ir->lod_info.grad.dPdy->accept(this);
2349 dPdy = this->result;
2350
2351 lod_type = ir->lod_info.grad.dPdx->type;
2352 break;
2353 case ir_txb:
2354 case ir_lod:
2355 case ir_tg4:
2356 break;
2357 }
2358
2359 vec4_instruction *inst = NULL;
2360 switch (ir->op) {
2361 case ir_tex:
2362 case ir_txl:
2363 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2364 break;
2365 case ir_txd:
2366 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2367 break;
2368 case ir_txf:
2369 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2370 break;
2371 case ir_txf_ms:
2372 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2373 break;
2374 case ir_txs:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2376 break;
2377 case ir_tg4:
2378 if (has_nonconstant_offset)
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2380 else
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2382 break;
2383 case ir_query_levels:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2385 break;
2386 case ir_txb:
2387 assert(!"TXB is not valid for vertex shaders.");
2388 break;
2389 case ir_lod:
2390 assert(!"LOD is not valid for vertex shaders.");
2391 break;
2392 default:
2393 assert(!"Unrecognized tex op");
2394 }
2395
2396 if (ir->offset != NULL && ir->op != ir_txf)
2397 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2398
2399 /* Stuff the channel select bits in the top of the texture offset */
2400 if (ir->op == ir_tg4)
2401 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2402
2403 /* The message header is necessary for:
2404 * - Gen4 (always)
2405 * - Texel offsets
2406 * - Gather channel selection
2407 * - Sampler indices too large to fit in a 4-bit value.
2408 */
2409 inst->header_present =
2410 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2411 sampler >= 16;
2412 inst->base_mrf = 2;
2413 inst->mlen = inst->header_present + 1; /* always at least one */
2414 inst->sampler = sampler;
2415 inst->dst = dst_reg(this, ir->type);
2416 inst->dst.writemask = WRITEMASK_XYZW;
2417 inst->shadow_compare = ir->shadow_comparitor != NULL;
2418
2419 /* MRF for the first parameter */
2420 int param_base = inst->base_mrf + inst->header_present;
2421
2422 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2423 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2424 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2425 } else {
2426 /* Load the coordinate */
2427 /* FINISHME: gl_clamp_mask and saturate */
2428 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2429 int zero_mask = 0xf & ~coord_mask;
2430
2431 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2432 coordinate));
2433
2434 if (zero_mask != 0) {
2435 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2436 src_reg(0)));
2437 }
2438 /* Load the shadow comparitor */
2439 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2440 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2441 WRITEMASK_X),
2442 shadow_comparitor));
2443 inst->mlen++;
2444 }
2445
2446 /* Load the LOD info */
2447 if (ir->op == ir_tex || ir->op == ir_txl) {
2448 int mrf, writemask;
2449 if (brw->gen >= 5) {
2450 mrf = param_base + 1;
2451 if (ir->shadow_comparitor) {
2452 writemask = WRITEMASK_Y;
2453 /* mlen already incremented */
2454 } else {
2455 writemask = WRITEMASK_X;
2456 inst->mlen++;
2457 }
2458 } else /* brw->gen == 4 */ {
2459 mrf = param_base;
2460 writemask = WRITEMASK_W;
2461 }
2462 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2463 } else if (ir->op == ir_txf) {
2464 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2465 } else if (ir->op == ir_txf_ms) {
2466 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2467 sample_index));
2468 if (brw->gen >= 7) {
2469 /* MCS data is in the first channel of `mcs`, but we need to get it into
2470 * the .y channel of the second vec4 of params, so replicate .x across
2471 * the whole vec4 and then mask off everything except .y
2472 */
2473 mcs.swizzle = BRW_SWIZZLE_XXXX;
2474 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2475 mcs));
}
2476 inst->mlen++;
2477 } else if (ir->op == ir_txd) {
2478 const glsl_type *type = lod_type;
2479
2480 if (brw->gen >= 5) {
2481 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2482 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2483 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2484 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2485 inst->mlen++;
2486
2487 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2488 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2489 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2490 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2491 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2492 inst->mlen++;
2493
2494 if (ir->shadow_comparitor) {
2495 emit(MOV(dst_reg(MRF, param_base + 2,
2496 ir->shadow_comparitor->type, WRITEMASK_Z),
2497 shadow_comparitor));
2498 }
2499 }
2500 } else /* brw->gen == 4 */ {
2501 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2502 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2503 inst->mlen += 2;
2504 }
2505 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2506 if (ir->shadow_comparitor) {
2507 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2508 shadow_comparitor));
2509 }
2510
2511 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2512 offset_value));
2513 inst->mlen++;
2514 }
2515 }
2516
2517 emit(inst);
2518
2519 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2520 * spec requires layers.
2521 */
2522 if (ir->op == ir_txs) {
2523 glsl_type const *type = ir->sampler->type;
2524 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2525 type->sampler_array) {
2526 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2527 writemask(inst->dst, WRITEMASK_Z),
2528 src_reg(inst->dst), src_reg(6));
2529 }
2530 }
2531
2532 if (brw->gen == 6 && ir->op == ir_tg4) {
2533 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2534 }
2535
2536 swizzle_result(ir, src_reg(inst->dst), sampler);
2537 }
2538
2539 /**
2540 * Apply workarounds for Gen6 gather with UINT/SINT
2541 */
2542 void
2543 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2544 {
2545 if (!wa)
2546 return;
2547
2548 int width = (wa & WA_8BIT) ? 8 : 16;
2549 dst_reg dst_f = dst;
2550 dst_f.type = BRW_REGISTER_TYPE_F;
2551
2552 /* Convert from UNORM to UINT */
2553 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2554 emit(MOV(dst, src_reg(dst_f)));
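/* e.g. with an 8-bit format the MUL above scales by 255.0f, so the
 * float-to-int MOV recovers the original integer texel value.
 */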
2555
2556 if (wa & WA_SIGN) {
2557 /* Reinterpret the UINT value as a signed INT value by
2558 * shifting the sign bit into place, then shifting back
2559 * preserving sign.
2560 */
2561 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2562 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2563 }
2564 }
2565
2566 /**
2567 * Set up the gather channel based on the swizzle, for gather4.
2568 */
2569 uint32_t
2570 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2571 {
2572 ir_constant *chan = ir->lod_info.component->as_constant();
2573 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2574 switch (swiz) {
2575 case SWIZZLE_X: return 0;
2576 case SWIZZLE_Y:
2577 /* gather4 sampler is broken for green channel on RG32F --
2578 * we must ask for blue instead.
2579 */
2580 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2581 return 2;
2582 return 1;
2583 case SWIZZLE_Z: return 2;
2584 case SWIZZLE_W: return 3;
2585 default:
2586 assert(!"Not reached"); /* zero, one swizzles handled already */
2587 return 0;
2588 }
2589 }
2590
2591 void
2592 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2593 {
2594 int s = key->tex.swizzles[sampler];
2595
2596 this->result = src_reg(this, ir->type);
2597 dst_reg swizzled_result(this->result);
2598
2599 if (ir->op == ir_query_levels) {
2600 /* # levels is in .w */
2601 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2602 emit(MOV(swizzled_result, orig_val));
2603 return;
2604 }
2605
2606 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2607 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2608 emit(MOV(swizzled_result, orig_val));
2609 return;
2610 }
2611
2612
2613 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2614 int swizzle[4] = {0};
2615
2616 for (int i = 0; i < 4; i++) {
2617 switch (GET_SWZ(s, i)) {
2618 case SWIZZLE_ZERO:
2619 zero_mask |= (1 << i);
2620 break;
2621 case SWIZZLE_ONE:
2622 one_mask |= (1 << i);
2623 break;
2624 default:
2625 copy_mask |= (1 << i);
2626 swizzle[i] = GET_SWZ(s, i);
2627 break;
2628 }
2629 }
2630
2631 if (copy_mask) {
2632 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2633 swizzled_result.writemask = copy_mask;
2634 emit(MOV(swizzled_result, orig_val));
2635 }
2636
2637 if (zero_mask) {
2638 swizzled_result.writemask = zero_mask;
2639 emit(MOV(swizzled_result, src_reg(0.0f)));
2640 }
2641
2642 if (one_mask) {
2643 swizzled_result.writemask = one_mask;
2644 emit(MOV(swizzled_result, src_reg(1.0f)));
2645 }
2646 }
2647
2648 void
2649 vec4_visitor::visit(ir_return *ir)
2650 {
2651 assert(!"not reached");
2652 }
2653
2654 void
2655 vec4_visitor::visit(ir_discard *ir)
2656 {
2657 assert(!"not reached");
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_if *ir)
2662 {
2663 /* Don't point the annotation at the if statement, because then it plus
2664 * the then and else blocks get printed.
2665 */
2666 this->base_ir = ir->condition;
2667
2668 if (brw->gen == 6) {
2669 emit_if_gen6(ir);
2670 } else {
2671 uint32_t predicate;
2672 emit_bool_to_cond_code(ir->condition, &predicate);
2673 emit(IF(predicate));
2674 }
2675
2676 visit_instructions(&ir->then_instructions);
2677
2678 if (!ir->else_instructions.is_empty()) {
2679 this->base_ir = ir->condition;
2680 emit(BRW_OPCODE_ELSE);
2681
2682 visit_instructions(&ir->else_instructions);
2683 }
2684
2685 this->base_ir = ir->condition;
2686 emit(BRW_OPCODE_ENDIF);
2687 }
2688
2689 void
2690 vec4_visitor::visit(ir_emit_vertex *)
2691 {
2692 assert(!"not reached");
2693 }
2694
2695 void
2696 vec4_visitor::visit(ir_end_primitive *)
2697 {
2698 assert(!"not reached");
2699 }
2700
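/**
 * Emit an untyped atomic message. The offset and the (up to two) operands
 * each go in the .x channel of their own MRF, starting at MRF 0.
 */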
2701 void
2702 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2703 dst_reg dst, src_reg offset,
2704 src_reg src0, src_reg src1)
2705 {
2706 unsigned mlen = 0;
2707
2708 /* Set the atomic operation offset. */
2709 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2710 mlen++;
2711
2712 /* Set the atomic operation arguments. */
2713 if (src0.file != BAD_FILE) {
2714 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2715 mlen++;
2716 }
2717
2718 if (src1.file != BAD_FILE) {
2719 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2720 mlen++;
2721 }
2722
2723 /* Emit the instruction. Note that this maps to the normal SIMD8
2724 * untyped atomic message on Ivy Bridge, but that's OK because
2725 * unused channels will be masked out.
2726 */
2727 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2728 src_reg(atomic_op), src_reg(surf_index));
2729 inst->base_mrf = 0;
2730 inst->mlen = mlen;
2731 }
2732
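/**
 * Emit an untyped surface read of a single value: the offset goes in the
 * .x channel of MRF 0 and the result is written to @dst.
 */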
2733 void
2734 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2735 src_reg offset)
2736 {
2737 /* Set the surface read offset. */
2738 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2739
2740 /* Emit the instruction. Note that this maps to the normal SIMD8
2741 * untyped surface read message, but that's OK because unused
2742 * channels will be masked out.
2743 */
2744 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2745 dst, src_reg(surf_index));
2746 inst->base_mrf = 0;
2747 inst->mlen = 1;
2748 }
2749
2750 void
2751 vec4_visitor::emit_ndc_computation()
2752 {
2753 /* Get the position */
2754 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2755
2756 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2757 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2758 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2759
2760 current_annotation = "NDC";
2761 dst_reg ndc_w = ndc;
2762 ndc_w.writemask = WRITEMASK_W;
2763 src_reg pos_w = pos;
2764 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2765 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2766
2767 dst_reg ndc_xyz = ndc;
2768 ndc_xyz.writemask = WRITEMASK_XYZ;
2769
2770 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2771 }
2772
2773 void
2774 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2775 {
2776 if (brw->gen < 6 &&
2777 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2778 key->userclip_active || brw->has_negative_rhw_bug)) {
2779 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2780 dst_reg header1_w = header1;
2781 header1_w.writemask = WRITEMASK_W;
2782
2783 emit(MOV(header1, 0u));
2784
2785 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2786 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2787
2788 current_annotation = "Point size";
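/* Scale the float point size by 2^11 and keep an 11-bit field starting
 * at bit 8 of the header dword.
 */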
2789 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2790 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2791 }
2792
2793 if (key->userclip_active) {
2794 current_annotation = "Clipping flags";
2795 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2796 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2797
2798 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2799 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2800 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2801
2802 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2803 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2804 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2805 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2806 }
2807
2808 /* i965 clipping workaround:
2809 * 1) Test for -ve rhw
2810 * 2) If set,
2811 * set ndc = (0,0,0,0)
2812 * set ucp[6] = 1
2813 *
2814 * Later, clipping will detect ucp[6] and ensure the primitive is
2815 * clipped against all fixed planes.
2816 */
2817 if (brw->has_negative_rhw_bug) {
2818 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2819 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2820 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2821 vec4_instruction *inst;
2822 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2823 inst->predicate = BRW_PREDICATE_NORMAL;
2824 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2825 inst->predicate = BRW_PREDICATE_NORMAL;
2826 }
2827
2828 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2829 } else if (brw->gen < 6) {
2830 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2831 } else {
2832 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2833 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2834 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2835 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2836 }
2837 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2838 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2839 src_reg(output_reg[VARYING_SLOT_LAYER])));
2840 }
2841 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2842 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2843 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2844 }
2845 }
2846 }
2847
2848 void
2849 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2850 {
2851 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2852 *
2853 * "If a linked set of shaders forming the vertex stage contains no
2854 * static write to gl_ClipVertex or gl_ClipDistance, but the
2855 * application has requested clipping against user clip planes through
2856 * the API, then the coordinate written to gl_Position is used for
2857 * comparison against the user clip planes."
2858 *
2859 * This function is only called if the shader didn't write to
2860 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2861 * if the user wrote to it; otherwise we use gl_Position.
2862 */
2863 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2864 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2865 clip_vertex = VARYING_SLOT_POS;
2866 }
2867
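/* Each iteration computes one clip distance: channel i of @reg gets the
 * dot product of the clip vertex with user plane (i + offset).
 */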
2868 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2869 ++i) {
2870 reg.writemask = 1 << i;
2871 emit(DP4(reg,
2872 src_reg(output_reg[clip_vertex]),
2873 src_reg(this->userplane[i + offset])));
2874 }
2875 }
2876
2877 void
2878 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2879 {
2880 assert (varying < VARYING_SLOT_MAX);
2881 reg.type = output_reg[varying].type;
2882 current_annotation = output_reg_annotation[varying];
2883 /* Copy the register, saturating if necessary */
2884 vec4_instruction *inst = emit(MOV(reg,
2885 src_reg(output_reg[varying])));
2886 if ((varying == VARYING_SLOT_COL0 ||
2887 varying == VARYING_SLOT_COL1 ||
2888 varying == VARYING_SLOT_BFC0 ||
2889 varying == VARYING_SLOT_BFC1) &&
2890 key->clamp_vertex_color) {
2891 inst->saturate = true;
2892 }
2893 }
2894
2895 void
2896 vec4_visitor::emit_urb_slot(int mrf, int varying)
2897 {
2898 struct brw_reg hw_reg = brw_message_reg(mrf);
2899 dst_reg reg = dst_reg(MRF, mrf);
2900 reg.type = BRW_REGISTER_TYPE_F;
2901
2902 switch (varying) {
2903 case VARYING_SLOT_PSIZ:
2904 /* PSIZ is always in slot 0, and is coupled with other flags. */
2905 current_annotation = "indices, point width, clip flags";
2906 emit_psiz_and_flags(hw_reg);
2907 break;
2908 case BRW_VARYING_SLOT_NDC:
2909 current_annotation = "NDC";
2910 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2911 break;
2912 case VARYING_SLOT_POS:
2913 current_annotation = "gl_Position";
2914 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2915 break;
2916 case VARYING_SLOT_EDGE:
2917 /* This is present when doing unfilled polygons. We're supposed to copy
2918 * the edge flag from the user-provided vertex array
2919 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2920 * of that attribute (starts as 1.0f). This is then used in clipping to
2921 * determine which edges should be drawn as wireframe.
2922 */
2923 current_annotation = "edge flag";
2924 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2925 glsl_type::float_type, WRITEMASK_XYZW))));
2926 break;
2927 case BRW_VARYING_SLOT_PAD:
2928 /* No need to write to this slot */
2929 break;
2930 default:
2931 emit_generic_urb_slot(reg, varying);
2932 break;
2933 }
2934 }
2935
2936 static int
2937 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2938 {
2939 if (brw->gen >= 6) {
2940 /* URB data written (does not include the message header reg) must
2941 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2942 * section 5.4.3.2.2: URB_INTERLEAVED.
2943 *
2944 * URB entries are allocated on a multiple of 1024 bits, so an
2945 * extra 128 bits written here to make the end align to 256 is
2946 * no problem.
2947 */
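/* mlen here still includes the one header register, so forcing mlen odd
 * keeps the data portion (mlen - 1) an even number of registers.
 */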
2948 if ((mlen % 2) != 1)
2949 mlen++;
2950 }
2951
2952 return mlen;
2953 }
2954
2955
2956 /**
2957 * Generates the VUE payload plus the necessary URB write instructions to
2958 * output it.
2959 *
2960 * The VUE layout is documented in Volume 2a.
2961 */
2962 void
2963 vec4_visitor::emit_vertex()
2964 {
2965 /* MRF 0 is reserved for the debugger, so start with message header
2966 * in MRF 1.
2967 */
2968 int base_mrf = 1;
2969 int mrf = base_mrf;
2970 /* In the process of generating our URB write message contents, we
2971 * may need to unspill a register or load from an array. Those
2972 * reads would use MRFs 14-15.
2973 */
2974 int max_usable_mrf = 13;
2975
2976 /* The following assertion verifies that max_usable_mrf causes an
2977 * even-numbered amount of URB write data, which will meet gen6's
2978 * requirements for length alignment.
2979 */
2980 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2981
2982 /* First mrf is the g0-based message header containing URB handles and
2983 * such.
2984 */
2985 emit_urb_write_header(mrf++);
2986
2987 if (brw->gen < 6) {
2988 emit_ndc_computation();
2989 }
2990
2991 /* Lower legacy ff and ClipVertex clipping to clip distances */
2992 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2993 current_annotation = "user clip distances";
2994
2995 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2996 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2997
2998 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2999 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3000 }
3001
3002 /* We may need to split this up into several URB writes, so do them in a
3003 * loop.
3004 */
3005 int slot = 0;
3006 bool complete = false;
3007 do {
3008 /* URB offset is in URB row increments, and each of our MRFs is half of
3009 * one of those, since we're doing interleaved writes.
3010 */
3011 int offset = slot / 2;
3012
3013 mrf = base_mrf + 1;
3014 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3015 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3016
3017 /* If this was max_usable_mrf, we can't fit anything more into this
3018 * URB WRITE.
3019 */
3020 if (mrf > max_usable_mrf) {
3021 slot++;
3022 break;
3023 }
3024 }
3025
3026 complete = slot >= prog_data->vue_map.num_slots;
3027 current_annotation = "URB write";
3028 vec4_instruction *inst = emit_urb_write_opcode(complete);
3029 inst->base_mrf = base_mrf;
3030 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3031 inst->offset += offset;
3032 } while(!complete);
3033 }
3034
3035
3036 src_reg
3037 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3038 src_reg *reladdr, int reg_offset)
3039 {
3040 /* Because we store the values to scratch interleaved like our
3041 * vertex data, we need to scale the vec4 index by 2.
3042 */
3043 int message_header_scale = 2;
3044
3045 /* Pre-gen6, the message header uses byte offsets instead of vec4
3046 * (16-byte) offset units.
3047 */
3048 if (brw->gen < 6)
3049 message_header_scale *= 16;
3050
3051 if (reladdr) {
3052 src_reg index = src_reg(this, glsl_type::int_type);
3053
3054 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3055 emit_before(inst, MUL(dst_reg(index),
3056 index, src_reg(message_header_scale)));
3057
3058 return index;
3059 } else {
3060 return src_reg(reg_offset * message_header_scale);
3061 }
3062 }
3063
3064 src_reg
3065 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3066 src_reg *reladdr, int reg_offset)
3067 {
3068 if (reladdr) {
3069 src_reg index = src_reg(this, glsl_type::int_type);
3070
3071 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3072
3073 /* Pre-gen6, the message header uses byte offsets instead of vec4
3074 * (16-byte) offset units.
3075 */
3076 if (brw->gen < 6) {
3077 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3078 }
3079
3080 return index;
3081 } else if (brw->gen >= 8) {
3082 /* Store the offset in a GRF so we can send-from-GRF. */
3083 src_reg offset = src_reg(this, glsl_type::int_type);
3084 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3085 return offset;
3086 } else {
3087 int message_header_scale = brw->gen < 6 ? 16 : 1;
3088 return src_reg(reg_offset * message_header_scale);
3089 }
3090 }
3091
3092 /**
3093 * Emits an instruction before @inst to load the value named by @orig_src
3094 * from scratch space at @base_offset to @temp.
3095 *
3096 * @base_offset is measured in 32-byte units (the size of a register).
3097 */
3098 void
3099 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3100 dst_reg temp, src_reg orig_src,
3101 int base_offset)
3102 {
3103 int reg_offset = base_offset + orig_src.reg_offset;
3104 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3105
3106 emit_before(inst, SCRATCH_READ(temp, index));
3107 }
3108
3109 /**
3110 * Emits an instruction after @inst to store the value to be written
3111 * to @orig_dst to scratch space at @base_offset, from @temp.
3112 *
3113 * @base_offset is measured in 32-byte units (the size of a register).
3114 */
3115 void
3116 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3117 {
3118 int reg_offset = base_offset + inst->dst.reg_offset;
3119 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3120
3121 /* Create a temporary register to store *inst's result in.
3122 *
3123 * We have to be careful in MOVing from our temporary result register in
3124 * the scratch write. If we swizzle from channels of the temporary that
3125 * weren't initialized, it will confuse live interval analysis, which will
3126 * make spilling fail to make progress.
3127 */
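/* e.g. a destination writemask of .xz produces swizzle (x, x, z, x), so
 * every channel read by the scratch write was actually written by *inst.
 */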
3128 src_reg temp = src_reg(this, glsl_type::vec4_type);
3129 temp.type = inst->dst.type;
3130 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3131 int swizzles[4];
3132 for (int i = 0; i < 4; i++)
3133 if (inst->dst.writemask & (1 << i))
3134 swizzles[i] = i;
3135 else
3136 swizzles[i] = first_writemask_chan;
3137 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3138 swizzles[2], swizzles[3]);
3139
3140 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3141 inst->dst.writemask));
3142 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3143 write->predicate = inst->predicate;
3144 write->ir = inst->ir;
3145 write->annotation = inst->annotation;
3146 inst->insert_after(write);
3147
3148 inst->dst.file = temp.file;
3149 inst->dst.reg = temp.reg;
3150 inst->dst.reg_offset = temp.reg_offset;
3151 inst->dst.reladdr = NULL;
3152 }
3153
3154 /**
3155 * We can't generally support array access in GRF space, because a
3156 * single instruction's destination can only span 2 contiguous
3157 * registers. So, we send all GRF arrays that get variable index
3158 * access to scratch space.
3159 */
3160 void
3161 vec4_visitor::move_grf_array_access_to_scratch()
3162 {
3163 int scratch_loc[this->virtual_grf_count];
3164
3165 for (int i = 0; i < this->virtual_grf_count; i++) {
3166 scratch_loc[i] = -1;
3167 }
3168
3169 /* First, calculate the set of virtual GRFs that need to be punted
3170 * to scratch due to having any array access on them, and where in
3171 * scratch.
3172 */
3173 foreach_list(node, &this->instructions) {
3174 vec4_instruction *inst = (vec4_instruction *)node;
3175
3176 if (inst->dst.file == GRF && inst->dst.reladdr &&
3177 scratch_loc[inst->dst.reg] == -1) {
3178 scratch_loc[inst->dst.reg] = c->last_scratch;
3179 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3180 }
3181
3182 for (int i = 0 ; i < 3; i++) {
3183 src_reg *src = &inst->src[i];
3184
3185 if (src->file == GRF && src->reladdr &&
3186 scratch_loc[src->reg] == -1) {
3187 scratch_loc[src->reg] = c->last_scratch;
3188 c->last_scratch += this->virtual_grf_sizes[src->reg];
3189 }
3190 }
3191 }
3192
3193 /* Now, for anything that will be accessed through scratch, rewrite
3194 * it to load/store. Note that this is a _safe list walk, because
3195 * we may generate a new scratch_write instruction after the one
3196 * we're processing.
3197 */
3198 foreach_list_safe(node, &this->instructions) {
3199 vec4_instruction *inst = (vec4_instruction *)node;
3200
3201 /* Set up the annotation tracking for new generated instructions. */
3202 base_ir = inst->ir;
3203 current_annotation = inst->annotation;
3204
3205 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3206 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3207 }
3208
3209 for (int i = 0 ; i < 3; i++) {
3210 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3211 continue;
3212
3213 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3214
3215 emit_scratch_read(inst, temp, inst->src[i],
3216 scratch_loc[inst->src[i].reg]);
3217
3218 inst->src[i].file = temp.file;
3219 inst->src[i].reg = temp.reg;
3220 inst->src[i].reg_offset = temp.reg_offset;
3221 inst->src[i].reladdr = NULL;
3222 }
3223 }
3224 }
3225
3226 /**
3227 * Emits an instruction before @inst to load the value named by @orig_src
3228 * from the pull constant buffer (surface) at @base_offset to @temp.
3229 */
3230 void
3231 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3232 dst_reg temp, src_reg orig_src,
3233 int base_offset)
3234 {
3235 int reg_offset = base_offset + orig_src.reg_offset;
3236 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3237 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3238 vec4_instruction *load;
3239
3240 if (brw->gen >= 7) {
3241 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3242 grf_offset.type = offset.type;
3243 emit_before(inst, MOV(grf_offset, offset));
3244
3245 load = new(mem_ctx) vec4_instruction(this,
3246 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3247 temp, index, src_reg(grf_offset));
3248 } else {
3249 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3250 temp, index, offset);
3251 load->base_mrf = 14;
3252 load->mlen = 1;
3253 }
3254 emit_before(inst, load);
3255 }
3256
3257 /**
3258 * Implements array access of uniforms by inserting a
3259 * PULL_CONSTANT_LOAD instruction.
3260 *
3261 * Unlike temporary GRF array access (where we don't support it due to
3262 * the difficulty of doing relative addressing on instruction
3263 * destinations), we could potentially do array access of uniforms
3264 * that were loaded in GRF space as push constants. In real-world
3265 * usage we've seen, though, the arrays being used are always larger
3266 * than we could load as push constants, so just always move all
3267 * uniform array access out to a pull constant buffer.
3268 */
3269 void
3270 vec4_visitor::move_uniform_array_access_to_pull_constants()
3271 {
3272 int pull_constant_loc[this->uniforms];
3273
3274 for (int i = 0; i < this->uniforms; i++) {
3275 pull_constant_loc[i] = -1;
3276 }
3277
3278 /* Walk through and find array access of uniforms. Put a copy of that
3279 * uniform in the pull constant buffer.
3280 *
3281 * Note that we don't move constant-indexed accesses to arrays. No
3282 * testing has been done of the performance impact of this choice.
3283 */
3284 foreach_list_safe(node, &this->instructions) {
3285 vec4_instruction *inst = (vec4_instruction *)node;
3286
3287 for (int i = 0 ; i < 3; i++) {
3288 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3289 continue;
3290
3291 int uniform = inst->src[i].reg;
3292
3293 /* If this array isn't already present in the pull constant buffer,
3294 * add it.
3295 */
3296 if (pull_constant_loc[uniform] == -1) {
3297 const float **values = &stage_prog_data->param[uniform * 4];
3298
3299 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3300
3301 assert(uniform < uniform_array_size);
3302 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3303 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3304 = values[j];
3305 }
3306 }
3307
3308 /* Set up the annotation tracking for new generated instructions. */
3309 base_ir = inst->ir;
3310 current_annotation = inst->annotation;
3311
3312 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3313
3314 emit_pull_constant_load(inst, temp, inst->src[i],
3315 pull_constant_loc[uniform]);
3316
3317 inst->src[i].file = temp.file;
3318 inst->src[i].reg = temp.reg;
3319 inst->src[i].reg_offset = temp.reg_offset;
3320 inst->src[i].reladdr = NULL;
3321 }
3322 }
3323
3324 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3325 * no need to track them as larger-than-vec4 objects. This will be
3326 * relied on in cutting out unused uniform vectors from push
3327 * constants.
3328 */
3329 split_uniform_registers();
3330 }
3331
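/**
 * If a UD-typed source has its negate flag set, copy it through a MOV into
 * a temporary so the consuming instruction sees a plain unsigned source
 * with no modifier.
 */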
3332 void
3333 vec4_visitor::resolve_ud_negate(src_reg *reg)
3334 {
3335 if (reg->type != BRW_REGISTER_TYPE_UD ||
3336 !reg->negate)
3337 return;
3338
3339 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3340 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3341 *reg = temp;
3342 }
3343
3344 vec4_visitor::vec4_visitor(struct brw_context *brw,
3345 struct brw_vec4_compile *c,
3346 struct gl_program *prog,
3347 const struct brw_vec4_prog_key *key,
3348 struct brw_vec4_prog_data *prog_data,
3349 struct gl_shader_program *shader_prog,
3350 gl_shader_stage stage,
3351 void *mem_ctx,
3352 bool debug_flag,
3353 bool no_spills,
3354 shader_time_shader_type st_base,
3355 shader_time_shader_type st_written,
3356 shader_time_shader_type st_reset)
3357 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3358 c(c),
3359 key(key),
3360 prog_data(prog_data),
3361 sanity_param_count(0),
3362 fail_msg(NULL),
3363 first_non_payload_grf(0),
3364 need_all_constants_in_pull_buffer(false),
3365 debug_flag(debug_flag),
3366 no_spills(no_spills),
3367 st_base(st_base),
3368 st_written(st_written),
3369 st_reset(st_reset)
3370 {
3371 this->mem_ctx = mem_ctx;
3372 this->failed = false;
3373
3374 this->base_ir = NULL;
3375 this->current_annotation = NULL;
3376 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3377
3378 this->variable_ht = hash_table_ctor(0,
3379 hash_table_pointer_hash,
3380 hash_table_pointer_compare);
3381
3382 this->virtual_grf_start = NULL;
3383 this->virtual_grf_end = NULL;
3384 this->virtual_grf_sizes = NULL;
3385 this->virtual_grf_count = 0;
3386 this->virtual_grf_reg_map = NULL;
3387 this->virtual_grf_reg_count = 0;
3388 this->virtual_grf_array_size = 0;
3389 this->live_intervals_valid = false;
3390
3391 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3392
3393 this->uniforms = 0;
3394
3395 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3396 * at least one. See setup_uniforms() in brw_vec4.cpp.
3397 */
3398 this->uniform_array_size = 1;
3399 if (prog_data) {
3400 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3401 }
3402
3403 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3404 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3405 }
3406
3407 vec4_visitor::~vec4_visitor()
3408 {
3409 hash_table_dtor(this->variable_ht);
3410 }
3411
3412
3413 void
3414 vec4_visitor::fail(const char *format, ...)
3415 {
3416 va_list va;
3417 char *msg;
3418
3419 if (failed)
3420 return;
3421
3422 failed = true;
3423
3424 va_start(va, format);
3425 msg = ralloc_vasprintf(mem_ctx, format, va);
3426 va_end(va);
3427 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3428
3429 this->fail_msg = msg;
3430
3431 if (debug_flag) {
3432 fprintf(stderr, "%s", msg);
3433 }
3434 }
3435
3436 } /* namespace brw */