i965/vs: Add support for LRP instruction.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
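/** Build an IR-annotated instruction: besides the opcode and operands, record
 * the IR node and annotation currently being visited so later debug output
 * can point back at the GLSL IR that produced each instruction.
 */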
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
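/** Insert new_inst into the instruction stream immediately before inst,
 * inheriting inst's IR pointer and annotation so debug output stays attached
 * to the right source.
 */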
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
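/* Convenience constructors for one-, two- and three-source ALU instructions.
 * Note that these only allocate the vec4_instruction; the caller still has to
 * pass the result to emit().
 */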
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 #define ALU3(op) \
111 vec4_instruction * \
112 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
113 { \
114 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
115 src0, src1, src2); \
116 }
117
118 ALU1(NOT)
119 ALU1(MOV)
120 ALU1(FRC)
121 ALU1(RNDD)
122 ALU1(RNDE)
123 ALU1(RNDZ)
124 ALU1(F32TO16)
125 ALU1(F16TO32)
126 ALU2(ADD)
127 ALU2(MUL)
128 ALU2(MACH)
129 ALU2(AND)
130 ALU2(OR)
131 ALU2(XOR)
132 ALU2(DP3)
133 ALU2(DP4)
134 ALU2(DPH)
135 ALU2(SHL)
136 ALU2(SHR)
137 ALU2(ASR)
138 ALU3(LRP)
139
140 /** Gen4 predicated IF. */
141 vec4_instruction *
142 vec4_visitor::IF(uint32_t predicate)
143 {
144 vec4_instruction *inst;
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
147 inst->predicate = predicate;
148
149 return inst;
150 }
151
152 /** Gen6+ IF with embedded comparison. */
153 vec4_instruction *
154 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
155 {
156 assert(intel->gen >= 6);
157
158 vec4_instruction *inst;
159
160 resolve_ud_negate(&src0);
161 resolve_ud_negate(&src1);
162
163 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
164 src0, src1);
165 inst->conditional_mod = condition;
166
167 return inst;
168 }
169
170 /**
171 * CMP: Sets the low bit of the destination channels with the result
172 * of the comparison, while the upper bits are undefined, and updates
173 * the flag register with the packed 16 bits of the result.
174 */
175 vec4_instruction *
176 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
177 {
178 vec4_instruction *inst;
179
180 /* The original gen4 does type conversion to the destination type
181 * before comparison, producing garbage results for floating
182 * point comparisons.
183 */
184 if (intel->gen == 4) {
185 dst.type = src0.type;
186 if (dst.file == HW_REG)
187 dst.fixed_hw_reg.type = dst.type;
188 }
189
190 resolve_ud_negate(&src0);
191 resolve_ud_negate(&src1);
192
193 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
194 inst->conditional_mod = condition;
195
196 return inst;
197 }
198
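/** Read one vec4 back from scratch space.  The message payload occupies two
 * MRFs starting at MRF 14.
 */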
199 vec4_instruction *
200 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
201 {
202 vec4_instruction *inst;
203
204 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
205 dst, index);
206 inst->base_mrf = 14;
207 inst->mlen = 2;
208
209 return inst;
210 }
211
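/** Write one vec4 out to scratch space.  The message payload occupies three
 * MRFs starting at MRF 13.
 */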
212 vec4_instruction *
213 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
214 {
215 vec4_instruction *inst;
216
217 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
218 dst, src, index);
219 inst->base_mrf = 13;
220 inst->mlen = 3;
221
222 return inst;
223 }
224
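/** Emit a DP2, DP3 or DP4 according to how many components (2-4) are being
 * dotted together.
 */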
225 void
226 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
227 {
228 static enum opcode dot_opcodes[] = {
229 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
230 };
231
232 emit(dot_opcodes[elements - 2], dst, src0, src1);
233 }
234
235 src_reg
236 vec4_visitor::fix_3src_operand(src_reg src)
237 {
238 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
239 * able to use vertical stride of zero to replicate the vec4 uniform, like
240 *
241 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
242 *
243 * But you can't, since vertical stride is always four in three-source
244 * instructions. Instead, insert a MOV instruction to do the replication so
245 * that the three-source instruction can consume it.
246 */
247
248 /* The MOV is only needed if the source is a uniform or immediate. */
249 if (src.file != UNIFORM && src.file != IMM)
250 return src;
251
252 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
253 expanded.type = src.type;
254 emit(MOV(expanded, src));
255 return src_reg(expanded);
256 }
257
258 src_reg
259 vec4_visitor::fix_math_operand(src_reg src)
260 {
261 /* The gen6 math instruction ignores the source modifiers --
262 * swizzle, abs, negate, and at least some parts of the register
263 * region description.
264 *
265 * Rather than trying to enumerate all these cases, *always* expand the
266 * operand to a temp GRF for gen6.
267 *
268 * For gen7, keep the operand as-is, except if immediate, which gen7 still
269 * can't use.
270 */
271
272 if (intel->gen == 7 && src.file != IMM)
273 return src;
274
275 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
276 expanded.type = src.type;
277 emit(MOV(expanded, src));
278 return src_reg(expanded);
279 }
280
281 void
282 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
283 {
284 src = fix_math_operand(src);
285
286 if (dst.writemask != WRITEMASK_XYZW) {
287 /* The gen6 math instruction must be align1, so we can't do
288 * writemasks.
289 */
290 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
291
292 emit(opcode, temp_dst, src);
293
294 emit(MOV(dst, src_reg(temp_dst)));
295 } else {
296 emit(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
302 {
303 vec4_instruction *inst = emit(opcode, dst, src);
304 inst->base_mrf = 1;
305 inst->mlen = 1;
306 }
307
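/** Single-operand math.  Gen6+ uses the native math instruction (with operand
 * fixups from fix_math_operand()); gen4/5 math is message-based, so the gen4
 * path also sets up base_mrf/mlen.
 */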
308 void
309 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
310 {
311 switch (opcode) {
312 case SHADER_OPCODE_RCP:
313 case SHADER_OPCODE_RSQ:
314 case SHADER_OPCODE_SQRT:
315 case SHADER_OPCODE_EXP2:
316 case SHADER_OPCODE_LOG2:
317 case SHADER_OPCODE_SIN:
318 case SHADER_OPCODE_COS:
319 break;
320 default:
321 assert(!"not reached: bad math opcode");
322 return;
323 }
324
325 if (intel->gen >= 6) {
326 return emit_math1_gen6(opcode, dst, src);
327 } else {
328 return emit_math1_gen4(opcode, dst, src);
329 }
330 }
331
332 void
333 vec4_visitor::emit_math2_gen6(enum opcode opcode,
334 dst_reg dst, src_reg src0, src_reg src1)
335 {
336 src0 = fix_math_operand(src0);
337 src1 = fix_math_operand(src1);
338
339 if (dst.writemask != WRITEMASK_XYZW) {
340 /* The gen6 math instruction must be align1, so we can't do
341 * writemasks.
342 */
343 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
344 temp_dst.type = dst.type;
345
346 emit(opcode, temp_dst, src0, src1);
347
348 emit(MOV(dst, src_reg(temp_dst)));
349 } else {
350 emit(opcode, dst, src0, src1);
351 }
352 }
353
354 void
355 vec4_visitor::emit_math2_gen4(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 vec4_instruction *inst = emit(opcode, dst, src0, src1);
359 inst->base_mrf = 1;
360 inst->mlen = 2;
361 }
362
363 void
364 vec4_visitor::emit_math(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 switch (opcode) {
368 case SHADER_OPCODE_POW:
369 case SHADER_OPCODE_INT_QUOTIENT:
370 case SHADER_OPCODE_INT_REMAINDER:
371 break;
372 default:
373 assert(!"not reached: unsupported binary math opcode");
374 return;
375 }
376
377 if (intel->gen >= 6) {
378 return emit_math2_gen6(opcode, dst, src0, src1);
379 } else {
380 return emit_math2_gen4(opcode, dst, src0, src1);
381 }
382 }
383
384 void
385 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
386 {
387 if (intel->gen < 7)
388 assert(!"ir_unop_pack_half_2x16 should be lowered");
389
390 assert(dst.type == BRW_REGISTER_TYPE_UD);
391 assert(src0.type == BRW_REGISTER_TYPE_F);
392
393 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
394 *
395 * Because this instruction does not have a 16-bit floating-point type,
396 * the destination data type must be Word (W).
397 *
398 * The destination must be DWord-aligned and specify a horizontal stride
399 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
400 * each destination channel and the upper word is not modified.
401 *
402 * The above restriction implies that the f32to16 instruction must use
403 * align1 mode, because only in align1 mode is it possible to specify
404 * horizontal stride. We choose here to defy the hardware docs and emit
405 * align16 instructions.
406 *
407 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
408 * instructions. I was partially successful in that the code passed all
409 * tests. However, the code was dubiously correct and fragile, and the
410 * tests were not harsh enough to probe that frailty. Not trusting the
411 * code, I chose instead to remain in align16 mode in defiance of the hw
412 * docs).
413 *
414 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
415 * simulator, emitting a f32to16 in align16 mode with UD as destination
416 * data type is safe. The behavior differs from that specified in the PRM
417 * in that the upper word of each destination channel is cleared to 0.
418 */
419
420 dst_reg tmp_dst(this, glsl_type::uvec2_type);
421 src_reg tmp_src(tmp_dst);
422
423 #if 0
424 /* Verify the undocumented behavior on which the following instructions
425 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
426 * then the result of the bit-or instruction below will be incorrect.
427 *
428 * You should inspect the disasm output in order to verify that the MOV is
429 * not optimized away.
430 */
431 emit(MOV(tmp_dst, src_reg(0x12345678u)));
432 #endif
433
434 /* Give tmp the form below, where "." means untouched.
435 *
436 * w z y x w z y x
437 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
438 *
439 * That the upper word of each write-channel be 0 is required for the
440 * following bit-shift and bit-or instructions to work. Note that this
441 * relies on the undocumented hardware behavior mentioned above.
442 */
443 tmp_dst.writemask = WRITEMASK_XY;
444 emit(F32TO16(tmp_dst, src0));
445
446 /* Give the write-channels of dst the form:
447 * 0xhhhh0000
448 */
449 tmp_src.swizzle = SWIZZLE_Y;
450 emit(SHL(dst, tmp_src, src_reg(16u)));
451
452 /* Finally, give the write-channels of dst the form of packHalf2x16's
453 * output:
454 * 0xhhhhllll
455 */
456 tmp_src.swizzle = SWIZZLE_X;
457 emit(OR(dst, src_reg(dst), tmp_src));
458 }
459
460 void
461 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
462 {
463 if (intel->gen < 7)
464 assert(!"ir_unop_unpack_half_2x16 should be lowered");
465
466 assert(dst.type == BRW_REGISTER_TYPE_F);
467 assert(src0.type == BRW_REGISTER_TYPE_UD);
468
469 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
470 *
471 * Because this instruction does not have a 16-bit floating-point type,
472 * the source data type must be Word (W). The destination type must be
473 * F (Float).
474 *
475 * To use W as the source data type, we must adjust horizontal strides,
476 * which is only possible in align1 mode. All my [chadv] attempts at
477 * emitting align1 instructions for unpackHalf2x16 failed to pass the
478 * Piglit tests, so I gave up.
479 *
480 * I've verified that, on gen7 hardware and the simulator, it is safe to
481 * emit f16to32 in align16 mode with UD as source data type.
482 */
483
484 dst_reg tmp_dst(this, glsl_type::uvec2_type);
485 src_reg tmp_src(tmp_dst);
486
487 tmp_dst.writemask = WRITEMASK_X;
488 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
489
490 tmp_dst.writemask = WRITEMASK_Y;
491 emit(SHR(tmp_dst, src0, src_reg(16u)));
492
493 dst.writemask = WRITEMASK_XY;
494 emit(F16TO32(dst, tmp_src));
495 }
496
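/** Visit each IR instruction in the list, keeping base_ir pointed at the
 * current top-level instruction so emitted code gets annotated correctly.
 */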
497 void
498 vec4_visitor::visit_instructions(const exec_list *list)
499 {
500 foreach_list(node, list) {
501 ir_instruction *ir = (ir_instruction *)node;
502
503 base_ir = ir;
504 ir->accept(this);
505 }
506 }
507
508
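/** Size of a variable of the given type, measured in vec4 registers rather
 * than in components.
 */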
509 static int
510 type_size(const struct glsl_type *type)
511 {
512 unsigned int i;
513 int size;
514
515 switch (type->base_type) {
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_FLOAT:
519 case GLSL_TYPE_BOOL:
520 if (type->is_matrix()) {
521 return type->matrix_columns;
522 } else {
523 /* Regardless of the size of the vector, it gets a vec4. This is bad
524 * packing for things like floats, but otherwise arrays become a
525 * mess. Hopefully a later pass over the code can pack scalars
526 * down if appropriate.
527 */
528 return 1;
529 }
530 case GLSL_TYPE_ARRAY:
531 assert(type->length > 0);
532 return type_size(type->fields.array) * type->length;
533 case GLSL_TYPE_STRUCT:
534 size = 0;
535 for (i = 0; i < type->length; i++) {
536 size += type_size(type->fields.structure[i].type);
537 }
538 return size;
539 case GLSL_TYPE_SAMPLER:
540 /* Samplers take up one slot in UNIFORMS[], but they're baked in
541 * at link time.
542 */
543 return 1;
544 case GLSL_TYPE_VOID:
545 case GLSL_TYPE_ERROR:
546 case GLSL_TYPE_INTERFACE:
547 assert(0);
548 break;
549 }
550
551 return 0;
552 }
553
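/** Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the tracking arrays as needed, and return its index.
 */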
554 int
555 vec4_visitor::virtual_grf_alloc(int size)
556 {
557 if (virtual_grf_array_size <= virtual_grf_count) {
558 if (virtual_grf_array_size == 0)
559 virtual_grf_array_size = 16;
560 else
561 virtual_grf_array_size *= 2;
562 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
563 virtual_grf_array_size);
564 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
565 virtual_grf_array_size);
566 }
567 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
568 virtual_grf_reg_count += size;
569 virtual_grf_sizes[virtual_grf_count] = size;
570 return virtual_grf_count++;
571 }
572
573 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
574 {
575 init();
576
577 this->file = GRF;
578 this->reg = v->virtual_grf_alloc(type_size(type));
579
580 if (type->is_array() || type->is_record()) {
581 this->swizzle = BRW_SWIZZLE_NOOP;
582 } else {
583 this->swizzle = swizzle_for_size(type->vector_elements);
584 }
585
586 this->type = brw_type_for_base_type(type);
587 }
588
589 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
590 {
591 init();
592
593 this->file = GRF;
594 this->reg = v->virtual_grf_alloc(type_size(type));
595
596 if (type->is_array() || type->is_record()) {
597 this->writemask = WRITEMASK_XYZW;
598 } else {
599 this->writemask = (1 << type->vector_elements) - 1;
600 }
601
602 this->type = brw_type_for_base_type(type);
603 }
604
605 /* Our support for uniforms is piggy-backed on the struct
606 * gl_fragment_program, because that's where the values actually
607 * get stored, rather than in some global gl_shader_program uniform
608 * store.
609 */
610 void
611 vec4_visitor::setup_uniform_values(ir_variable *ir)
612 {
613 int namelen = strlen(ir->name);
614
615 /* The data for our (non-builtin) uniforms is stored in a series of
616 * gl_uniform_driver_storage structs for each subcomponent that
617 * glGetUniformLocation() could name. We know it's been set up in the same
618 * order we'd walk the type, so walk the list of storage and find anything
619 * with our name, or the prefix of a component that starts with our name.
620 */
621 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
622 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
623
624 if (strncmp(ir->name, storage->name, namelen) != 0 ||
625 (storage->name[namelen] != 0 &&
626 storage->name[namelen] != '.' &&
627 storage->name[namelen] != '[')) {
628 continue;
629 }
630
631 gl_constant_value *components = storage->storage;
632 unsigned vector_count = (MAX2(storage->array_elements, 1) *
633 storage->type->matrix_columns);
634
635 for (unsigned s = 0; s < vector_count; s++) {
636 uniform_vector_size[uniforms] = storage->type->vector_elements;
637
638 int i;
639 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
640 prog_data->param[uniforms * 4 + i] = &components->f;
641 components++;
642 }
643 for (; i < 4; i++) {
644 static float zero = 0;
645 prog_data->param[uniforms * 4 + i] = &zero;
646 }
647
648 uniforms++;
649 }
650 }
651 }
652
653 void
654 vec4_visitor::setup_uniform_clipplane_values()
655 {
656 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
657
658 if (intel->gen < 6) {
659 /* Pre-Gen6, we compact clip planes. For example, if the user
660 * enables just clip planes 0, 1, and 3, we will enable clip planes
661 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
662 * plane 2. This simplifies the implementation of the Gen6 clip
663 * thread.
664 */
665 int compacted_clipplane_index = 0;
666 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
667 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
668 continue;
669
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++compacted_clipplane_index;
677 ++this->uniforms;
678 }
679 } else {
680 /* In Gen6 and later, we don't compact clip planes, because this
681 * simplifies the implementation of gl_ClipDistance.
682 */
683 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
684 this->uniform_vector_size[this->uniforms] = 4;
685 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
686 this->userplane[i].type = BRW_REGISTER_TYPE_F;
687 for (int j = 0; j < 4; ++j) {
688 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
689 }
690 ++this->uniforms;
691 }
692 }
693 }
694
695 /* Our support for builtin uniforms is even scarier than non-builtin.
696 * It sits on top of the PROG_STATE_VAR parameters that are
697 * automatically updated from GL context state.
698 */
699 void
700 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
701 {
702 const ir_state_slot *const slots = ir->state_slots;
703 assert(ir->state_slots != NULL);
704
705 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
706 /* This state reference has already been setup by ir_to_mesa,
707 * but we'll get the same index back here. We can reference
708 * ParameterValues directly, since unlike brw_fs.cpp, we never
709 * add new state references during compile.
710 */
711 int index = _mesa_add_state_reference(this->prog->Parameters,
712 (gl_state_index *)slots[i].tokens);
713 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
714
715 this->uniform_vector_size[this->uniforms] = 0;
716 /* Add each of the unique swizzled channels of the element.
717 * This will end up matching the size of the glsl_type of this field.
718 */
719 int last_swiz = -1;
720 for (unsigned int j = 0; j < 4; j++) {
721 int swiz = GET_SWZ(slots[i].swizzle, j);
722 last_swiz = swiz;
723
724 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
725 if (swiz <= last_swiz)
726 this->uniform_vector_size[this->uniforms]++;
727 }
728 this->uniforms++;
729 }
730 }
731
732 dst_reg *
733 vec4_visitor::variable_storage(ir_variable *var)
734 {
735 return (dst_reg *)hash_table_find(this->variable_ht, var);
736 }
737
738 void
739 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
740 {
741 ir_expression *expr = ir->as_expression();
742
743 *predicate = BRW_PREDICATE_NORMAL;
744
745 if (expr) {
746 src_reg op[2];
747 vec4_instruction *inst;
748
749 assert(expr->get_num_operands() <= 2);
750 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
751 expr->operands[i]->accept(this);
752 op[i] = this->result;
753
754 resolve_ud_negate(&op[i]);
755 }
756
757 switch (expr->operation) {
758 case ir_unop_logic_not:
759 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
760 inst->conditional_mod = BRW_CONDITIONAL_Z;
761 break;
762
763 case ir_binop_logic_xor:
764 inst = emit(XOR(dst_null_d(), op[0], op[1]));
765 inst->conditional_mod = BRW_CONDITIONAL_NZ;
766 break;
767
768 case ir_binop_logic_or:
769 inst = emit(OR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_and:
774 inst = emit(AND(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_unop_f2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_f(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_unop_i2b:
788 if (intel->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_binop_all_equal:
797 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
798 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
799 break;
800
801 case ir_binop_any_nequal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
803 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
804 break;
805
806 case ir_unop_any:
807 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_binop_greater:
812 case ir_binop_gequal:
813 case ir_binop_less:
814 case ir_binop_lequal:
815 case ir_binop_equal:
816 case ir_binop_nequal:
817 emit(CMP(dst_null_d(), op[0], op[1],
818 brw_conditional_for_comparison(expr->operation)));
819 break;
820
821 default:
822 assert(!"not reached");
823 break;
824 }
825 return;
826 }
827
828 ir->accept(this);
829
830 resolve_ud_negate(&this->result);
831
832 if (intel->gen >= 6) {
833 vec4_instruction *inst = emit(AND(dst_null_d(),
834 this->result, src_reg(1)));
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 } else {
837 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 }
840 }
841
842 /**
843 * Emit a gen6 IF statement with the comparison folded into the IF
844 * instruction.
845 */
846 void
847 vec4_visitor::emit_if_gen6(ir_if *ir)
848 {
849 ir_expression *expr = ir->condition->as_expression();
850
851 if (expr) {
852 src_reg op[2];
853 dst_reg temp;
854
855 assert(expr->get_num_operands() <= 2);
856 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
857 expr->operands[i]->accept(this);
858 op[i] = this->result;
859 }
860
861 switch (expr->operation) {
862 case ir_unop_logic_not:
863 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
864 return;
865
866 case ir_binop_logic_xor:
867 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
868 return;
869
870 case ir_binop_logic_or:
871 temp = dst_reg(this, glsl_type::bool_type);
872 emit(OR(temp, op[0], op[1]));
873 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
874 return;
875
876 case ir_binop_logic_and:
877 temp = dst_reg(this, glsl_type::bool_type);
878 emit(AND(temp, op[0], op[1]));
879 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_unop_f2b:
883 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_i2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_binop_greater:
891 case ir_binop_gequal:
892 case ir_binop_less:
893 case ir_binop_lequal:
894 case ir_binop_equal:
895 case ir_binop_nequal:
896 emit(IF(op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 return;
899
900 case ir_binop_all_equal:
901 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
902 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
903 return;
904
905 case ir_binop_any_nequal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
907 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
908 return;
909
910 case ir_unop_any:
911 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 default:
916 assert(!"not reached");
917 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
918 return;
919 }
920 return;
921 }
922
923 ir->condition->accept(this);
924
925 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
926 }
927
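/** Return a copy of the register with only the given writemask enabled. */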
928 static dst_reg
929 with_writemask(dst_reg const & r, int mask)
930 {
931 dst_reg result = r;
932 result.writemask = mask;
933 return result;
934 }
935
936 void
937 vec4_vs_visitor::emit_prolog()
938 {
939 dst_reg sign_recovery_shift;
940 dst_reg normalize_factor;
941 dst_reg es3_normalize_factor;
942
943 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
944 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
945 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
946 dst_reg reg(ATTR, i);
947 dst_reg reg_d = reg;
948 reg_d.type = BRW_REGISTER_TYPE_D;
949 dst_reg reg_ud = reg;
950 reg_ud.type = BRW_REGISTER_TYPE_UD;
951
952 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
953 * come in as floating point conversions of the integer values.
954 */
955 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
956 dst_reg dst = reg;
957 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
958 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
959 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
960 }
961
962 /* Do sign recovery for 2101010 formats if required. */
963 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
964 if (sign_recovery_shift.file == BAD_FILE) {
965 /* shift constant: <22,22,22,30> */
966 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
967 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
968 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
969 }
970
971 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
972 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
973 }
974
975 /* Apply BGRA swizzle if required. */
976 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
977 src_reg temp = src_reg(reg);
978 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
979 emit(MOV(reg, temp));
980 }
981
982 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
983 /* ES 3.0 has different rules for converting signed normalized
984 * fixed-point numbers than desktop GL.
985 */
986 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
987 /* According to equation 2.2 of the ES 3.0 specification,
988 * signed normalization conversion is done by:
989 *
990 * f = c / (2^(b-1)-1)
991 */
992 if (es3_normalize_factor.file == BAD_FILE) {
993 /* mul constant: 1 / (2^(b-1) - 1) */
994 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
995 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
996 src_reg(1.0f / ((1<<9) - 1))));
997 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
998 src_reg(1.0f / ((1<<1) - 1))));
999 }
1000
1001 dst_reg dst = reg;
1002 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1003 emit(MOV(dst, src_reg(reg_d)));
1004 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1005 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1006 } else {
1007 /* The following equations are from the OpenGL 3.2 specification:
1008 *
1009 * 2.1 unsigned normalization
1010 * f = c/(2^n-1)
1011 *
1012 * 2.2 signed normalization
1013 * f = (2c+1)/(2^n-1)
1014 *
1015 * Both of these share a common divisor, which is represented by
1016 * "normalize_factor" in the code below.
1017 */
1018 if (normalize_factor.file == BAD_FILE) {
1019 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1020 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1021 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1022 src_reg(1.0f / ((1<<10) - 1))));
1023 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1024 src_reg(1.0f / ((1<<2) - 1))));
1025 }
1026
1027 dst_reg dst = reg;
1028 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1029 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1030
1031 /* For signed normalization, we want the numerator to be 2c+1. */
1032 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1033 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1034 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1035 }
1036
1037 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1038 }
1039 }
1040
1041 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1042 dst_reg dst = reg;
1043 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1044 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1045 }
1046 }
1047 }
1048 }
1049
1050
1051 dst_reg *
1052 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1053 {
1054 /* VertexID is stored by the VF as the last vertex element, but
1055 * we don't represent it with a flag in inputs_read, so we call
1056 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1057 */
1058 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1059 vs_prog_data->uses_vertexid = true;
1060
1061 switch (ir->location) {
1062 case SYSTEM_VALUE_VERTEX_ID:
1063 reg->writemask = WRITEMASK_X;
1064 break;
1065 case SYSTEM_VALUE_INSTANCE_ID:
1066 reg->writemask = WRITEMASK_Y;
1067 break;
1068 default:
1069 assert(!"not reached");
1070 break;
1071 }
1072
1073 return reg;
1074 }
1075
1076
1077 void
1078 vec4_visitor::visit(ir_variable *ir)
1079 {
1080 dst_reg *reg = NULL;
1081
1082 if (variable_storage(ir))
1083 return;
1084
1085 switch (ir->mode) {
1086 case ir_var_shader_in:
1087 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1088 break;
1089
1090 case ir_var_shader_out:
1091 reg = new(mem_ctx) dst_reg(this, ir->type);
1092
1093 for (int i = 0; i < type_size(ir->type); i++) {
1094 output_reg[ir->location + i] = *reg;
1095 output_reg[ir->location + i].reg_offset = i;
1096 output_reg[ir->location + i].type =
1097 brw_type_for_base_type(ir->type->get_scalar_type());
1098 output_reg_annotation[ir->location + i] = ir->name;
1099 }
1100 break;
1101
1102 case ir_var_auto:
1103 case ir_var_temporary:
1104 reg = new(mem_ctx) dst_reg(this, ir->type);
1105 break;
1106
1107 case ir_var_uniform:
1108 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1109
1110 /* Thanks to the lower_ubo_reference pass, we will see only
1111 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1112 * variables, so no need for them to be in variable_ht.
1113 */
1114 if (ir->is_in_uniform_block())
1115 return;
1116
1117 /* Track how big the whole uniform variable is, in case we need to put a
1118 * copy of its data into pull constants for array access.
1119 */
1120 this->uniform_size[this->uniforms] = type_size(ir->type);
1121
1122 if (!strncmp(ir->name, "gl_", 3)) {
1123 setup_builtin_uniform_values(ir);
1124 } else {
1125 setup_uniform_values(ir);
1126 }
1127 break;
1128
1129 case ir_var_system_value:
1130 reg = make_reg_for_system_value(ir);
1131 break;
1132
1133 default:
1134 assert(!"not reached");
1135 }
1136
1137 reg->type = brw_type_for_base_type(ir->type);
1138 hash_table_insert(this->variable_ht, reg, ir);
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_loop *ir)
1143 {
1144 dst_reg counter;
1145
1146 /* We don't want debugging output to print the whole body of the
1147 * loop as the annotation.
1148 */
1149 this->base_ir = NULL;
1150
1151 if (ir->counter != NULL) {
1152 this->base_ir = ir->counter;
1153 ir->counter->accept(this);
1154 counter = *(variable_storage(ir->counter));
1155
1156 if (ir->from != NULL) {
1157 this->base_ir = ir->from;
1158 ir->from->accept(this);
1159
1160 emit(MOV(counter, this->result));
1161 }
1162 }
1163
1164 emit(BRW_OPCODE_DO);
1165
1166 if (ir->to) {
1167 this->base_ir = ir->to;
1168 ir->to->accept(this);
1169
1170 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1171 brw_conditional_for_comparison(ir->cmp)));
1172
1173 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 }
1176
1177 visit_instructions(&ir->body_instructions);
1178
1179
1180 if (ir->increment) {
1181 this->base_ir = ir->increment;
1182 ir->increment->accept(this);
1183 emit(ADD(counter, src_reg(counter), this->result));
1184 }
1185
1186 emit(BRW_OPCODE_WHILE);
1187 }
1188
1189 void
1190 vec4_visitor::visit(ir_loop_jump *ir)
1191 {
1192 switch (ir->mode) {
1193 case ir_loop_jump::jump_break:
1194 emit(BRW_OPCODE_BREAK);
1195 break;
1196 case ir_loop_jump::jump_continue:
1197 emit(BRW_OPCODE_CONTINUE);
1198 break;
1199 }
1200 }
1201
1202
1203 void
1204 vec4_visitor::visit(ir_function_signature *ir)
1205 {
1206 assert(0);
1207 (void)ir;
1208 }
1209
1210 void
1211 vec4_visitor::visit(ir_function *ir)
1212 {
1213 /* Ignore function bodies other than main() -- we shouldn't see calls to
1214 * them since they should all be inlined.
1215 */
1216 if (strcmp(ir->name, "main") == 0) {
1217 const ir_function_signature *sig;
1218 exec_list empty;
1219
1220 sig = ir->matching_signature(&empty);
1221
1222 assert(sig);
1223
1224 visit_instructions(&sig->body);
1225 }
1226 }
1227
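/** If this expression is just a saturate of some computed value, emit that
 * value followed by a saturating MOV rather than generating separate clamp
 * instructions.  Returns true if the expression was handled here.
 */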
1228 bool
1229 vec4_visitor::try_emit_sat(ir_expression *ir)
1230 {
1231 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1232 if (!sat_src)
1233 return false;
1234
1235 sat_src->accept(this);
1236 src_reg src = this->result;
1237
1238 this->result = src_reg(this, ir->type);
1239 vec4_instruction *inst;
1240 inst = emit(MOV(dst_reg(this->result), src));
1241 inst->saturate = true;
1242
1243 return true;
1244 }
1245
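/** Emit a comparison producing a 0/1 boolean in dst: a CMP with the requested
 * condition, then an AND with 1 to mask off the undefined upper bits.
 */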
1246 void
1247 vec4_visitor::emit_bool_comparison(unsigned int op,
1248 dst_reg dst, src_reg src0, src_reg src1)
1249 {
1250 /* The original gen4 does destination conversion before comparison. */
1251 if (intel->gen < 5)
1252 dst.type = src0.type;
1253
1254 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1255
1256 dst.type = BRW_REGISTER_TYPE_D;
1257 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1258 }
1259
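/** MIN/MAX.  On gen6+ this is a single SEL with a conditional modifier; on
 * earlier parts it is a CMP followed by a predicated SEL.
 */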
1260 void
1261 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (intel->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
1276
1277 void
1278 vec4_visitor::visit(ir_expression *ir)
1279 {
1280 unsigned int operand;
1281 src_reg op[Elements(ir->operands)];
1282 src_reg result_src;
1283 dst_reg result_dst;
1284 vec4_instruction *inst;
1285
1286 if (try_emit_sat(ir))
1287 return;
1288
1289 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1290 this->result.file = BAD_FILE;
1291 ir->operands[operand]->accept(this);
1292 if (this->result.file == BAD_FILE) {
1293 printf("Failed to get tree for expression operand:\n");
1294 ir->operands[operand]->print();
1295 exit(1);
1296 }
1297 op[operand] = this->result;
1298
1299 /* Matrix expression operands should have been broken down to vector
1300 * operations already.
1301 */
1302 assert(!ir->operands[operand]->type->is_matrix());
1303 }
1304
1305 int vector_elements = ir->operands[0]->type->vector_elements;
1306 if (ir->operands[1]) {
1307 vector_elements = MAX2(vector_elements,
1308 ir->operands[1]->type->vector_elements);
1309 }
1310
1311 this->result.file = BAD_FILE;
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 result_src = src_reg(this, ir->type);
1317 /* convenience for the emit functions below. */
1318 result_dst = dst_reg(result_src);
1319 /* If nothing special happens, this is the result. */
1320 this->result = result_src;
1321 /* Limit writes to the channels that will be used by result_src later.
1322 * This does limit this temp's use as a temporary for multi-instruction
1323 * sequences.
1324 */
1325 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1326
1327 switch (ir->operation) {
1328 case ir_unop_logic_not:
1329 /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
1330 * the one's complement of the whole register, not just bit 0.
1331 */
1332 emit(XOR(result_dst, op[0], src_reg(1)));
1333 break;
1334 case ir_unop_neg:
1335 op[0].negate = !op[0].negate;
1336 this->result = op[0];
1337 break;
1338 case ir_unop_abs:
1339 op[0].abs = true;
1340 op[0].negate = false;
1341 this->result = op[0];
1342 break;
1343
1344 case ir_unop_sign:
1345 emit(MOV(result_dst, src_reg(0.0f)));
1346
1347 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1348 inst = emit(MOV(result_dst, src_reg(1.0f)));
1349 inst->predicate = BRW_PREDICATE_NORMAL;
1350
1351 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1352 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 break;
1356
1357 case ir_unop_rcp:
1358 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1359 break;
1360
1361 case ir_unop_exp2:
1362 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1363 break;
1364 case ir_unop_log2:
1365 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1366 break;
1367 case ir_unop_exp:
1368 case ir_unop_log:
1369 assert(!"not reached: should be handled by ir_explog_to_explog2");
1370 break;
1371 case ir_unop_sin:
1372 case ir_unop_sin_reduced:
1373 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1374 break;
1375 case ir_unop_cos:
1376 case ir_unop_cos_reduced:
1377 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1378 break;
1379
1380 case ir_unop_dFdx:
1381 case ir_unop_dFdy:
1382 assert(!"derivatives not valid in vertex shader");
1383 break;
1384
1385 case ir_unop_noise:
1386 assert(!"not reached: should be handled by lower_noise");
1387 break;
1388
1389 case ir_binop_add:
1390 emit(ADD(result_dst, op[0], op[1]));
1391 break;
1392 case ir_binop_sub:
1393 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1394 break;
1395
1396 case ir_binop_mul:
1397 if (ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits
1399 * of one of the operands (src0 on gen6, src1 on gen7). The
1400 * MACH accumulates in the contribution of the upper 16 bits
1401 * of that operand.
1402 *
1403 * FINISHME: Emit just the MUL if we know an operand is small
1404 * enough.
1405 */
1406 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1407
1408 emit(MUL(acc, op[0], op[1]));
1409 emit(MACH(dst_null_d(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 } else {
1412 emit(MUL(result_dst, op[0], op[1]));
1413 }
1414 break;
1415 case ir_binop_div:
1416 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1417 assert(ir->type->is_integer());
1418 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1419 break;
1420 case ir_binop_mod:
1421 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1422 assert(ir->type->is_integer());
1423 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1424 break;
1425
1426 case ir_binop_less:
1427 case ir_binop_greater:
1428 case ir_binop_lequal:
1429 case ir_binop_gequal:
1430 case ir_binop_equal:
1431 case ir_binop_nequal: {
1432 emit(CMP(result_dst, op[0], op[1],
1433 brw_conditional_for_comparison(ir->operation)));
1434 emit(AND(result_dst, result_src, src_reg(0x1)));
1435 break;
1436 }
1437
1438 case ir_binop_all_equal:
1439 /* "==" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1443 emit(MOV(result_dst, src_reg(0)));
1444 inst = emit(MOV(result_dst, src_reg(1)));
1445 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1446 } else {
1447 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1448 emit(AND(result_dst, result_src, src_reg(0x1)));
1449 }
1450 break;
1451 case ir_binop_any_nequal:
1452 /* "!=" operator producing a scalar boolean. */
1453 if (ir->operands[0]->type->is_vector() ||
1454 ir->operands[1]->type->is_vector()) {
1455 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1456
1457 emit(MOV(result_dst, src_reg(0)));
1458 inst = emit(MOV(result_dst, src_reg(1)));
1459 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1460 } else {
1461 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result_src, src_reg(0x1)));
1463 }
1464 break;
1465
1466 case ir_unop_any:
1467 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1468 emit(MOV(result_dst, src_reg(0)));
1469
1470 inst = emit(MOV(result_dst, src_reg(1)));
1471 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1472 break;
1473
1474 case ir_binop_logic_xor:
1475 emit(XOR(result_dst, op[0], op[1]));
1476 break;
1477
1478 case ir_binop_logic_or:
1479 emit(OR(result_dst, op[0], op[1]));
1480 break;
1481
1482 case ir_binop_logic_and:
1483 emit(AND(result_dst, op[0], op[1]));
1484 break;
1485
1486 case ir_binop_dot:
1487 assert(ir->operands[0]->type->is_vector());
1488 assert(ir->operands[0]->type == ir->operands[1]->type);
1489 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1490 break;
1491
1492 case ir_unop_sqrt:
1493 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1494 break;
1495 case ir_unop_rsq:
1496 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1497 break;
1498
1499 case ir_unop_bitcast_i2f:
1500 case ir_unop_bitcast_u2f:
1501 this->result = op[0];
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 break;
1504
1505 case ir_unop_bitcast_f2i:
1506 this->result = op[0];
1507 this->result.type = BRW_REGISTER_TYPE_D;
1508 break;
1509
1510 case ir_unop_bitcast_f2u:
1511 this->result = op[0];
1512 this->result.type = BRW_REGISTER_TYPE_UD;
1513 break;
1514
1515 case ir_unop_i2f:
1516 case ir_unop_i2u:
1517 case ir_unop_u2i:
1518 case ir_unop_u2f:
1519 case ir_unop_b2f:
1520 case ir_unop_b2i:
1521 case ir_unop_f2i:
1522 case ir_unop_f2u:
1523 emit(MOV(result_dst, op[0]));
1524 break;
1525 case ir_unop_f2b:
1526 case ir_unop_i2b: {
1527 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1528 emit(AND(result_dst, result_src, src_reg(1)));
1529 break;
1530 }
1531
1532 case ir_unop_trunc:
1533 emit(RNDZ(result_dst, op[0]));
1534 break;
1535 case ir_unop_ceil:
1536 op[0].negate = !op[0].negate;
1537 inst = emit(RNDD(result_dst, op[0]));
1538 this->result.negate = true;
1539 break;
1540 case ir_unop_floor:
1541 inst = emit(RNDD(result_dst, op[0]));
1542 break;
1543 case ir_unop_fract:
1544 inst = emit(FRC(result_dst, op[0]));
1545 break;
1546 case ir_unop_round_even:
1547 emit(RNDE(result_dst, op[0]));
1548 break;
1549
1550 case ir_binop_min:
1551 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1552 break;
1553 case ir_binop_max:
1554 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1555 break;
1556
1557 case ir_binop_pow:
1558 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1559 break;
1560
1561 case ir_unop_bit_not:
1562 inst = emit(NOT(result_dst, op[0]));
1563 break;
1564 case ir_binop_bit_and:
1565 inst = emit(AND(result_dst, op[0], op[1]));
1566 break;
1567 case ir_binop_bit_xor:
1568 inst = emit(XOR(result_dst, op[0], op[1]));
1569 break;
1570 case ir_binop_bit_or:
1571 inst = emit(OR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_lshift:
1575 inst = emit(SHL(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_rshift:
1579 if (ir->type->base_type == GLSL_TYPE_INT)
1580 inst = emit(ASR(result_dst, op[0], op[1]));
1581 else
1582 inst = emit(SHR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_ubo_load: {
1586 ir_constant *uniform_block = ir->operands[0]->as_constant();
1587 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1588 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1589 src_reg offset = op[1];
1590
1591 /* Now, load the vector from that offset. */
1592 assert(ir->type->is_vector() || ir->type->is_scalar());
1593
1594 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1595 packed_consts.type = result.type;
1596 src_reg surf_index =
1597 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1598 if (const_offset_ir) {
1599 offset = src_reg(const_offset / 16);
1600 } else {
1601 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1602 }
1603
1604 vec4_instruction *pull =
1605 emit(new(mem_ctx) vec4_instruction(this,
1606 VS_OPCODE_PULL_CONSTANT_LOAD,
1607 dst_reg(packed_consts),
1608 surf_index,
1609 offset));
1610 pull->base_mrf = 14;
1611 pull->mlen = 1;
1612
1613 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1614 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1615 const_offset % 16 / 4,
1616 const_offset % 16 / 4,
1617 const_offset % 16 / 4);
1618
1619 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1620 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1621 emit(CMP(result_dst, packed_consts, src_reg(0u),
1622 BRW_CONDITIONAL_NZ));
1623 emit(AND(result_dst, result, src_reg(0x1)));
1624 } else {
1625 emit(MOV(result_dst, packed_consts));
1626 }
1627 break;
1628 }
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 emit(LRP(result_dst, op[0], op[1], op[2]));
1635 break;
1636
1637 case ir_quadop_vector:
1638 assert(!"not reached: should be handled by lower_quadop_vector");
1639 break;
1640
1641 case ir_unop_pack_half_2x16:
1642 emit_pack_half_2x16(result_dst, op[0]);
1643 break;
1644 case ir_unop_unpack_half_2x16:
1645 emit_unpack_half_2x16(result_dst, op[0]);
1646 break;
1647 case ir_unop_pack_snorm_2x16:
1648 case ir_unop_pack_snorm_4x8:
1649 case ir_unop_pack_unorm_2x16:
1650 case ir_unop_pack_unorm_4x8:
1651 case ir_unop_unpack_snorm_2x16:
1652 case ir_unop_unpack_snorm_4x8:
1653 case ir_unop_unpack_unorm_2x16:
1654 case ir_unop_unpack_unorm_4x8:
1655 assert(!"not reached: should be handled by lower_packing_builtins");
1656 break;
1657 case ir_unop_unpack_half_2x16_split_x:
1658 case ir_unop_unpack_half_2x16_split_y:
1659 case ir_binop_pack_half_2x16_split:
1660 assert(!"not reached: should not occur in vertex shader");
1661 break;
1662 }
1663 }
1664
1665
1666 void
1667 vec4_visitor::visit(ir_swizzle *ir)
1668 {
1669 src_reg src;
1670 int i = 0;
1671 int swizzle[4];
1672
1673 /* Note that this handles only swizzles in expressions, not those on the
1674 * left hand side of an assignment, which use write masking. See ir_assignment
1675 * for that.
1676 */
1677
1678 ir->val->accept(this);
1679 src = this->result;
1680 assert(src.file != BAD_FILE);
1681
1682 for (i = 0; i < ir->type->vector_elements; i++) {
1683 switch (i) {
1684 case 0:
1685 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1686 break;
1687 case 1:
1688 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1689 break;
1690 case 2:
1691 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1692 break;
1693 case 3:
1694 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1695 break;
1696 }
1697 }
1698 for (; i < 4; i++) {
1699 /* Replicate the last channel out. */
1700 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1701 }
1702
1703 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1704
1705 this->result = src;
1706 }
1707
1708 void
1709 vec4_visitor::visit(ir_dereference_variable *ir)
1710 {
1711 const struct glsl_type *type = ir->type;
1712 dst_reg *reg = variable_storage(ir->var);
1713
1714 if (!reg) {
1715 fail("Failed to find variable storage for %s\n", ir->var->name);
1716 this->result = src_reg(brw_null_reg());
1717 return;
1718 }
1719
1720 this->result = src_reg(*reg);
1721
1722 /* System values get their swizzle from the dst_reg writemask */
1723 if (ir->var->mode == ir_var_system_value)
1724 return;
1725
1726 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1727 this->result.swizzle = swizzle_for_size(type->vector_elements);
1728 }
1729
1730
1731 int
1732 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1733 {
1734 /* Under normal circumstances array elements are stored consecutively, so
1735 * the stride is equal to the size of the array element.
1736 */
1737 return type_size(ir->type);
1738 }
1739
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_array *ir)
1743 {
1744 ir_constant *constant_index;
1745 src_reg src;
1746 int array_stride = compute_array_stride(ir);
1747
1748 constant_index = ir->array_index->constant_expression_value();
1749
1750 ir->array->accept(this);
1751 src = this->result;
1752
1753 if (constant_index) {
1754 src.reg_offset += constant_index->value.i[0] * array_stride;
1755 } else {
1756 /* Variable index array dereference. It eats the "vec4" of the
1757 * base of the array and an index that offsets the Mesa register
1758 * index.
1759 */
1760 ir->array_index->accept(this);
1761
1762 src_reg index_reg;
1763
1764 if (array_stride == 1) {
1765 index_reg = this->result;
1766 } else {
1767 index_reg = src_reg(this, glsl_type::int_type);
1768
1769 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1770 }
1771
1772 if (src.reladdr) {
1773 src_reg temp = src_reg(this, glsl_type::int_type);
1774
1775 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1776
1777 index_reg = temp;
1778 }
1779
1780 src.reladdr = ralloc(mem_ctx, src_reg);
1781 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1782 }
1783
1784 /* If the type is smaller than a vec4, replicate the last channel out. */
1785 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1786 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1787 else
1788 src.swizzle = BRW_SWIZZLE_NOOP;
1789 src.type = brw_type_for_base_type(ir->type);
1790
1791 this->result = src;
1792 }
1793
1794 void
1795 vec4_visitor::visit(ir_dereference_record *ir)
1796 {
1797 unsigned int i;
1798 const glsl_type *struct_type = ir->record->type;
1799 int offset = 0;
1800
1801 ir->record->accept(this);
1802
1803 for (i = 0; i < struct_type->length; i++) {
1804 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1805 break;
1806 offset += type_size(struct_type->fields.structure[i].type);
1807 }
1808
1809 /* If the type is smaller than a vec4, replicate the last channel out. */
1810 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1811 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1812 else
1813 this->result.swizzle = BRW_SWIZZLE_NOOP;
1814 this->result.type = brw_type_for_base_type(ir->type);
1815
1816 this->result.reg_offset += offset;
1817 }
1818
1819 /**
1820 * We want to be careful in assignment setup to hit the actual storage
1821 * instead of potentially using a temporary like we might with the
1822 * ir_dereference handler.
1823 */
1824 static dst_reg
1825 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1826 {
1827 /* The LHS must be a dereference. If the LHS is a variable indexed array
1828 * access of a vector, it must be separated into a series of conditional moves
1829 * before reaching this point (see ir_vec_index_to_cond_assign).
1830 */
1831 assert(ir->as_dereference());
1832 ir_dereference_array *deref_array = ir->as_dereference_array();
1833 if (deref_array) {
1834 assert(!deref_array->array->type->is_vector());
1835 }
1836
1837 /* Use the rvalue deref handler for the most part. We'll ignore
1838 * swizzles in it and write swizzles using writemask, though.
1839 */
1840 ir->accept(v);
1841 return dst_reg(v->result);
1842 }
1843
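/** Copy an aggregate (struct, array or matrix) one vec4 at a time, advancing
 * both registers' reg_offset as each element is moved.
 */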
1844 void
1845 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1846 const struct glsl_type *type, uint32_t predicate)
1847 {
1848 if (type->base_type == GLSL_TYPE_STRUCT) {
1849 for (unsigned int i = 0; i < type->length; i++) {
1850 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1851 }
1852 return;
1853 }
1854
1855 if (type->is_array()) {
1856 for (unsigned int i = 0; i < type->length; i++) {
1857 emit_block_move(dst, src, type->fields.array, predicate);
1858 }
1859 return;
1860 }
1861
1862 if (type->is_matrix()) {
1863 const struct glsl_type *vec_type;
1864
1865 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1866 type->vector_elements, 1);
1867
1868 for (int i = 0; i < type->matrix_columns; i++) {
1869 emit_block_move(dst, src, vec_type, predicate);
1870 }
1871 return;
1872 }
1873
1874 assert(type->is_scalar() || type->is_vector());
1875
1876 dst->type = brw_type_for_base_type(type);
1877 src->type = dst->type;
1878
1879 dst->writemask = (1 << type->vector_elements) - 1;
1880
1881 src->swizzle = swizzle_for_size(type->vector_elements);
1882
1883 vec4_instruction *inst = emit(MOV(*dst, *src));
1884 inst->predicate = predicate;
1885
1886 dst->reg_offset++;
1887 src->reg_offset++;
1888 }
1889
1890
1891 /* If the RHS processing resulted in an instruction generating a
1892 * temporary value, and it would be easy to rewrite the instruction to
1893 * generate its result right into the LHS instead, do so. This ends
1894 * up reliably removing instructions where it can be tricky to do so
1895 * later without real UD chain information.
1896 */
1897 bool
1898 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1899 dst_reg dst,
1900 src_reg src,
1901 vec4_instruction *pre_rhs_inst,
1902 vec4_instruction *last_rhs_inst)
1903 {
1904 /* This could be supported, but it would take more smarts. */
1905 if (ir->condition)
1906 return false;
1907
1908 if (pre_rhs_inst == last_rhs_inst)
1909 return false; /* No instructions generated to work with. */
1910
1911 /* Make sure the last instruction generated our source reg. */
1912 if (src.file != GRF ||
1913 src.file != last_rhs_inst->dst.file ||
1914 src.reg != last_rhs_inst->dst.reg ||
1915 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1916 src.reladdr ||
1917 src.abs ||
1918 src.negate ||
1919 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1920 return false;
1921
1922 /* Check that the last instruction fully initialized the channels
1923 * we want to use, in the order we want to use them. We could
1924 * potentially reswizzle the operands of many instructions so that
1925 * we could handle out of order channels, but don't yet.
1926 */
1927
1928 for (unsigned i = 0; i < 4; i++) {
1929 if (dst.writemask & (1 << i)) {
1930 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1931 return false;
1932
1933 if (BRW_GET_SWZ(src.swizzle, i) != i)
1934 return false;
1935 }
1936 }
1937
1938 /* Success! Rewrite the instruction. */
1939 last_rhs_inst->dst.file = dst.file;
1940 last_rhs_inst->dst.reg = dst.reg;
1941 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1942 last_rhs_inst->dst.reladdr = dst.reladdr;
1943 last_rhs_inst->dst.writemask &= dst.writemask;
1944
1945 return true;
1946 }
1947
1948 void
1949 vec4_visitor::visit(ir_assignment *ir)
1950 {
1951 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1952 uint32_t predicate = BRW_PREDICATE_NONE;
1953
1954 if (!ir->lhs->type->is_scalar() &&
1955 !ir->lhs->type->is_vector()) {
1956 ir->rhs->accept(this);
1957 src_reg src = this->result;
1958
1959 if (ir->condition) {
1960 emit_bool_to_cond_code(ir->condition, &predicate);
1961 }
1962
1963 /* emit_block_move doesn't account for swizzles in the source register.
1964 * This should be ok, since the source register is a structure, array,
1965 * or matrix, and those can't carry arbitrary swizzles. But double-check to be sure.
1966 */
1967 assert(src.swizzle ==
1968 (ir->rhs->type->is_matrix()
1969 ? swizzle_for_size(ir->rhs->type->vector_elements)
1970 : BRW_SWIZZLE_NOOP));
1971
1972 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1973 return;
1974 }
1975
1976 /* Now we're down to just a scalar/vector with writemasks. */
1977 int i;
1978
1979 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1980 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1981
1982 ir->rhs->accept(this);
1983
1984 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1985
1986 src_reg src = this->result;
1987
1988 int swizzles[4];
1989 int first_enabled_chan = 0;
1990 int src_chan = 0;
1991
1992 assert(ir->lhs->type->is_vector() ||
1993 ir->lhs->type->is_scalar());
1994 dst.writemask = ir->write_mask;
1995
1996 for (int i = 0; i < 4; i++) {
1997 if (dst.writemask & (1 << i)) {
1998 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1999 break;
2000 }
2001 }
2002
2003 /* Swizzle a small RHS vector into the channels being written.
2004 *
2005 * GLSL IR treats write_mask as dictating how many channels are
2006 * present on the RHS, while in our instructions we need to make
2007 * those channels appear in the slots of the vec4 they're written to.
2008 */
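/* For example, with dst.writemask = WRITEMASK_ZW and src.swizzle = XYYY
 * (a two-channel RHS replicated out), the loop below produces YYXY: the
 * disabled X/Y slots just repeat the first enabled channel, while the
 * RHS's first two channels are routed into the Z and W slots being written.
 */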
2009 for (int i = 0; i < 4; i++) {
2010 if (dst.writemask & (1 << i))
2011 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2012 else
2013 swizzles[i] = first_enabled_chan;
2014 }
2015 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2016 swizzles[2], swizzles[3]);
2017
2018 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2019 return;
2020 }
2021
2022 if (ir->condition) {
2023 emit_bool_to_cond_code(ir->condition, &predicate);
2024 }
2025
2026 for (i = 0; i < type_size(ir->lhs->type); i++) {
2027 vec4_instruction *inst = emit(MOV(dst, src));
2028 inst->predicate = predicate;
2029
2030 dst.reg_offset++;
2031 src.reg_offset++;
2032 }
2033 }
2034
2035 void
2036 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2037 {
2038 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2039 foreach_list(node, &ir->components) {
2040 ir_constant *field_value = (ir_constant *)node;
2041
2042 emit_constant_values(dst, field_value);
2043 }
2044 return;
2045 }
2046
2047 if (ir->type->is_array()) {
2048 for (unsigned int i = 0; i < ir->type->length; i++) {
2049 emit_constant_values(dst, ir->array_elements[i]);
2050 }
2051 return;
2052 }
2053
2054 if (ir->type->is_matrix()) {
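      /* Each matrix column occupies its own register; write each column one
       * float component at a time.
       */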
2055 for (int i = 0; i < ir->type->matrix_columns; i++) {
2056 float *vec = &ir->value.f[i * ir->type->vector_elements];
2057
2058 for (int j = 0; j < ir->type->vector_elements; j++) {
2059 dst->writemask = 1 << j;
2060 dst->type = BRW_REGISTER_TYPE_F;
2061
2062 emit(MOV(*dst, src_reg(vec[j])));
2063 }
2064 dst->reg_offset++;
2065 }
2066 return;
2067 }
2068
2069 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2070
2071 for (int i = 0; i < ir->type->vector_elements; i++) {
2072 if (!(remaining_writemask & (1 << i)))
2073 continue;
2074
2075 dst->writemask = 1 << i;
2076 dst->type = brw_type_for_base_type(ir->type);
2077
2078 /* Find other components that match the one we're about to
2079 * write. Emits fewer instructions for things like vec4(0.5,
2080 * 1.5, 1.5, 1.5).
2081 */
2082 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2083 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2084 if (ir->value.b[i] == ir->value.b[j])
2085 dst->writemask |= (1 << j);
2086 } else {
2087 /* u, i, and f storage all line up, so no need for a
2088 * switch case for comparing each type.
2089 */
2090 if (ir->value.u[i] == ir->value.u[j])
2091 dst->writemask |= (1 << j);
2092 }
2093 }
2094
2095 switch (ir->type->base_type) {
2096 case GLSL_TYPE_FLOAT:
2097 emit(MOV(*dst, src_reg(ir->value.f[i])));
2098 break;
2099 case GLSL_TYPE_INT:
2100 emit(MOV(*dst, src_reg(ir->value.i[i])));
2101 break;
2102 case GLSL_TYPE_UINT:
2103 emit(MOV(*dst, src_reg(ir->value.u[i])));
2104 break;
2105 case GLSL_TYPE_BOOL:
2106 emit(MOV(*dst, src_reg(ir->value.b[i])));
2107 break;
2108 default:
2109 assert(!"Non-float/uint/int/bool constant");
2110 break;
2111 }
2112
2113 remaining_writemask &= ~dst->writemask;
2114 }
2115 dst->reg_offset++;
2116 }
2117
2118 void
2119 vec4_visitor::visit(ir_constant *ir)
2120 {
2121 dst_reg dst = dst_reg(this, ir->type);
2122 this->result = src_reg(dst);
2123
2124 emit_constant_values(&dst, ir);
2125 }
2126
2127 void
2128 vec4_visitor::visit(ir_call *ir)
2129 {
2130 assert(!"not reached");
2131 }
2132
2133 void
2134 vec4_visitor::visit(ir_texture *ir)
2135 {
2136 int sampler =
2137 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2138
2139 /* Should be lowered by do_lower_texture_projection */
2140 assert(!ir->projector);
2141
2142 /* Generate code to compute all the subexpression trees. This has to be
2143 * done before loading any values into MRFs for the sampler message since
2144 * generating these values may involve SEND messages that need the MRFs.
2145 */
2146 src_reg coordinate;
2147 if (ir->coordinate) {
2148 ir->coordinate->accept(this);
2149 coordinate = this->result;
2150 }
2151
2152 src_reg shadow_comparitor;
2153 if (ir->shadow_comparitor) {
2154 ir->shadow_comparitor->accept(this);
2155 shadow_comparitor = this->result;
2156 }
2157
2158 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2159 src_reg lod, dPdx, dPdy, sample_index;
2160 switch (ir->op) {
2161 case ir_tex:
2162 lod = src_reg(0.0f);
2163 lod_type = glsl_type::float_type;
2164 break;
2165 case ir_txf:
2166 case ir_txl:
2167 case ir_txs:
2168 ir->lod_info.lod->accept(this);
2169 lod = this->result;
2170 lod_type = ir->lod_info.lod->type;
2171 break;
2172 case ir_txf_ms:
2173 ir->lod_info.sample_index->accept(this);
2174 sample_index = this->result;
2175 sample_index_type = ir->lod_info.sample_index->type;
2176 break;
2177 case ir_txd:
2178 ir->lod_info.grad.dPdx->accept(this);
2179 dPdx = this->result;
2180
2181 ir->lod_info.grad.dPdy->accept(this);
2182 dPdy = this->result;
2183
2184 lod_type = ir->lod_info.grad.dPdx->type;
2185 break;
2186 case ir_txb:
2187 case ir_lod:
2188 break;
2189 }
2190
2191 vec4_instruction *inst = NULL;
2192 switch (ir->op) {
2193 case ir_tex:
2194 case ir_txl:
2195 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2196 break;
2197 case ir_txd:
2198 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2199 break;
2200 case ir_txf:
2201 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2202 break;
2203 case ir_txf_ms:
2204 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2205 break;
2206 case ir_txs:
2207 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2208 break;
2209 case ir_txb:
2210 assert(!"TXB is not valid for vertex shaders.");
2211 break;
2212 case ir_lod:
2213 assert(!"LOD is not valid for vertex shaders.");
2214 break;
2215 }
2216
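   /* txf applies its offset by adding it directly into the texel coordinate
    * below, so only the other operations route a constant offset through the
    * message header.
    */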
2217 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2218
2219 /* Texel offsets go in the message header; Gen4 also requires headers. */
2220 inst->header_present = use_texture_offset || intel->gen < 5;
2221 inst->base_mrf = 2;
2222 inst->mlen = inst->header_present + 1; /* always at least one */
2223 inst->sampler = sampler;
2224 inst->dst = dst_reg(this, ir->type);
2225 inst->dst.writemask = WRITEMASK_XYZW;
2226 inst->shadow_compare = ir->shadow_comparitor != NULL;
2227
2228 if (use_texture_offset)
2229 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2230
2231 /* MRF for the first parameter */
2232 int param_base = inst->base_mrf + inst->header_present;
2233
2234 if (ir->op == ir_txs) {
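      /* The txs LOD parameter goes in the .w channel of the first parameter
       * on gen4 and in .x on gen5+.
       */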
2235 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2236 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2237 } else {
2238 int i, coord_mask = 0, zero_mask = 0;
2239 /* Load the coordinate */
2240 /* FINISHME: gl_clamp_mask and saturate */
2241 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2242 coord_mask |= (1 << i);
2243 for (; i < 4; i++)
2244 zero_mask |= (1 << i);
2245
2246 if (ir->offset && ir->op == ir_txf) {
2247 /* It appears that the ld instruction used for txf does its
2248 * address bounds check before adding in the offset. To work
2249 * around this, just add the integer offset to the integer
2250 * texel coordinate, and don't put the offset in the header.
2251 */
2252 ir_constant *offset = ir->offset->as_constant();
2253 assert(offset);
2254
2255 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2256 src_reg src = coordinate;
2257 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2258 BRW_GET_SWZ(src.swizzle, j),
2259 BRW_GET_SWZ(src.swizzle, j),
2260 BRW_GET_SWZ(src.swizzle, j));
2261 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2262 src, offset->value.i[j]));
2263 }
2264 } else {
2265 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2266 coordinate));
2267 }
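      /* Zero out the coordinate channels that the coordinate itself doesn't
       * cover.
       */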
2268 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2269 src_reg(0)));
2270 /* Load the shadow comparitor */
2271 if (ir->shadow_comparitor) {
2272 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2273 WRITEMASK_X),
2274 shadow_comparitor));
2275 inst->mlen++;
2276 }
2277
2278 /* Load the LOD info */
2279 if (ir->op == ir_tex || ir->op == ir_txl) {
2280 int mrf, writemask;
2281 if (intel->gen >= 5) {
2282 mrf = param_base + 1;
2283 if (ir->shadow_comparitor) {
2284 writemask = WRITEMASK_Y;
2285 /* mlen already incremented */
2286 } else {
2287 writemask = WRITEMASK_X;
2288 inst->mlen++;
2289 }
2290 } else /* intel->gen == 4 */ {
2291 mrf = param_base;
2292 writemask = WRITEMASK_Z;
2293 }
2294 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2295 } else if (ir->op == ir_txf) {
2296 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2297 } else if (ir->op == ir_txf_ms) {
2298 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2299 sample_index));
2300 inst->mlen++;
2301
2302 /* On Gen7, there is an additional MCS parameter here after SI,
2303 * but we don't bother to emit it since it's always zero. If
2304 * we start supporting texturing from CMS surfaces, this will have
2305 * to change.
2306 */
2307 } else if (ir->op == ir_txd) {
2308 const glsl_type *type = lod_type;
2309
2310 if (intel->gen >= 5) {
2311 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2312 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2313 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2314 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2315 inst->mlen++;
2316
2317 if (ir->type->vector_elements == 3) {
2318 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2319 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2320 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2321 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2322 inst->mlen++;
2323 }
2324 } else /* intel->gen == 4 */ {
2325 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2326 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2327 inst->mlen += 2;
2328 }
2329 }
2330 }
2331
2332 emit(inst);
2333
2334 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2335 * faces * layers, but the spec requires just layers, so divide Z by 6.
2336 */
2337 if (ir->op == ir_txs) {
2338 glsl_type const *type = ir->sampler->type;
2339 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2340 type->sampler_array) {
2341 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2342 with_writemask(inst->dst, WRITEMASK_Z),
2343 src_reg(inst->dst), src_reg(6));
2344 }
2345 }
2346
2347 swizzle_result(ir, src_reg(inst->dst), sampler);
2348 }
2349
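/* A brief description added here, based on what the function below does:
 * it applies the texture swizzle from the program key (e.g. the
 * GL_TEXTURE_SWIZZLE_* state) to the raw sampler result, copying the
 * selected channels and filling constant channels with 0.0 or 1.0.
 */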
2350 void
2351 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2352 {
2353 int s = key->tex.swizzles[sampler];
2354
2355 this->result = src_reg(this, ir->type);
2356 dst_reg swizzled_result(this->result);
2357
2358 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2359 || s == SWIZZLE_NOOP) {
2360 emit(MOV(swizzled_result, orig_val));
2361 return;
2362 }
2363
2364 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2365 int swizzle[4];
2366
2367 for (int i = 0; i < 4; i++) {
2368 switch (GET_SWZ(s, i)) {
2369 case SWIZZLE_ZERO:
2370 zero_mask |= (1 << i);
2371 break;
2372 case SWIZZLE_ONE:
2373 one_mask |= (1 << i);
2374 break;
2375 default:
2376 copy_mask |= (1 << i);
2377 swizzle[i] = GET_SWZ(s, i);
2378 break;
2379 }
2380 }
2381
2382 if (copy_mask) {
2383 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2384 swizzled_result.writemask = copy_mask;
2385 emit(MOV(swizzled_result, orig_val));
2386 }
2387
2388 if (zero_mask) {
2389 swizzled_result.writemask = zero_mask;
2390 emit(MOV(swizzled_result, src_reg(0.0f)));
2391 }
2392
2393 if (one_mask) {
2394 swizzled_result.writemask = one_mask;
2395 emit(MOV(swizzled_result, src_reg(1.0f)));
2396 }
2397 }
2398
2399 void
2400 vec4_visitor::visit(ir_return *ir)
2401 {
2402 assert(!"not reached");
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_discard *ir)
2407 {
2408 assert(!"not reached");
2409 }
2410
2411 void
2412 vec4_visitor::visit(ir_if *ir)
2413 {
2414 /* Don't point the annotation at the if statement, because then the whole
2415 * statement, including the then and else blocks, would get printed with it.
2416 */
2417 this->base_ir = ir->condition;
2418
2419 if (intel->gen == 6) {
2420 emit_if_gen6(ir);
2421 } else {
2422 uint32_t predicate;
2423 emit_bool_to_cond_code(ir->condition, &predicate);
2424 emit(IF(predicate));
2425 }
2426
2427 visit_instructions(&ir->then_instructions);
2428
2429 if (!ir->else_instructions.is_empty()) {
2430 this->base_ir = ir->condition;
2431 emit(BRW_OPCODE_ELSE);
2432
2433 visit_instructions(&ir->else_instructions);
2434 }
2435
2436 this->base_ir = ir->condition;
2437 emit(BRW_OPCODE_ENDIF);
2438 }
2439
2440 void
2441 vec4_visitor::emit_ndc_computation()
2442 {
2443 /* Get the position */
2444 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2445
2446 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2447 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2448 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2449
2450 current_annotation = "NDC";
2451 dst_reg ndc_w = ndc;
2452 ndc_w.writemask = WRITEMASK_W;
2453 src_reg pos_w = pos;
2454 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2455 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2456
2457 dst_reg ndc_xyz = ndc;
2458 ndc_xyz.writemask = WRITEMASK_XYZ;
2459
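   /* ndc.w now holds 1/pos.w, so this produces (x/w, y/w, z/w). */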
2460 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2461 }
2462
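/* A brief description added here, based on what the function below does:
 * it writes the VUE header slot holding the point size and clip flags.
 * On gen4/5 the packed word (point width, clip flags, workaround bit) goes
 * in .w; on gen6+ the register is zeroed and gl_PointSize, if written by
 * the shader, goes in .w.
 */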
2463 void
2464 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2465 {
2466 if (intel->gen < 6 &&
2467 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2468 key->userclip_active || brw->has_negative_rhw_bug)) {
2469 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2470 dst_reg header1_w = header1;
2471 header1_w.writemask = WRITEMASK_W;
2472 GLuint i;
2473
2474 emit(MOV(header1, 0u));
2475
2476 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2477 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2478
2479 current_annotation = "Point size";
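	 /* Scale the float point size by 2^11 so it lands in the 11-bit
	  * fixed-point point-width field starting at bit 8 of the header
	  * DWord, then mask off everything outside that field.
	  */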
2480 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2481 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2482 }
2483
2484 current_annotation = "Clipping flags";
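      /* For each user clip plane, set bit i of the header if the position
       * is on the negative side of that plane (DP4 result < 0).
       */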
2485 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2486 vec4_instruction *inst;
2487
2488 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2489 src_reg(this->userplane[i])));
2490 inst->conditional_mod = BRW_CONDITIONAL_L;
2491
2492 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2493 inst->predicate = BRW_PREDICATE_NORMAL;
2494 }
2495
2496 /* i965 clipping workaround:
2497 * 1) Test for negative RHW
2498 * 2) If set,
2499 * set ndc = (0,0,0,0)
2500 * set ucp[6] = 1
2501 *
2502 * Later, clipping will detect ucp[6] and ensure the primitive is
2503 * clipped against all fixed planes.
2504 */
2505 if (brw->has_negative_rhw_bug) {
2506 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2507 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2508 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2509 vec4_instruction *inst;
2510 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2511 inst->predicate = BRW_PREDICATE_NORMAL;
2512 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2513 inst->predicate = BRW_PREDICATE_NORMAL;
2514 }
2515
2516 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2517 } else if (intel->gen < 6) {
2518 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2519 } else {
2520 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2521 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2522 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2523 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2524 }
2525 }
2526 }
2527
2528 void
2529 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2530 {
2531 if (intel->gen < 6) {
2532 /* Clip distance slots are set aside in gen5, but they are not used. It
2533 * is not clear whether we actually need to set aside space for them,
2534 * but the performance cost is negligible.
2535 */
2536 return;
2537 }
2538
2539 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2540 *
2541 * "If a linked set of shaders forming the vertex stage contains no
2542 * static write to gl_ClipVertex or gl_ClipDistance, but the
2543 * application has requested clipping against user clip planes through
2544 * the API, then the coordinate written to gl_Position is used for
2545 * comparison against the user clip planes."
2546 *
2547 * This function is only called if the shader didn't write to
2548 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2549 * if the user wrote to it; otherwise we use gl_Position.
2550 */
2551 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2552 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2553 clip_vertex = VARYING_SLOT_POS;
2554 }
2555
2556 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2557 ++i) {
2558 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2559 src_reg(output_reg[clip_vertex]),
2560 src_reg(this->userplane[i + offset])));
2561 }
2562 }
2563
2564 void
2565 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2566 {
2567 assert (varying < VARYING_SLOT_MAX);
2568 reg.type = output_reg[varying].type;
2569 current_annotation = output_reg_annotation[varying];
2570 /* Copy the register, saturating if necessary */
2571 vec4_instruction *inst = emit(MOV(reg,
2572 src_reg(output_reg[varying])));
2573 if ((varying == VARYING_SLOT_COL0 ||
2574 varying == VARYING_SLOT_COL1 ||
2575 varying == VARYING_SLOT_BFC0 ||
2576 varying == VARYING_SLOT_BFC1) &&
2577 key->clamp_vertex_color) {
2578 inst->saturate = true;
2579 }
2580 }
2581
2582 void
2583 vec4_visitor::emit_urb_slot(int mrf, int varying)
2584 {
2585 struct brw_reg hw_reg = brw_message_reg(mrf);
2586 dst_reg reg = dst_reg(MRF, mrf);
2587 reg.type = BRW_REGISTER_TYPE_F;
2588
2589 switch (varying) {
2590 case VARYING_SLOT_PSIZ:
2591 /* PSIZ is always in slot 0, and is coupled with other flags. */
2592 current_annotation = "indices, point width, clip flags";
2593 emit_psiz_and_flags(hw_reg);
2594 break;
2595 case BRW_VARYING_SLOT_NDC:
2596 current_annotation = "NDC";
2597 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2598 break;
2599 case BRW_VARYING_SLOT_POS_DUPLICATE:
2600 case VARYING_SLOT_POS:
2601 current_annotation = "gl_Position";
2602 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2603 break;
2604 case VARYING_SLOT_CLIP_DIST0:
2605 case VARYING_SLOT_CLIP_DIST1:
2606 if (this->key->uses_clip_distance) {
2607 emit_generic_urb_slot(reg, varying);
2608 } else {
2609 current_annotation = "user clip distances";
2610 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2611 }
2612 break;
2613 case VARYING_SLOT_EDGE:
2614 /* This is present when doing unfilled polygons. We're supposed to copy
2615 * the edge flag from the user-provided vertex array
2616 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2617 * of that attribute (starts as 1.0f). This is then used in clipping to
2618 * determine which edges should be drawn as wireframe.
2619 */
2620 current_annotation = "edge flag";
2621 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2622 glsl_type::float_type, WRITEMASK_XYZW))));
2623 break;
2624 case BRW_VARYING_SLOT_PAD:
2625 /* No need to write to this slot */
2626 break;
2627 default:
2628 emit_generic_urb_slot(reg, varying);
2629 break;
2630 }
2631 }
2632
2633 static int
2634 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2635 {
2636 struct intel_context *intel = &brw->intel;
2637
2638 if (intel->gen >= 6) {
2639 /* URB data written (does not include the message header reg) must
2640 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2641 * section 5.4.3.2.2: URB_INTERLEAVED.
2642 *
2643 * URB entries are allocated on a multiple of 1024 bits, so an
2644 * extra 128 bits written here to make the end align to 256 is
2645 * no problem.
2646 */
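      /* mlen includes the one header register, so keeping mlen odd keeps
       * the written data length even.
       */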
2647 if ((mlen % 2) != 1)
2648 mlen++;
2649 }
2650
2651 return mlen;
2652 }
2653
2654 void
2655 vec4_vs_visitor::emit_urb_write_header(int mrf)
2656 {
2657 /* No need to do anything for VS; an implied write to this MRF will be
2658 * performed by VS_OPCODE_URB_WRITE.
2659 */
2660 (void) mrf;
2661 }
2662
2663 vec4_instruction *
2664 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2665 {
2666 /* For VS, the URB writes end the thread. */
2667 if (complete) {
2668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2669 emit_shader_time_end();
2670 }
2671
2672 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2673 inst->eot = complete;
2674
2675 return inst;
2676 }
2677
2678 /**
2679 * Generates the VUE payload plus the necessary URB write instructions to
2680 * output it.
2681 *
2682 * The VUE layout is documented in Volume 2a.
2683 */
2684 void
2685 vec4_visitor::emit_vertex()
2686 {
2687 /* MRF 0 is reserved for the debugger, so start with message header
2688 * in MRF 1.
2689 */
2690 int base_mrf = 1;
2691 int mrf = base_mrf;
2692 /* In the process of generating our URB write message contents, we
2693 * may need to unspill a register or load from an array. Those
2694 * reads would use MRFs 14-15.
2695 */
2696 int max_usable_mrf = 13;
2697
2698 /* The following assertion verifies that max_usable_mrf causes an
2699 * even-numbered amount of URB write data, which will meet gen6's
2700 * requirements for length alignment.
2701 */
2702 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2703
2704 /* First mrf is the g0-based message header containing URB handles and
2705 * such.
2706 */
2707 emit_urb_write_header(mrf++);
2708
2709 if (intel->gen < 6) {
2710 emit_ndc_computation();
2711 }
2712
2713 /* Set up the VUE data for the first URB write */
2714 int slot;
2715 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2716 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2717
2718 /* If this was max_usable_mrf, we can't fit anything more into this URB
2719 * WRITE.
2720 */
2721 if (mrf > max_usable_mrf) {
2722 slot++;
2723 break;
2724 }
2725 }
2726
2727 bool complete = slot >= prog_data->vue_map.num_slots;
2728 current_annotation = "URB write";
2729 vec4_instruction *inst = emit_urb_write_opcode(complete);
2730 inst->base_mrf = base_mrf;
2731 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2732
2733 /* Optional second URB write */
2734 if (!complete) {
2735 mrf = base_mrf + 1;
2736
2737 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2738 assert(mrf < max_usable_mrf);
2739
2740 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2741 }
2742
2743 current_annotation = "URB write";
2744 inst = emit_urb_write_opcode(true /* complete */);
2745 inst->base_mrf = base_mrf;
2746 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2747 /* URB destination offset. In the previous write, we used MRFs 2-13
2748 * for payload data (MRF 1 held the header), so 12 regs. URB offset is
2749 * in URB row increments, and each of our MRFs is half of one of
2750 * those, since we're doing interleaved writes.
2751 */
2752 inst->offset = (max_usable_mrf - base_mrf) / 2;
2753 }
2754 }
2755
2756 void
2757 vec4_vs_visitor::emit_thread_end()
2758 {
2759 /* For VS, we always end the thread by emitting a single vertex.
2760 * emit_urb_write_opcode() will take care of setting the eot flag on the
2761 * SEND instruction.
2762 */
2763 emit_vertex();
2764 }
2765
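/* A brief description added here, based on what the function below does:
 * it computes the offset to put in the scratch read/write message for an
 * access at @reg_offset, emitting the address math before @inst when the
 * access is relatively addressed.
 */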
2766 src_reg
2767 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2768 src_reg *reladdr, int reg_offset)
2769 {
2770 /* Because we store the values to scratch interleaved like our
2771 * vertex data, we need to scale the vec4 index by 2.
2772 */
2773 int message_header_scale = 2;
2774
2775 /* Pre-gen6, the message header uses byte offsets instead of vec4
2776 * (16-byte) offset units.
2777 */
2778 if (intel->gen < 6)
2779 message_header_scale *= 16;
2780
2781 if (reladdr) {
2782 src_reg index = src_reg(this, glsl_type::int_type);
2783
2784 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2785 emit_before(inst, MUL(dst_reg(index),
2786 index, src_reg(message_header_scale)));
2787
2788 return index;
2789 } else {
2790 return src_reg(reg_offset * message_header_scale);
2791 }
2792 }
2793
2794 src_reg
2795 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2796 src_reg *reladdr, int reg_offset)
2797 {
2798 if (reladdr) {
2799 src_reg index = src_reg(this, glsl_type::int_type);
2800
2801 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2802
2803 /* Pre-gen6, the message header uses byte offsets instead of vec4
2804 * (16-byte) offset units.
2805 */
2806 if (intel->gen < 6) {
2807 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2808 }
2809
2810 return index;
2811 } else {
2812 int message_header_scale = intel->gen < 6 ? 16 : 1;
2813 return src_reg(reg_offset * message_header_scale);
2814 }
2815 }
2816
2817 /**
2818 * Emits an instruction before @inst to load the value named by @orig_src
2819 * from scratch space at @base_offset to @temp.
2820 *
2821 * @base_offset is measured in 32-byte units (the size of a register).
2822 */
2823 void
2824 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2825 dst_reg temp, src_reg orig_src,
2826 int base_offset)
2827 {
2828 int reg_offset = base_offset + orig_src.reg_offset;
2829 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2830
2831 emit_before(inst, SCRATCH_READ(temp, index));
2832 }
2833
2834 /**
2835 * Emits an instruction after @inst to store the value to be written
2836 * to @orig_dst to scratch space at @base_offset, from @temp.
2837 *
2838 * @base_offset is measured in 32-byte units (the size of a register).
2839 */
2840 void
2841 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2842 {
2843 int reg_offset = base_offset + inst->dst.reg_offset;
2844 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2845
2846 /* Create a temporary register to store *inst's result in.
2847 *
2848 * We have to be careful in MOVing from our temporary result register in
2849 * the scratch write. If we swizzle from channels of the temporary that
2850 * weren't initialized, it will confuse live interval analysis, which will
2851 * make spilling fail to make progress.
2852 */
2853 src_reg temp = src_reg(this, glsl_type::vec4_type);
2854 temp.type = inst->dst.type;
2855 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2856 int swizzles[4];
2857 for (int i = 0; i < 4; i++)
2858 if (inst->dst.writemask & (1 << i))
2859 swizzles[i] = i;
2860 else
2861 swizzles[i] = first_writemask_chan;
2862 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2863 swizzles[2], swizzles[3]);
2864
2865 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2866 inst->dst.writemask));
2867 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2868 write->predicate = inst->predicate;
2869 write->ir = inst->ir;
2870 write->annotation = inst->annotation;
2871 inst->insert_after(write);
2872
2873 inst->dst.file = temp.file;
2874 inst->dst.reg = temp.reg;
2875 inst->dst.reg_offset = temp.reg_offset;
2876 inst->dst.reladdr = NULL;
2877 }
2878
2879 /**
2880 * We can't generally support array access in GRF space, because a
2881 * single instruction's destination can only span 2 contiguous
2882 * registers. So, we send all GRF arrays that get variable index
2883 * access to scratch space.
2884 */
2885 void
2886 vec4_visitor::move_grf_array_access_to_scratch()
2887 {
2888 int scratch_loc[this->virtual_grf_count];
2889
2890 for (int i = 0; i < this->virtual_grf_count; i++) {
2891 scratch_loc[i] = -1;
2892 }
2893
2894 /* First, calculate the set of virtual GRFs that need to be punted
2895 * to scratch due to having any array access on them, and where in
2896 * scratch.
2897 */
2898 foreach_list(node, &this->instructions) {
2899 vec4_instruction *inst = (vec4_instruction *)node;
2900
2901 if (inst->dst.file == GRF && inst->dst.reladdr &&
2902 scratch_loc[inst->dst.reg] == -1) {
2903 scratch_loc[inst->dst.reg] = c->last_scratch;
2904 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2905 }
2906
2907 for (int i = 0 ; i < 3; i++) {
2908 src_reg *src = &inst->src[i];
2909
2910 if (src->file == GRF && src->reladdr &&
2911 scratch_loc[src->reg] == -1) {
2912 scratch_loc[src->reg] = c->last_scratch;
2913 c->last_scratch += this->virtual_grf_sizes[src->reg];
2914 }
2915 }
2916 }
2917
2918 /* Now, for anything that will be accessed through scratch, rewrite
2919 * it to load/store. Note that this is a _safe list walk, because
2920 * we may generate a new scratch_write instruction after the one
2921 * we're processing.
2922 */
2923 foreach_list_safe(node, &this->instructions) {
2924 vec4_instruction *inst = (vec4_instruction *)node;
2925
2926 /* Set up the annotation tracking for newly generated instructions. */
2927 base_ir = inst->ir;
2928 current_annotation = inst->annotation;
2929
2930 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2931 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2932 }
2933
2934 for (int i = 0 ; i < 3; i++) {
2935 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2936 continue;
2937
2938 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2939
2940 emit_scratch_read(inst, temp, inst->src[i],
2941 scratch_loc[inst->src[i].reg]);
2942
2943 inst->src[i].file = temp.file;
2944 inst->src[i].reg = temp.reg;
2945 inst->src[i].reg_offset = temp.reg_offset;
2946 inst->src[i].reladdr = NULL;
2947 }
2948 }
2949 }
2950
2951 /**
2952 * Emits an instruction before @inst to load the value named by @orig_src
2953 * from the pull constant buffer (surface) at @base_offset to @temp.
2954 */
2955 void
2956 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2957 dst_reg temp, src_reg orig_src,
2958 int base_offset)
2959 {
2960 int reg_offset = base_offset + orig_src.reg_offset;
2961 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2962 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2963 vec4_instruction *load;
2964
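   /* Gen7+ can take the offset straight from a GRF; earlier generations
    * pass it through an MRF-based message (base_mrf 14).
    */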
2965 if (intel->gen >= 7) {
2966 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2967 grf_offset.type = offset.type;
2968 emit_before(inst, MOV(grf_offset, offset));
2969
2970 load = new(mem_ctx) vec4_instruction(this,
2971 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2972 temp, index, src_reg(grf_offset));
2973 } else {
2974 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2975 temp, index, offset);
2976 load->base_mrf = 14;
2977 load->mlen = 1;
2978 }
2979 emit_before(inst, load);
2980 }
2981
2982 /**
2983 * Implements array access of uniforms by inserting a
2984 * PULL_CONSTANT_LOAD instruction.
2985 *
2986 * Unlike temporary GRF array access (where we don't support it due to
2987 * the difficulty of doing relative addressing on instruction
2988 * destinations), we could potentially do array access of uniforms
2989 * that were loaded in GRF space as push constants. In real-world
2990 * usage we've seen, though, the arrays being used are always larger
2991 * than we could load as push constants, so just always move all
2992 * uniform array access out to a pull constant buffer.
2993 */
2994 void
2995 vec4_visitor::move_uniform_array_access_to_pull_constants()
2996 {
2997 int pull_constant_loc[this->uniforms];
2998
2999 for (int i = 0; i < this->uniforms; i++) {
3000 pull_constant_loc[i] = -1;
3001 }
3002
3003 /* Walk through and find array access of uniforms. Put a copy of that
3004 * uniform in the pull constant buffer.
3005 *
3006 * Note that we don't move constant-indexed accesses to arrays. No
3007 * testing has been done of the performance impact of this choice.
3008 */
3009 foreach_list_safe(node, &this->instructions) {
3010 vec4_instruction *inst = (vec4_instruction *)node;
3011
3012 for (int i = 0 ; i < 3; i++) {
3013 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3014 continue;
3015
3016 int uniform = inst->src[i].reg;
3017
3018 /* If this array isn't already present in the pull constant buffer,
3019 * add it.
3020 */
3021 if (pull_constant_loc[uniform] == -1) {
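	    /* param/pull_param are tracked per float component, so the vec4
	     * slot index for this uniform is nr_pull_params / 4.
	     */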
3022 const float **values = &prog_data->param[uniform * 4];
3023
3024 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3025
3026 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3027 prog_data->pull_param[prog_data->nr_pull_params++]
3028 = values[j];
3029 }
3030 }
3031
3032 /* Set up the annotation tracking for newly generated instructions. */
3033 base_ir = inst->ir;
3034 current_annotation = inst->annotation;
3035
3036 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3037
3038 emit_pull_constant_load(inst, temp, inst->src[i],
3039 pull_constant_loc[uniform]);
3040
3041 inst->src[i].file = temp.file;
3042 inst->src[i].reg = temp.reg;
3043 inst->src[i].reg_offset = temp.reg_offset;
3044 inst->src[i].reladdr = NULL;
3045 }
3046 }
3047
3048 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3049 * no need to track them as larger-than-vec4 objects. This will be
3050 * relied on in cutting out unused uniform vectors from push
3051 * constants.
3052 */
3053 split_uniform_registers();
3054 }
3055
3056 void
3057 vec4_visitor::resolve_ud_negate(src_reg *reg)
3058 {
3059 if (reg->type != BRW_REGISTER_TYPE_UD ||
3060 !reg->negate)
3061 return;
3062
3063 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3064 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3065 *reg = temp;
3066 }
3067
3068 vec4_visitor::vec4_visitor(struct brw_context *brw,
3069 struct brw_vec4_compile *c,
3070 struct gl_program *prog,
3071 const struct brw_vec4_prog_key *key,
3072 struct brw_vec4_prog_data *prog_data,
3073 struct gl_shader_program *shader_prog,
3074 struct brw_shader *shader,
3075 void *mem_ctx,
3076 bool debug_flag)
3077 : debug_flag(debug_flag)
3078 {
3079 this->brw = brw;
3080 this->intel = &brw->intel;
3081 this->ctx = &intel->ctx;
3082 this->shader_prog = shader_prog;
3083 this->shader = shader;
3084
3085 this->mem_ctx = mem_ctx;
3086 this->failed = false;
3087
3088 this->base_ir = NULL;
3089 this->current_annotation = NULL;
3090 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3091
3092 this->c = c;
3093 this->prog = prog;
3094 this->key = key;
3095 this->prog_data = prog_data;
3096
3097 this->variable_ht = hash_table_ctor(0,
3098 hash_table_pointer_hash,
3099 hash_table_pointer_compare);
3100
3101 this->virtual_grf_def = NULL;
3102 this->virtual_grf_use = NULL;
3103 this->virtual_grf_sizes = NULL;
3104 this->virtual_grf_count = 0;
3105 this->virtual_grf_reg_map = NULL;
3106 this->virtual_grf_reg_count = 0;
3107 this->virtual_grf_array_size = 0;
3108 this->live_intervals_valid = false;
3109
3110 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3111
3112 this->uniforms = 0;
3113 }
3114
3115 vec4_visitor::~vec4_visitor()
3116 {
3117 hash_table_dtor(this->variable_ht);
3118 }
3119
3120
3121 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3122 struct brw_vs_compile *vs_compile,
3123 struct brw_vs_prog_data *vs_prog_data,
3124 struct gl_shader_program *prog,
3125 struct brw_shader *shader,
3126 void *mem_ctx)
3127 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3128 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3129 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3130 vs_compile(vs_compile),
3131 vs_prog_data(vs_prog_data)
3132 {
3133 }
3134
3135
3136 void
3137 vec4_visitor::fail(const char *format, ...)
3138 {
3139 va_list va;
3140 char *msg;
3141
3142 if (failed)
3143 return;
3144
3145 failed = true;
3146
3147 va_start(va, format);
3148 msg = ralloc_vasprintf(mem_ctx, format, va);
3149 va_end(va);
3150 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3151
3152 this->fail_msg = msg;
3153
3154 if (debug_flag) {
3155 fprintf(stderr, "%s", msg);
3156 }
3157 }
3158
3159 } /* namespace brw */