i965/vs: Fix textureGrad() with shadow samplers on Haswell.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 #define ALU3(op) \
111 vec4_instruction * \
112 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
113 { \
114 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
115 src0, src1, src2); \
116 }
117
118 ALU1(NOT)
119 ALU1(MOV)
120 ALU1(FRC)
121 ALU1(RNDD)
122 ALU1(RNDE)
123 ALU1(RNDZ)
124 ALU1(F32TO16)
125 ALU1(F16TO32)
126 ALU2(ADD)
127 ALU2(MUL)
128 ALU2(MACH)
129 ALU2(AND)
130 ALU2(OR)
131 ALU2(XOR)
132 ALU2(DP3)
133 ALU2(DP4)
134 ALU2(DPH)
135 ALU2(SHL)
136 ALU2(SHR)
137 ALU2(ASR)
138 ALU3(LRP)
139
140 /** Gen4 predicated IF. */
141 vec4_instruction *
142 vec4_visitor::IF(uint32_t predicate)
143 {
144 vec4_instruction *inst;
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
147 inst->predicate = predicate;
148
149 return inst;
150 }
151
152 /** Gen6+ IF with embedded comparison. */
153 vec4_instruction *
154 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
155 {
156 assert(intel->gen >= 6);
157
158 vec4_instruction *inst;
159
160 resolve_ud_negate(&src0);
161 resolve_ud_negate(&src1);
162
163 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
164 src0, src1);
165 inst->conditional_mod = condition;
166
167 return inst;
168 }
169
170 /**
171 * CMP: Sets the low bit of the destination channels with the result
172 * of the comparison, while the upper bits are undefined, and updates
173 * the flag register with the packed 16 bits of the result.
174 */
175 vec4_instruction *
176 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
177 {
178 vec4_instruction *inst;
179
180 /* original gen4 does type conversion to the destination type
181  * before comparison, producing garbage results for floating
182 * point comparisons.
183 */
184 if (intel->gen == 4) {
185 dst.type = src0.type;
186 if (dst.file == HW_REG)
187 dst.fixed_hw_reg.type = dst.type;
188 }
189
190 resolve_ud_negate(&src0);
191 resolve_ud_negate(&src1);
192
193 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
194 inst->conditional_mod = condition;
195
196 return inst;
197 }
198
199 vec4_instruction *
200 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
201 {
202 vec4_instruction *inst;
203
204 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
205 dst, index);
206 inst->base_mrf = 14;
207 inst->mlen = 2;
208
209 return inst;
210 }
211
212 vec4_instruction *
213 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
214 {
215 vec4_instruction *inst;
216
217 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
218 dst, src, index);
219 inst->base_mrf = 13;
220 inst->mlen = 3;
221
222 return inst;
223 }
224
225 void
226 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
227 {
228 static enum opcode dot_opcodes[] = {
229 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
230 };
231
232 emit(dot_opcodes[elements - 2], dst, src0, src1);
233 }
234
235 src_reg
236 vec4_visitor::fix_3src_operand(src_reg src)
237 {
238 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
239 * able to use vertical stride of zero to replicate the vec4 uniform, like
240 *
241 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
242 *
243 * But you can't, since vertical stride is always four in three-source
244 * instructions. Instead, insert a MOV instruction to do the replication so
245 * that the three-source instruction can consume it.
246 */
247
248 /* The MOV is only needed if the source is a uniform or immediate. */
249 if (src.file != UNIFORM && src.file != IMM)
250 return src;
251
252 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
253 expanded.type = src.type;
254 emit(MOV(expanded, src));
255 return src_reg(expanded);
256 }
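/* For illustration: the ir_triop_lrp case later in this file routes each LRP
 * operand through fix_3src_operand() first, so any uniform or immediate
 * argument is MOV'd into a temporary GRF that the three-source instruction
 * can read with its fixed vertical stride.
 */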
257
258 src_reg
259 vec4_visitor::fix_math_operand(src_reg src)
260 {
261 /* The gen6 math instruction ignores the source modifiers --
262 * swizzle, abs, negate, and at least some parts of the register
263 * region description.
264 *
265 * Rather than trying to enumerate all these cases, *always* expand the
266 * operand to a temp GRF for gen6.
267 *
268 * For gen7, keep the operand as-is, except if immediate, which gen7 still
269 * can't use.
270 */
271
272 if (intel->gen == 7 && src.file != IMM)
273 return src;
274
275 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
276 expanded.type = src.type;
277 emit(MOV(expanded, src));
278 return src_reg(expanded);
279 }
280
281 void
282 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
283 {
284 src = fix_math_operand(src);
285
286 if (dst.writemask != WRITEMASK_XYZW) {
287 /* The gen6 math instruction must be align1, so we can't do
288 * writemasks.
289 */
290 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
291
292 emit(opcode, temp_dst, src);
293
294 emit(MOV(dst, src_reg(temp_dst)));
295 } else {
296 emit(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
302 {
303 vec4_instruction *inst = emit(opcode, dst, src);
304 inst->base_mrf = 1;
305 inst->mlen = 1;
306 }
307
308 void
309 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
310 {
311 switch (opcode) {
312 case SHADER_OPCODE_RCP:
313 case SHADER_OPCODE_RSQ:
314 case SHADER_OPCODE_SQRT:
315 case SHADER_OPCODE_EXP2:
316 case SHADER_OPCODE_LOG2:
317 case SHADER_OPCODE_SIN:
318 case SHADER_OPCODE_COS:
319 break;
320 default:
321 assert(!"not reached: bad math opcode");
322 return;
323 }
324
325 if (intel->gen >= 6) {
326 return emit_math1_gen6(opcode, dst, src);
327 } else {
328 return emit_math1_gen4(opcode, dst, src);
329 }
330 }
331
332 void
333 vec4_visitor::emit_math2_gen6(enum opcode opcode,
334 dst_reg dst, src_reg src0, src_reg src1)
335 {
336 src0 = fix_math_operand(src0);
337 src1 = fix_math_operand(src1);
338
339 if (dst.writemask != WRITEMASK_XYZW) {
340 /* The gen6 math instruction must be align1, so we can't do
341 * writemasks.
342 */
343 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
344 temp_dst.type = dst.type;
345
346 emit(opcode, temp_dst, src0, src1);
347
348 emit(MOV(dst, src_reg(temp_dst)));
349 } else {
350 emit(opcode, dst, src0, src1);
351 }
352 }
353
354 void
355 vec4_visitor::emit_math2_gen4(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 vec4_instruction *inst = emit(opcode, dst, src0, src1);
359 inst->base_mrf = 1;
360 inst->mlen = 2;
361 }
362
363 void
364 vec4_visitor::emit_math(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 switch (opcode) {
368 case SHADER_OPCODE_POW:
369 case SHADER_OPCODE_INT_QUOTIENT:
370 case SHADER_OPCODE_INT_REMAINDER:
371 break;
372 default:
373 assert(!"not reached: unsupported binary math opcode");
374 return;
375 }
376
377 if (intel->gen >= 6) {
378 return emit_math2_gen6(opcode, dst, src0, src1);
379 } else {
380 return emit_math2_gen4(opcode, dst, src0, src1);
381 }
382 }
383
384 void
385 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
386 {
387 if (intel->gen < 7)
388 assert(!"ir_unop_pack_half_2x16 should be lowered");
389
390 assert(dst.type == BRW_REGISTER_TYPE_UD);
391 assert(src0.type == BRW_REGISTER_TYPE_F);
392
393 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
394 *
395 * Because this instruction does not have a 16-bit floating-point type,
396 * the destination data type must be Word (W).
397 *
398 * The destination must be DWord-aligned and specify a horizontal stride
399 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
400 * each destination channel and the upper word is not modified.
401 *
402 * The above restriction implies that the f32to16 instruction must use
403 * align1 mode, because only in align1 mode is it possible to specify
404 * horizontal stride. We choose here to defy the hardware docs and emit
405 * align16 instructions.
406 *
407 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
408 * instructions. I was partially successful in that the code passed all
409 * tests. However, the code was dubiously correct and fragile, and the
410 * tests were not harsh enough to probe that frailty. Not trusting the
411 * code, I chose instead to remain in align16 mode in defiance of the hw
412 * docs).
413 *
414 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
415 * simulator, emitting a f32to16 in align16 mode with UD as destination
416 * data type is safe. The behavior differs from that specified in the PRM
417 * in that the upper word of each destination channel is cleared to 0.
418 */
419
420 dst_reg tmp_dst(this, glsl_type::uvec2_type);
421 src_reg tmp_src(tmp_dst);
422
423 #if 0
424 /* Verify the undocumented behavior on which the following instructions
425 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
426 * then the result of the bit-or instruction below will be incorrect.
427 *
428 * You should inspect the disasm output in order to verify that the MOV is
429 * not optimized away.
430 */
431 emit(MOV(tmp_dst, src_reg(0x12345678u)));
432 #endif
433
434 /* Give tmp the form below, where "." means untouched.
435 *
436 * w z y x w z y x
437 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
438 *
439  * The upper word of each write-channel must be 0 for the following
440  * bit-shift and bit-or instructions to work. Note that this
441 * relies on the undocumented hardware behavior mentioned above.
442 */
443 tmp_dst.writemask = WRITEMASK_XY;
444 emit(F32TO16(tmp_dst, src0));
445
446 /* Give the write-channels of dst the form:
447 * 0xhhhh0000
448 */
449 tmp_src.swizzle = SWIZZLE_Y;
450 emit(SHL(dst, tmp_src, src_reg(16u)));
451
452 /* Finally, give the write-channels of dst the form of packHalf2x16's
453 * output:
454 * 0xhhhhllll
455 */
456 tmp_src.swizzle = SWIZZLE_X;
457 emit(OR(dst, src_reg(dst), tmp_src));
458 }
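/* Worked example (assuming the upper-word-clearing behavior noted above):
 * packing (1.0, -2.0), whose half-float encodings are 0x3C00 and 0xC000:
 *
 *    after F32TO16:  tmp.x = 0x00003C00, tmp.y = 0x0000C000
 *    after SHL:      dst   = 0xC0000000
 *    after OR:       dst   = 0xC0003C00
 *
 * which matches packHalf2x16(): v.x in bits 0..15, v.y in bits 16..31.
 */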
459
460 void
461 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
462 {
463 if (intel->gen < 7)
464 assert(!"ir_unop_unpack_half_2x16 should be lowered");
465
466 assert(dst.type == BRW_REGISTER_TYPE_F);
467 assert(src0.type == BRW_REGISTER_TYPE_UD);
468
469 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
470 *
471 * Because this instruction does not have a 16-bit floating-point type,
472 * the source data type must be Word (W). The destination type must be
473 * F (Float).
474 *
475 * To use W as the source data type, we must adjust horizontal strides,
476 * which is only possible in align1 mode. All my [chadv] attempts at
477 * emitting align1 instructions for unpackHalf2x16 failed to pass the
478 * Piglit tests, so I gave up.
479 *
480 * I've verified that, on gen7 hardware and the simulator, it is safe to
481 * emit f16to32 in align16 mode with UD as source data type.
482 */
483
484 dst_reg tmp_dst(this, glsl_type::uvec2_type);
485 src_reg tmp_src(tmp_dst);
486
487 tmp_dst.writemask = WRITEMASK_X;
488 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
489
490 tmp_dst.writemask = WRITEMASK_Y;
491 emit(SHR(tmp_dst, src0, src_reg(16u)));
492
493 dst.writemask = WRITEMASK_XY;
494 emit(F16TO32(dst, tmp_src));
495 }
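/* Worked example: for src0 = 0xC0003C00 (the packHalf2x16 result above), the
 * AND leaves 0x00003C00 in tmp.x, the SHR leaves 0x0000C000 in tmp.y, and
 * F16TO32 then yields dst.xy = (1.0, -2.0).
 */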
496
497 void
498 vec4_visitor::visit_instructions(const exec_list *list)
499 {
500 foreach_list(node, list) {
501 ir_instruction *ir = (ir_instruction *)node;
502
503 base_ir = ir;
504 ir->accept(this);
505 }
506 }
507
508
509 static int
510 type_size(const struct glsl_type *type)
511 {
512 unsigned int i;
513 int size;
514
515 switch (type->base_type) {
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_FLOAT:
519 case GLSL_TYPE_BOOL:
520 if (type->is_matrix()) {
521 return type->matrix_columns;
522 } else {
523 /* Regardless of size of vector, it gets a vec4. This is bad
524 * packing for things like floats, but otherwise arrays become a
525 * mess. Hopefully a later pass over the code can pack scalars
526 * down if appropriate.
527 */
528 return 1;
529 }
530 case GLSL_TYPE_ARRAY:
531 assert(type->length > 0);
532 return type_size(type->fields.array) * type->length;
533 case GLSL_TYPE_STRUCT:
534 size = 0;
535 for (i = 0; i < type->length; i++) {
536 size += type_size(type->fields.structure[i].type);
537 }
538 return size;
539 case GLSL_TYPE_SAMPLER:
540 /* Samplers take up one slot in UNIFORMS[], but they're baked in
541 * at link time.
542 */
543 return 1;
544 case GLSL_TYPE_VOID:
545 case GLSL_TYPE_ERROR:
546 case GLSL_TYPE_INTERFACE:
547 assert(0);
548 break;
549 }
550
551 return 0;
552 }
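/* For illustration of the vec4-slot accounting above, a hypothetical uniform
 *
 *    struct { vec3 a; mat3 b; float c[2]; }
 *
 * occupies 1 + 3 + 2 = 6 slots: the vec3 is padded to a full vec4, the mat3
 * takes one slot per column, and each array element gets its own slot.
 */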
553
554 int
555 vec4_visitor::virtual_grf_alloc(int size)
556 {
557 if (virtual_grf_array_size <= virtual_grf_count) {
558 if (virtual_grf_array_size == 0)
559 virtual_grf_array_size = 16;
560 else
561 virtual_grf_array_size *= 2;
562 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
563 virtual_grf_array_size);
564 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
565 virtual_grf_array_size);
566 }
567 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
568 virtual_grf_reg_count += size;
569 virtual_grf_sizes[virtual_grf_count] = size;
570 return virtual_grf_count++;
571 }
572
573 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
574 {
575 init();
576
577 this->file = GRF;
578 this->reg = v->virtual_grf_alloc(type_size(type));
579
580 if (type->is_array() || type->is_record()) {
581 this->swizzle = BRW_SWIZZLE_NOOP;
582 } else {
583 this->swizzle = swizzle_for_size(type->vector_elements);
584 }
585
586 this->type = brw_type_for_base_type(type);
587 }
588
589 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
590 {
591 init();
592
593 this->file = GRF;
594 this->reg = v->virtual_grf_alloc(type_size(type));
595
596 if (type->is_array() || type->is_record()) {
597 this->writemask = WRITEMASK_XYZW;
598 } else {
599 this->writemask = (1 << type->vector_elements) - 1;
600 }
601
602 this->type = brw_type_for_base_type(type);
603 }
604
605 /* Our support for uniforms is piggy-backed on the struct
606 * gl_fragment_program, because that's where the values actually
607 * get stored, rather than in some global gl_shader_program uniform
608 * store.
609 */
610 void
611 vec4_visitor::setup_uniform_values(ir_variable *ir)
612 {
613 int namelen = strlen(ir->name);
614
615 /* The data for our (non-builtin) uniforms is stored in a series of
616 * gl_uniform_driver_storage structs for each subcomponent that
617 * glGetUniformLocation() could name. We know it's been set up in the same
618 * order we'd walk the type, so walk the list of storage and find anything
619 * with our name, or the prefix of a component that starts with our name.
620 */
621 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
622 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
623
624 if (strncmp(ir->name, storage->name, namelen) != 0 ||
625 (storage->name[namelen] != 0 &&
626 storage->name[namelen] != '.' &&
627 storage->name[namelen] != '[')) {
628 continue;
629 }
630
631 gl_constant_value *components = storage->storage;
632 unsigned vector_count = (MAX2(storage->array_elements, 1) *
633 storage->type->matrix_columns);
634
635 for (unsigned s = 0; s < vector_count; s++) {
636 uniform_vector_size[uniforms] = storage->type->vector_elements;
637
638 int i;
639 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
640 prog_data->param[uniforms * 4 + i] = &components->f;
641 components++;
642 }
643 for (; i < 4; i++) {
644 static float zero = 0;
645 prog_data->param[uniforms * 4 + i] = &zero;
646 }
647
648 uniforms++;
649 }
650 }
651 }
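/* For illustration: a hypothetical "uniform vec3 color" produces a single
 * vector of size 3 here; param[0..2] point at the three floats in the
 * gl_uniform_storage backing store and param[3] is padded with the static
 * zero above.
 */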
652
653 void
654 vec4_visitor::setup_uniform_clipplane_values()
655 {
656 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
657
658 if (intel->gen < 6) {
659 /* Pre-Gen6, we compact clip planes. For example, if the user
660 * enables just clip planes 0, 1, and 3, we will enable clip planes
661 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
662 * plane 2. This simplifies the implementation of the Gen6 clip
663 * thread.
664 */
665 int compacted_clipplane_index = 0;
666 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
667 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
668 continue;
669
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++compacted_clipplane_index;
677 ++this->uniforms;
678 }
679 } else {
680 /* In Gen6 and later, we don't compact clip planes, because this
681 * simplifies the implementation of gl_ClipDistance.
682 */
683 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
684 this->uniform_vector_size[this->uniforms] = 4;
685 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
686 this->userplane[i].type = BRW_REGISTER_TYPE_F;
687 for (int j = 0; j < 4; ++j) {
688 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
689 }
690 ++this->uniforms;
691 }
692 }
693 }
694
695 /* Our support for builtin uniforms is even scarier than non-builtin.
696 * It sits on top of the PROG_STATE_VAR parameters that are
697 * automatically updated from GL context state.
698 */
699 void
700 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
701 {
702 const ir_state_slot *const slots = ir->state_slots;
703 assert(ir->state_slots != NULL);
704
705 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
706 /* This state reference has already been setup by ir_to_mesa,
707 * but we'll get the same index back here. We can reference
708 * ParameterValues directly, since unlike brw_fs.cpp, we never
709 * add new state references during compile.
710 */
711 int index = _mesa_add_state_reference(this->prog->Parameters,
712 (gl_state_index *)slots[i].tokens);
713 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
714
715 this->uniform_vector_size[this->uniforms] = 0;
716 /* Add each of the unique swizzled channels of the element.
717 * This will end up matching the size of the glsl_type of this field.
718 */
719 int last_swiz = -1;
720 for (unsigned int j = 0; j < 4; j++) {
721 int swiz = GET_SWZ(slots[i].swizzle, j);
722 last_swiz = swiz;
723
724 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
725 if (swiz <= last_swiz)
726 this->uniform_vector_size[this->uniforms]++;
727 }
728 this->uniforms++;
729 }
730 }
731
732 dst_reg *
733 vec4_visitor::variable_storage(ir_variable *var)
734 {
735 return (dst_reg *)hash_table_find(this->variable_ht, var);
736 }
737
738 void
739 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
740 {
741 ir_expression *expr = ir->as_expression();
742
743 *predicate = BRW_PREDICATE_NORMAL;
744
745 if (expr) {
746 src_reg op[2];
747 vec4_instruction *inst;
748
749 assert(expr->get_num_operands() <= 2);
750 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
751 expr->operands[i]->accept(this);
752 op[i] = this->result;
753
754 resolve_ud_negate(&op[i]);
755 }
756
757 switch (expr->operation) {
758 case ir_unop_logic_not:
759 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
760 inst->conditional_mod = BRW_CONDITIONAL_Z;
761 break;
762
763 case ir_binop_logic_xor:
764 inst = emit(XOR(dst_null_d(), op[0], op[1]));
765 inst->conditional_mod = BRW_CONDITIONAL_NZ;
766 break;
767
768 case ir_binop_logic_or:
769 inst = emit(OR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_and:
774 inst = emit(AND(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_unop_f2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_f(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_unop_i2b:
788 if (intel->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_binop_all_equal:
797 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
798 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
799 break;
800
801 case ir_binop_any_nequal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
803 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
804 break;
805
806 case ir_unop_any:
807 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_binop_greater:
812 case ir_binop_gequal:
813 case ir_binop_less:
814 case ir_binop_lequal:
815 case ir_binop_equal:
816 case ir_binop_nequal:
817 emit(CMP(dst_null_d(), op[0], op[1],
818 brw_conditional_for_comparison(expr->operation)));
819 break;
820
821 default:
822 assert(!"not reached");
823 break;
824 }
825 return;
826 }
827
828 ir->accept(this);
829
830 resolve_ud_negate(&this->result);
831
832 if (intel->gen >= 6) {
833 vec4_instruction *inst = emit(AND(dst_null_d(),
834 this->result, src_reg(1)));
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 } else {
837 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 }
840 }
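/* For illustration: a condition like "a < b" becomes a CMP with the matching
 * conditional mod and the caller predicates on BRW_PREDICATE_NORMAL, while
 * "any(notEqual(v, u))" becomes a CMP.nz whose caller predicates on
 * BRW_PREDICATE_ALIGN16_ANY4H so that any set channel takes the branch.
 */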
841
842 /**
843 * Emit a gen6 IF statement with the comparison folded into the IF
844 * instruction.
845 */
846 void
847 vec4_visitor::emit_if_gen6(ir_if *ir)
848 {
849 ir_expression *expr = ir->condition->as_expression();
850
851 if (expr) {
852 src_reg op[2];
853 dst_reg temp;
854
855 assert(expr->get_num_operands() <= 2);
856 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
857 expr->operands[i]->accept(this);
858 op[i] = this->result;
859 }
860
861 switch (expr->operation) {
862 case ir_unop_logic_not:
863 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
864 return;
865
866 case ir_binop_logic_xor:
867 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
868 return;
869
870 case ir_binop_logic_or:
871 temp = dst_reg(this, glsl_type::bool_type);
872 emit(OR(temp, op[0], op[1]));
873 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
874 return;
875
876 case ir_binop_logic_and:
877 temp = dst_reg(this, glsl_type::bool_type);
878 emit(AND(temp, op[0], op[1]));
879 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_unop_f2b:
883 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_i2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_binop_greater:
891 case ir_binop_gequal:
892 case ir_binop_less:
893 case ir_binop_lequal:
894 case ir_binop_equal:
895 case ir_binop_nequal:
896 emit(IF(op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 return;
899
900 case ir_binop_all_equal:
901 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
902 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
903 return;
904
905 case ir_binop_any_nequal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
907 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
908 return;
909
910 case ir_unop_any:
911 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 default:
916 assert(!"not reached");
917 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
918 return;
919 }
920 return;
921 }
922
923 ir->condition->accept(this);
924
925 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
926 }
927
928 static dst_reg
929 with_writemask(dst_reg const & r, int mask)
930 {
931 dst_reg result = r;
932 result.writemask = mask;
933 return result;
934 }
935
936 void
937 vec4_vs_visitor::emit_prolog()
938 {
939 dst_reg sign_recovery_shift;
940 dst_reg normalize_factor;
941 dst_reg es3_normalize_factor;
942
943 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
944 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
945 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
946 dst_reg reg(ATTR, i);
947 dst_reg reg_d = reg;
948 reg_d.type = BRW_REGISTER_TYPE_D;
949 dst_reg reg_ud = reg;
950 reg_ud.type = BRW_REGISTER_TYPE_UD;
951
952 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
953 * come in as floating point conversions of the integer values.
954 */
955 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
956 dst_reg dst = reg;
957 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
958 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
959 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
960 }
961
962 /* Do sign recovery for 2101010 formats if required. */
963 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
964 if (sign_recovery_shift.file == BAD_FILE) {
965 /* shift constant: <22,22,22,30> */
966 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
967 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
968 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
969 }
970
971 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
972 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
973 }
974
975 /* Apply BGRA swizzle if required. */
976 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
977 src_reg temp = src_reg(reg);
978 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
979 emit(MOV(reg, temp));
980 }
981
982 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
983 /* ES 3.0 has different rules for converting signed normalized
984 * fixed-point numbers than desktop GL.
985 */
986 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
987 /* According to equation 2.2 of the ES 3.0 specification,
988 * signed normalization conversion is done by:
989 *
990 * f = c / (2^(b-1)-1)
991 */
992 if (es3_normalize_factor.file == BAD_FILE) {
993 /* mul constant: 1 / (2^(b-1) - 1) */
994 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
995 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
996 src_reg(1.0f / ((1<<9) - 1))));
997 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
998 src_reg(1.0f / ((1<<1) - 1))));
999 }
1000
1001 dst_reg dst = reg;
1002 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1003 emit(MOV(dst, src_reg(reg_d)));
1004 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1005 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1006 } else {
1007 /* The following equations are from the OpenGL 3.2 specification:
1008 *
1009 * 2.1 unsigned normalization
1010 * f = c/(2^n-1)
1011 *
1012 * 2.2 signed normalization
1013 * f = (2c+1)/(2^n-1)
1014 *
1015 * Both of these share a common divisor, which is represented by
1016 * "normalize_factor" in the code below.
1017 */
1018 if (normalize_factor.file == BAD_FILE) {
1019 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1020 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1021 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1022 src_reg(1.0f / ((1<<10) - 1))));
1023 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1024 src_reg(1.0f / ((1<<2) - 1))));
1025 }
1026
1027 dst_reg dst = reg;
1028 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1029 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1030
1031 /* For signed normalization, we want the numerator to be 2c+1. */
1032 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1033 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1034 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1035 }
1036
1037 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1038 }
1039 }
1040
1041 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1042 dst_reg dst = reg;
1043 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1044 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1045 }
1046 }
1047 }
1048 }
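/* Worked example of the 2101010 sign recovery above: a 10-bit X/Y/Z component
 * holding the raw value 0x3FF is shifted left by 22 (giving 0xFFC00000) and
 * then arithmetically shifted right by 22, yielding -1 as a signed 32-bit
 * value. The W component uses a shift of 30 because it is only 2 bits wide.
 */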
1049
1050
1051 dst_reg *
1052 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1053 {
1054 /* VertexID is stored by the VF as the last vertex element, but
1055 * we don't represent it with a flag in inputs_read, so we call
1056 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1057 */
1058 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1059 vs_prog_data->uses_vertexid = true;
1060
1061 switch (ir->location) {
1062 case SYSTEM_VALUE_VERTEX_ID:
1063 reg->writemask = WRITEMASK_X;
1064 break;
1065 case SYSTEM_VALUE_INSTANCE_ID:
1066 reg->writemask = WRITEMASK_Y;
1067 break;
1068 default:
1069 assert(!"not reached");
1070 break;
1071 }
1072
1073 return reg;
1074 }
1075
1076
1077 void
1078 vec4_visitor::visit(ir_variable *ir)
1079 {
1080 dst_reg *reg = NULL;
1081
1082 if (variable_storage(ir))
1083 return;
1084
1085 switch (ir->mode) {
1086 case ir_var_shader_in:
1087 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1088 break;
1089
1090 case ir_var_shader_out:
1091 reg = new(mem_ctx) dst_reg(this, ir->type);
1092
1093 for (int i = 0; i < type_size(ir->type); i++) {
1094 output_reg[ir->location + i] = *reg;
1095 output_reg[ir->location + i].reg_offset = i;
1096 output_reg[ir->location + i].type =
1097 brw_type_for_base_type(ir->type->get_scalar_type());
1098 output_reg_annotation[ir->location + i] = ir->name;
1099 }
1100 break;
1101
1102 case ir_var_auto:
1103 case ir_var_temporary:
1104 reg = new(mem_ctx) dst_reg(this, ir->type);
1105 break;
1106
1107 case ir_var_uniform:
1108 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1109
1110 /* Thanks to the lower_ubo_reference pass, we will see only
1111 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1112 * variables, so no need for them to be in variable_ht.
1113 */
1114 if (ir->is_in_uniform_block())
1115 return;
1116
1117 /* Track how big the whole uniform variable is, in case we need to put a
1118 * copy of its data into pull constants for array access.
1119 */
1120 this->uniform_size[this->uniforms] = type_size(ir->type);
1121
1122 if (!strncmp(ir->name, "gl_", 3)) {
1123 setup_builtin_uniform_values(ir);
1124 } else {
1125 setup_uniform_values(ir);
1126 }
1127 break;
1128
1129 case ir_var_system_value:
1130 reg = make_reg_for_system_value(ir);
1131 break;
1132
1133 default:
1134 assert(!"not reached");
1135 }
1136
1137 reg->type = brw_type_for_base_type(ir->type);
1138 hash_table_insert(this->variable_ht, reg, ir);
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_loop *ir)
1143 {
1144 dst_reg counter;
1145
1146 /* We don't want debugging output to print the whole body of the
1147 * loop as the annotation.
1148 */
1149 this->base_ir = NULL;
1150
1151 if (ir->counter != NULL) {
1152 this->base_ir = ir->counter;
1153 ir->counter->accept(this);
1154 counter = *(variable_storage(ir->counter));
1155
1156 if (ir->from != NULL) {
1157 this->base_ir = ir->from;
1158 ir->from->accept(this);
1159
1160 emit(MOV(counter, this->result));
1161 }
1162 }
1163
1164 emit(BRW_OPCODE_DO);
1165
1166 if (ir->to) {
1167 this->base_ir = ir->to;
1168 ir->to->accept(this);
1169
1170 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1171 brw_conditional_for_comparison(ir->cmp)));
1172
1173 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 }
1176
1177 visit_instructions(&ir->body_instructions);
1178
1179
1180 if (ir->increment) {
1181 this->base_ir = ir->increment;
1182 ir->increment->accept(this);
1183 emit(ADD(counter, src_reg(counter), this->result));
1184 }
1185
1186 emit(BRW_OPCODE_WHILE);
1187 }
1188
1189 void
1190 vec4_visitor::visit(ir_loop_jump *ir)
1191 {
1192 switch (ir->mode) {
1193 case ir_loop_jump::jump_break:
1194 emit(BRW_OPCODE_BREAK);
1195 break;
1196 case ir_loop_jump::jump_continue:
1197 emit(BRW_OPCODE_CONTINUE);
1198 break;
1199 }
1200 }
1201
1202
1203 void
1204 vec4_visitor::visit(ir_function_signature *ir)
1205 {
1206 assert(0);
1207 (void)ir;
1208 }
1209
1210 void
1211 vec4_visitor::visit(ir_function *ir)
1212 {
1213 /* Ignore function bodies other than main() -- we shouldn't see calls to
1214 * them since they should all be inlined.
1215 */
1216 if (strcmp(ir->name, "main") == 0) {
1217 const ir_function_signature *sig;
1218 exec_list empty;
1219
1220 sig = ir->matching_signature(&empty);
1221
1222 assert(sig);
1223
1224 visit_instructions(&sig->body);
1225 }
1226 }
1227
1228 bool
1229 vec4_visitor::try_emit_sat(ir_expression *ir)
1230 {
1231 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1232 if (!sat_src)
1233 return false;
1234
1235 sat_src->accept(this);
1236 src_reg src = this->result;
1237
1238 this->result = src_reg(this, ir->type);
1239 vec4_instruction *inst;
1240 inst = emit(MOV(dst_reg(this->result), src));
1241 inst->saturate = true;
1242
1243 return true;
1244 }
1245
1246 void
1247 vec4_visitor::emit_bool_comparison(unsigned int op,
1248 dst_reg dst, src_reg src0, src_reg src1)
1249 {
1250 /* original gen4 does destination conversion before comparison. */
1251 if (intel->gen < 5)
1252 dst.type = src0.type;
1253
1254 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1255
1256 dst.type = BRW_REGISTER_TYPE_D;
1257 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1258 }
1259
1260 void
1261 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (intel->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
1276
1277 void
1278 vec4_visitor::visit(ir_expression *ir)
1279 {
1280 unsigned int operand;
1281 src_reg op[Elements(ir->operands)];
1282 src_reg result_src;
1283 dst_reg result_dst;
1284 vec4_instruction *inst;
1285
1286 if (try_emit_sat(ir))
1287 return;
1288
1289 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1290 this->result.file = BAD_FILE;
1291 ir->operands[operand]->accept(this);
1292 if (this->result.file == BAD_FILE) {
1293 printf("Failed to get tree for expression operand:\n");
1294 ir->operands[operand]->print();
1295 exit(1);
1296 }
1297 op[operand] = this->result;
1298
1299 /* Matrix expression operands should have been broken down to vector
1300 * operations already.
1301 */
1302 assert(!ir->operands[operand]->type->is_matrix());
1303 }
1304
1305 int vector_elements = ir->operands[0]->type->vector_elements;
1306 if (ir->operands[1]) {
1307 vector_elements = MAX2(vector_elements,
1308 ir->operands[1]->type->vector_elements);
1309 }
1310
1311 this->result.file = BAD_FILE;
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 result_src = src_reg(this, ir->type);
1317 /* convenience for the emit functions below. */
1318 result_dst = dst_reg(result_src);
1319 /* If nothing special happens, this is the result. */
1320 this->result = result_src;
1321 /* Limit writes to the channels that will be used by result_src later.
1322 * This does limit this temp's use as a temporary for multi-instruction
1323 * sequences.
1324 */
1325 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1326
1327 switch (ir->operation) {
1328 case ir_unop_logic_not:
1329 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1330       * the ones' complement of the whole register, not just bit 0.
1331 */
1332 emit(XOR(result_dst, op[0], src_reg(1)));
1333 break;
1334 case ir_unop_neg:
1335 op[0].negate = !op[0].negate;
1336 this->result = op[0];
1337 break;
1338 case ir_unop_abs:
1339 op[0].abs = true;
1340 op[0].negate = false;
1341 this->result = op[0];
1342 break;
1343
1344 case ir_unop_sign:
1345 emit(MOV(result_dst, src_reg(0.0f)));
1346
1347 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1348 inst = emit(MOV(result_dst, src_reg(1.0f)));
1349 inst->predicate = BRW_PREDICATE_NORMAL;
1350
1351 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1352 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 break;
1356
1357 case ir_unop_rcp:
1358 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1359 break;
1360
1361 case ir_unop_exp2:
1362 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1363 break;
1364 case ir_unop_log2:
1365 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1366 break;
1367 case ir_unop_exp:
1368 case ir_unop_log:
1369 assert(!"not reached: should be handled by ir_explog_to_explog2");
1370 break;
1371 case ir_unop_sin:
1372 case ir_unop_sin_reduced:
1373 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1374 break;
1375 case ir_unop_cos:
1376 case ir_unop_cos_reduced:
1377 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1378 break;
1379
1380 case ir_unop_dFdx:
1381 case ir_unop_dFdy:
1382 assert(!"derivatives not valid in vertex shader");
1383 break;
1384
1385 case ir_unop_noise:
1386 assert(!"not reached: should be handled by lower_noise");
1387 break;
1388
1389 case ir_binop_add:
1390 emit(ADD(result_dst, op[0], op[1]));
1391 break;
1392 case ir_binop_sub:
1393 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1394 break;
1395
1396 case ir_binop_mul:
1397 if (ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits
1399 * of one of the operands (src0 on gen6, src1 on gen7). The
1400 * MACH accumulates in the contribution of the upper 16 bits
1401 * of that operand.
1402 *
1403 * FINISHME: Emit just the MUL if we know an operand is small
1404 * enough.
1405 */
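         /* Roughly: writing b = lo16(b) + (hi16(b) << 16), the 32-bit product
          * is a*lo16(b) + ((a*hi16(b)) << 16) (mod 2^32). The MUL forms the
          * first term in the accumulator, MACH folds in the second, and the
          * MOV from the accumulator below retrieves the low 32 bits.
          */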
1406 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1407
1408 emit(MUL(acc, op[0], op[1]));
1409 emit(MACH(dst_null_d(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 } else {
1412 emit(MUL(result_dst, op[0], op[1]));
1413 }
1414 break;
1415 case ir_binop_div:
1416 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1417 assert(ir->type->is_integer());
1418 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1419 break;
1420 case ir_binop_mod:
1421 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1422 assert(ir->type->is_integer());
1423 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1424 break;
1425
1426 case ir_binop_less:
1427 case ir_binop_greater:
1428 case ir_binop_lequal:
1429 case ir_binop_gequal:
1430 case ir_binop_equal:
1431 case ir_binop_nequal: {
1432 emit(CMP(result_dst, op[0], op[1],
1433 brw_conditional_for_comparison(ir->operation)));
1434 emit(AND(result_dst, result_src, src_reg(0x1)));
1435 break;
1436 }
1437
1438 case ir_binop_all_equal:
1439 /* "==" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1443 emit(MOV(result_dst, src_reg(0)));
1444 inst = emit(MOV(result_dst, src_reg(1)));
1445 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1446 } else {
1447 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1448 emit(AND(result_dst, result_src, src_reg(0x1)));
1449 }
1450 break;
1451 case ir_binop_any_nequal:
1452 /* "!=" operator producing a scalar boolean. */
1453 if (ir->operands[0]->type->is_vector() ||
1454 ir->operands[1]->type->is_vector()) {
1455 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1456
1457 emit(MOV(result_dst, src_reg(0)));
1458 inst = emit(MOV(result_dst, src_reg(1)));
1459 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1460 } else {
1461 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result_src, src_reg(0x1)));
1463 }
1464 break;
1465
1466 case ir_unop_any:
1467 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1468 emit(MOV(result_dst, src_reg(0)));
1469
1470 inst = emit(MOV(result_dst, src_reg(1)));
1471 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1472 break;
1473
1474 case ir_binop_logic_xor:
1475 emit(XOR(result_dst, op[0], op[1]));
1476 break;
1477
1478 case ir_binop_logic_or:
1479 emit(OR(result_dst, op[0], op[1]));
1480 break;
1481
1482 case ir_binop_logic_and:
1483 emit(AND(result_dst, op[0], op[1]));
1484 break;
1485
1486 case ir_binop_dot:
1487 assert(ir->operands[0]->type->is_vector());
1488 assert(ir->operands[0]->type == ir->operands[1]->type);
1489 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1490 break;
1491
1492 case ir_unop_sqrt:
1493 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1494 break;
1495 case ir_unop_rsq:
1496 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1497 break;
1498
1499 case ir_unop_bitcast_i2f:
1500 case ir_unop_bitcast_u2f:
1501 this->result = op[0];
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 break;
1504
1505 case ir_unop_bitcast_f2i:
1506 this->result = op[0];
1507 this->result.type = BRW_REGISTER_TYPE_D;
1508 break;
1509
1510 case ir_unop_bitcast_f2u:
1511 this->result = op[0];
1512 this->result.type = BRW_REGISTER_TYPE_UD;
1513 break;
1514
1515 case ir_unop_i2f:
1516 case ir_unop_i2u:
1517 case ir_unop_u2i:
1518 case ir_unop_u2f:
1519 case ir_unop_b2f:
1520 case ir_unop_b2i:
1521 case ir_unop_f2i:
1522 case ir_unop_f2u:
1523 emit(MOV(result_dst, op[0]));
1524 break;
1525 case ir_unop_f2b:
1526 case ir_unop_i2b: {
1527 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1528 emit(AND(result_dst, result_src, src_reg(1)));
1529 break;
1530 }
1531
1532 case ir_unop_trunc:
1533 emit(RNDZ(result_dst, op[0]));
1534 break;
1535 case ir_unop_ceil:
1536 op[0].negate = !op[0].negate;
1537 inst = emit(RNDD(result_dst, op[0]));
1538 this->result.negate = true;
1539 break;
1540 case ir_unop_floor:
1541 inst = emit(RNDD(result_dst, op[0]));
1542 break;
1543 case ir_unop_fract:
1544 inst = emit(FRC(result_dst, op[0]));
1545 break;
1546 case ir_unop_round_even:
1547 emit(RNDE(result_dst, op[0]));
1548 break;
1549
1550 case ir_binop_min:
1551 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1552 break;
1553 case ir_binop_max:
1554 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1555 break;
1556
1557 case ir_binop_pow:
1558 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1559 break;
1560
1561 case ir_unop_bit_not:
1562 inst = emit(NOT(result_dst, op[0]));
1563 break;
1564 case ir_binop_bit_and:
1565 inst = emit(AND(result_dst, op[0], op[1]));
1566 break;
1567 case ir_binop_bit_xor:
1568 inst = emit(XOR(result_dst, op[0], op[1]));
1569 break;
1570 case ir_binop_bit_or:
1571 inst = emit(OR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_lshift:
1575 inst = emit(SHL(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_rshift:
1579 if (ir->type->base_type == GLSL_TYPE_INT)
1580 inst = emit(ASR(result_dst, op[0], op[1]));
1581 else
1582 inst = emit(SHR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_ubo_load: {
1586 ir_constant *uniform_block = ir->operands[0]->as_constant();
1587 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1588 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1589 src_reg offset = op[1];
1590
1591 /* Now, load the vector from that offset. */
1592 assert(ir->type->is_vector() || ir->type->is_scalar());
1593
1594 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1595 packed_consts.type = result.type;
1596 src_reg surf_index =
1597 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1598 if (const_offset_ir) {
1599 offset = src_reg(const_offset / 16);
1600 } else {
1601 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1602 }
1603
1604 vec4_instruction *pull =
1605 emit(new(mem_ctx) vec4_instruction(this,
1606 VS_OPCODE_PULL_CONSTANT_LOAD,
1607 dst_reg(packed_consts),
1608 surf_index,
1609 offset));
1610 pull->base_mrf = 14;
1611 pull->mlen = 1;
1612
1613 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1614 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1615 const_offset % 16 / 4,
1616 const_offset % 16 / 4,
1617 const_offset % 16 / 4);
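      /* For example, a scalar UBO load at constant byte offset 20 reads
       * vec4 #1 (= 20 / 16) from the buffer, and the swizzle adjustment above
       * selects dword (20 % 16) / 4 = 1, i.e. the .y component of the pulled
       * vec4.
       */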
1618
1619 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1620 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1621 emit(CMP(result_dst, packed_consts, src_reg(0u),
1622 BRW_CONDITIONAL_NZ));
1623 emit(AND(result_dst, result, src_reg(0x1)));
1624 } else {
1625 emit(MOV(result_dst, packed_consts));
1626 }
1627 break;
1628 }
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 /* Note that the instruction's argument order is reversed from GLSL
1635 * and the IR.
1636 */
1637 emit(LRP(result_dst, op[2], op[1], op[0]));
1638 break;
1639
1640 case ir_quadop_vector:
1641 assert(!"not reached: should be handled by lower_quadop_vector");
1642 break;
1643
1644 case ir_unop_pack_half_2x16:
1645 emit_pack_half_2x16(result_dst, op[0]);
1646 break;
1647 case ir_unop_unpack_half_2x16:
1648 emit_unpack_half_2x16(result_dst, op[0]);
1649 break;
1650 case ir_unop_pack_snorm_2x16:
1651 case ir_unop_pack_snorm_4x8:
1652 case ir_unop_pack_unorm_2x16:
1653 case ir_unop_pack_unorm_4x8:
1654 case ir_unop_unpack_snorm_2x16:
1655 case ir_unop_unpack_snorm_4x8:
1656 case ir_unop_unpack_unorm_2x16:
1657 case ir_unop_unpack_unorm_4x8:
1658 assert(!"not reached: should be handled by lower_packing_builtins");
1659 break;
1660 case ir_unop_unpack_half_2x16_split_x:
1661 case ir_unop_unpack_half_2x16_split_y:
1662 case ir_binop_pack_half_2x16_split:
1663 assert(!"not reached: should not occur in vertex shader");
1664 break;
1665 }
1666 }
1667
1668
1669 void
1670 vec4_visitor::visit(ir_swizzle *ir)
1671 {
1672 src_reg src;
1673 int i = 0;
1674 int swizzle[4];
1675
1676 /* Note that this is only swizzles in expressions, not those on the left
1677 * hand side of an assignment, which do write masking. See ir_assignment
1678 * for that.
1679 */
1680
1681 ir->val->accept(this);
1682 src = this->result;
1683 assert(src.file != BAD_FILE);
1684
1685 for (i = 0; i < ir->type->vector_elements; i++) {
1686 switch (i) {
1687 case 0:
1688 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1689 break;
1690 case 1:
1691 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1692 break;
1693 case 2:
1694 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1695 break;
1696 case 3:
1697 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1698 break;
1699 }
1700 }
1701 for (; i < 4; i++) {
1702 /* Replicate the last channel out. */
1703 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1704 }
1705
1706 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1707
1708 this->result = src;
1709 }
1710
1711 void
1712 vec4_visitor::visit(ir_dereference_variable *ir)
1713 {
1714 const struct glsl_type *type = ir->type;
1715 dst_reg *reg = variable_storage(ir->var);
1716
1717 if (!reg) {
1718 fail("Failed to find variable storage for %s\n", ir->var->name);
1719 this->result = src_reg(brw_null_reg());
1720 return;
1721 }
1722
1723 this->result = src_reg(*reg);
1724
1725 /* System values get their swizzle from the dst_reg writemask */
1726 if (ir->var->mode == ir_var_system_value)
1727 return;
1728
1729 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1730 this->result.swizzle = swizzle_for_size(type->vector_elements);
1731 }
1732
1733
1734 int
1735 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1736 {
1737 /* Under normal circumstances array elements are stored consecutively, so
1738 * the stride is equal to the size of the array element.
1739 */
1740 return type_size(ir->type);
1741 }
1742
1743
1744 void
1745 vec4_visitor::visit(ir_dereference_array *ir)
1746 {
1747 ir_constant *constant_index;
1748 src_reg src;
1749 int array_stride = compute_array_stride(ir);
1750
1751 constant_index = ir->array_index->constant_expression_value();
1752
1753 ir->array->accept(this);
1754 src = this->result;
1755
1756 if (constant_index) {
1757 src.reg_offset += constant_index->value.i[0] * array_stride;
1758 } else {
1759 /* Variable index array dereference. It eats the "vec4" of the
1760 * base of the array and an index that offsets the Mesa register
1761 * index.
1762 */
1763 ir->array_index->accept(this);
1764
1765 src_reg index_reg;
1766
1767 if (array_stride == 1) {
1768 index_reg = this->result;
1769 } else {
1770 index_reg = src_reg(this, glsl_type::int_type);
1771
1772 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1773 }
1774
1775 if (src.reladdr) {
1776 src_reg temp = src_reg(this, glsl_type::int_type);
1777
1778 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1779
1780 index_reg = temp;
1781 }
1782
1783 src.reladdr = ralloc(mem_ctx, src_reg);
1784 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1785 }
1786
1787 /* If the type is smaller than a vec4, replicate the last channel out. */
1788 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1789 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1790 else
1791 src.swizzle = BRW_SWIZZLE_NOOP;
1792 src.type = brw_type_for_base_type(ir->type);
1793
1794 this->result = src;
1795 }
1796
1797 void
1798 vec4_visitor::visit(ir_dereference_record *ir)
1799 {
1800 unsigned int i;
1801 const glsl_type *struct_type = ir->record->type;
1802 int offset = 0;
1803
1804 ir->record->accept(this);
1805
1806 for (i = 0; i < struct_type->length; i++) {
1807 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1808 break;
1809 offset += type_size(struct_type->fields.structure[i].type);
1810 }
1811
1812 /* If the type is smaller than a vec4, replicate the last channel out. */
1813 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1814 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1815 else
1816 this->result.swizzle = BRW_SWIZZLE_NOOP;
1817 this->result.type = brw_type_for_base_type(ir->type);
1818
1819 this->result.reg_offset += offset;
1820 }
1821
1822 /**
1823 * We want to be careful in assignment setup to hit the actual storage
1824 * instead of potentially using a temporary like we might with the
1825 * ir_dereference handler.
1826 */
1827 static dst_reg
1828 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1829 {
1830 /* The LHS must be a dereference. If the LHS is a variable indexed array
1831  * access of a vector, it must be separated into a series of conditional moves
1832 * before reaching this point (see ir_vec_index_to_cond_assign).
1833 */
1834 assert(ir->as_dereference());
1835 ir_dereference_array *deref_array = ir->as_dereference_array();
1836 if (deref_array) {
1837 assert(!deref_array->array->type->is_vector());
1838 }
1839
1840 /* Use the rvalue deref handler for the most part. We'll ignore
1841 * swizzles in it and write swizzles using writemask, though.
1842 */
1843 ir->accept(v);
1844 return dst_reg(v->result);
1845 }
1846
1847 void
1848 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1849 const struct glsl_type *type, uint32_t predicate)
1850 {
1851 if (type->base_type == GLSL_TYPE_STRUCT) {
1852 for (unsigned int i = 0; i < type->length; i++) {
1853 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1854 }
1855 return;
1856 }
1857
1858 if (type->is_array()) {
1859 for (unsigned int i = 0; i < type->length; i++) {
1860 emit_block_move(dst, src, type->fields.array, predicate);
1861 }
1862 return;
1863 }
1864
1865 if (type->is_matrix()) {
1866 const struct glsl_type *vec_type;
1867
1868 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1869 type->vector_elements, 1);
1870
1871 for (int i = 0; i < type->matrix_columns; i++) {
1872 emit_block_move(dst, src, vec_type, predicate);
1873 }
1874 return;
1875 }
1876
1877 assert(type->is_scalar() || type->is_vector());
1878
1879 dst->type = brw_type_for_base_type(type);
1880 src->type = dst->type;
1881
1882 dst->writemask = (1 << type->vector_elements) - 1;
1883
1884 src->swizzle = swizzle_for_size(type->vector_elements);
1885
1886 vec4_instruction *inst = emit(MOV(*dst, *src));
1887 inst->predicate = predicate;
1888
1889 dst->reg_offset++;
1890 src->reg_offset++;
1891 }
1892
1893
1894 /* If the RHS processing resulted in an instruction generating a
1895 * temporary value, and it would be easy to rewrite the instruction to
1896 * generate its result right into the LHS instead, do so. This ends
1897 * up reliably removing instructions where it can be tricky to do so
1898 * later without real UD chain information.
1899 */
1900 bool
1901 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1902 dst_reg dst,
1903 src_reg src,
1904 vec4_instruction *pre_rhs_inst,
1905 vec4_instruction *last_rhs_inst)
1906 {
1907 /* This could be supported, but it would take more smarts. */
1908 if (ir->condition)
1909 return false;
1910
1911 if (pre_rhs_inst == last_rhs_inst)
1912 return false; /* No instructions generated to work with. */
1913
1914 /* Make sure the last instruction generated our source reg. */
1915 if (src.file != GRF ||
1916 src.file != last_rhs_inst->dst.file ||
1917 src.reg != last_rhs_inst->dst.reg ||
1918 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1919 src.reladdr ||
1920 src.abs ||
1921 src.negate ||
1922 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1923 return false;
1924
1925 /* Check that that last instruction fully initialized the channels
1926 * we want to use, in the order we want to use them. We could
1927 * potentially reswizzle the operands of many instructions so that
1928 * we could handle out of order channels, but don't yet.
1929 */
1930
1931 for (unsigned i = 0; i < 4; i++) {
1932 if (dst.writemask & (1 << i)) {
1933 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1934 return false;
1935
1936 if (BRW_GET_SWZ(src.swizzle, i) != i)
1937 return false;
1938 }
1939 }
1940
1941 /* Success! Rewrite the instruction. */
1942 last_rhs_inst->dst.file = dst.file;
1943 last_rhs_inst->dst.reg = dst.reg;
1944 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1945 last_rhs_inst->dst.reladdr = dst.reladdr;
1946 last_rhs_inst->dst.writemask &= dst.writemask;
1947
1948 return true;
1949 }
1950
1951 void
1952 vec4_visitor::visit(ir_assignment *ir)
1953 {
1954 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1955 uint32_t predicate = BRW_PREDICATE_NONE;
1956
1957 if (!ir->lhs->type->is_scalar() &&
1958 !ir->lhs->type->is_vector()) {
1959 ir->rhs->accept(this);
1960 src_reg src = this->result;
1961
1962 if (ir->condition) {
1963 emit_bool_to_cond_code(ir->condition, &predicate);
1964 }
1965
1966 /* emit_block_move doesn't account for swizzles in the source register.
1967 * This should be ok, since the source register is a structure or an
1968 * array, and those can't be swizzled. But double-check to be sure.
1969 */
1970 assert(src.swizzle ==
1971 (ir->rhs->type->is_matrix()
1972 ? swizzle_for_size(ir->rhs->type->vector_elements)
1973 : BRW_SWIZZLE_NOOP));
1974
1975 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1976 return;
1977 }
1978
1979 /* Now we're down to just a scalar/vector with writemasks. */
1980 int i;
1981
1982 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1983 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1984
1985 ir->rhs->accept(this);
1986
1987 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1988
1989 src_reg src = this->result;
1990
1991 int swizzles[4];
1992 int first_enabled_chan = 0;
1993 int src_chan = 0;
1994
1995 assert(ir->lhs->type->is_vector() ||
1996 ir->lhs->type->is_scalar());
1997 dst.writemask = ir->write_mask;
1998
1999 for (int i = 0; i < 4; i++) {
2000 if (dst.writemask & (1 << i)) {
2001 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2002 break;
2003 }
2004 }
2005
2006 /* Swizzle a small RHS vector into the channels being written.
2007 *
2008 * GLSL IR treats write_mask as dictating how many channels are
2009 * present on the RHS, while in our instructions we need those
2010 * channels to appear in the slots of the vec4 they're written to.
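* For example, for "v.zw = u" with a vec2 u, the RHS arrives with swizzle
* XYYY and we remap it to YYXY, so u.x lands in the Z slot and u.y in the
* W slot of the write below.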
2011 */
2012 for (int i = 0; i < 4; i++) {
2013 if (dst.writemask & (1 << i))
2014 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2015 else
2016 swizzles[i] = first_enabled_chan;
2017 }
2018 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2019 swizzles[2], swizzles[3]);
2020
2021 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2022 return;
2023 }
2024
2025 if (ir->condition) {
2026 emit_bool_to_cond_code(ir->condition, &predicate);
2027 }
2028
2029 for (i = 0; i < type_size(ir->lhs->type); i++) {
2030 vec4_instruction *inst = emit(MOV(dst, src));
2031 inst->predicate = predicate;
2032
2033 dst.reg_offset++;
2034 src.reg_offset++;
2035 }
2036 }
2037
2038 void
2039 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2040 {
2041 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2042 foreach_list(node, &ir->components) {
2043 ir_constant *field_value = (ir_constant *)node;
2044
2045 emit_constant_values(dst, field_value);
2046 }
2047 return;
2048 }
2049
2050 if (ir->type->is_array()) {
2051 for (unsigned int i = 0; i < ir->type->length; i++) {
2052 emit_constant_values(dst, ir->array_elements[i]);
2053 }
2054 return;
2055 }
2056
2057 if (ir->type->is_matrix()) {
2058 for (int i = 0; i < ir->type->matrix_columns; i++) {
2059 float *vec = &ir->value.f[i * ir->type->vector_elements];
2060
2061 for (int j = 0; j < ir->type->vector_elements; j++) {
2062 dst->writemask = 1 << j;
2063 dst->type = BRW_REGISTER_TYPE_F;
2064
2065 emit(MOV(*dst, src_reg(vec[j])));
2066 }
2067 dst->reg_offset++;
2068 }
2069 return;
2070 }
2071
2072 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2073
2074 for (int i = 0; i < ir->type->vector_elements; i++) {
2075 if (!(remaining_writemask & (1 << i)))
2076 continue;
2077
2078 dst->writemask = 1 << i;
2079 dst->type = brw_type_for_base_type(ir->type);
2080
2081 /* Find other components that match the one we're about to
2082 * write. Emits fewer instructions for things like vec4(0.5,
2083 * 1.5, 1.5, 1.5).
2084 */
2085 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2086 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2087 if (ir->value.b[i] == ir->value.b[j])
2088 dst->writemask |= (1 << j);
2089 } else {
2090 /* u, i, and f storage all line up, so no need for a
2091 * switch case for comparing each type.
2092 */
2093 if (ir->value.u[i] == ir->value.u[j])
2094 dst->writemask |= (1 << j);
2095 }
2096 }
2097
2098 switch (ir->type->base_type) {
2099 case GLSL_TYPE_FLOAT:
2100 emit(MOV(*dst, src_reg(ir->value.f[i])));
2101 break;
2102 case GLSL_TYPE_INT:
2103 emit(MOV(*dst, src_reg(ir->value.i[i])));
2104 break;
2105 case GLSL_TYPE_UINT:
2106 emit(MOV(*dst, src_reg(ir->value.u[i])));
2107 break;
2108 case GLSL_TYPE_BOOL:
2109 emit(MOV(*dst, src_reg(ir->value.b[i])));
2110 break;
2111 default:
2112 assert(!"Non-float/uint/int/bool constant");
2113 break;
2114 }
2115
2116 remaining_writemask &= ~dst->writemask;
2117 }
2118 dst->reg_offset++;
2119 }
2120
2121 void
2122 vec4_visitor::visit(ir_constant *ir)
2123 {
2124 dst_reg dst = dst_reg(this, ir->type);
2125 this->result = src_reg(dst);
2126
2127 emit_constant_values(&dst, ir);
2128 }
2129
2130 void
2131 vec4_visitor::visit(ir_call *ir)
2132 {
2133 assert(!"not reached");
2134 }
2135
2136 void
2137 vec4_visitor::visit(ir_texture *ir)
2138 {
2139 int sampler =
2140 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2141
2142 /* Should be lowered by do_lower_texture_projection */
2143 assert(!ir->projector);
2144
2145 /* Generate code to compute all the subexpression trees. This has to be
2146 * done before loading any values into MRFs for the sampler message since
2147 * generating these values may involve SEND messages that need the MRFs.
2148 */
2149 src_reg coordinate;
2150 if (ir->coordinate) {
2151 ir->coordinate->accept(this);
2152 coordinate = this->result;
2153 }
2154
2155 src_reg shadow_comparitor;
2156 if (ir->shadow_comparitor) {
2157 ir->shadow_comparitor->accept(this);
2158 shadow_comparitor = this->result;
2159 }
2160
2161 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2162 src_reg lod, dPdx, dPdy, sample_index;
2163 switch (ir->op) {
2164 case ir_tex:
2165 lod = src_reg(0.0f);
2166 lod_type = glsl_type::float_type;
2167 break;
2168 case ir_txf:
2169 case ir_txl:
2170 case ir_txs:
2171 ir->lod_info.lod->accept(this);
2172 lod = this->result;
2173 lod_type = ir->lod_info.lod->type;
2174 break;
2175 case ir_txf_ms:
2176 ir->lod_info.sample_index->accept(this);
2177 sample_index = this->result;
2178 sample_index_type = ir->lod_info.sample_index->type;
2179 break;
2180 case ir_txd:
2181 ir->lod_info.grad.dPdx->accept(this);
2182 dPdx = this->result;
2183
2184 ir->lod_info.grad.dPdy->accept(this);
2185 dPdy = this->result;
2186
2187 lod_type = ir->lod_info.grad.dPdx->type;
2188 break;
2189 case ir_txb:
2190 case ir_lod:
2191 break;
2192 }
2193
2194 vec4_instruction *inst = NULL;
2195 switch (ir->op) {
2196 case ir_tex:
2197 case ir_txl:
2198 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2199 break;
2200 case ir_txd:
2201 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2202 break;
2203 case ir_txf:
2204 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2205 break;
2206 case ir_txf_ms:
2207 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2208 break;
2209 case ir_txs:
2210 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2211 break;
2212 case ir_txb:
2213 assert(!"TXB is not valid for vertex shaders.");
2214 break;
2215 case ir_lod:
2216 assert(!"LOD is not valid for vertex shaders.");
2217 break;
2218 }
2219
2220 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2221
2222 /* Texel offsets go in the message header; Gen4 also requires headers. */
2223 inst->header_present = use_texture_offset || intel->gen < 5;
2224 inst->base_mrf = 2;
2225 inst->mlen = inst->header_present + 1; /* always at least one */
2226 inst->sampler = sampler;
2227 inst->dst = dst_reg(this, ir->type);
2228 inst->dst.writemask = WRITEMASK_XYZW;
2229 inst->shadow_compare = ir->shadow_comparitor != NULL;
2230
2231 if (use_texture_offset)
2232 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2233
2234 /* MRF for the first parameter */
2235 int param_base = inst->base_mrf + inst->header_present;
2236
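/* Each message parameter occupies one MRF: the texel coordinate goes in the
 * first, followed (depending on the opcode and generation) by the shadow
 * comparison value, LOD or sample index, and the gradients in later MRFs.
 */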
2237 if (ir->op == ir_txs) {
2238 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2239 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2240 } else {
2241 int i, coord_mask = 0, zero_mask = 0;
2242 /* Load the coordinate */
2243 /* FINISHME: gl_clamp_mask and saturate */
2244 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2245 coord_mask |= (1 << i);
2246 for (; i < 4; i++)
2247 zero_mask |= (1 << i);
2248
2249 if (ir->offset && ir->op == ir_txf) {
2250 /* It appears that the ld instruction used for txf does its
2251 * address bounds check before adding in the offset. To work
2252 * around this, just add the integer offset to the integer
2253 * texel coordinate, and don't put the offset in the header.
2254 */
2255 ir_constant *offset = ir->offset->as_constant();
2256 assert(offset);
2257
2258 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2259 src_reg src = coordinate;
2260 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2261 BRW_GET_SWZ(src.swizzle, j),
2262 BRW_GET_SWZ(src.swizzle, j),
2263 BRW_GET_SWZ(src.swizzle, j));
2264 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2265 src, offset->value.i[j]));
2266 }
2267 } else {
2268 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2269 coordinate));
2270 }
2271 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2272 src_reg(0)));
2273 /* Load the shadow comparitor */
2274 if (ir->shadow_comparitor && ir->op != ir_txd) {
2275 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2276 WRITEMASK_X),
2277 shadow_comparitor));
2278 inst->mlen++;
2279 }
2280
2281 /* Load the LOD info */
2282 if (ir->op == ir_tex || ir->op == ir_txl) {
2283 int mrf, writemask;
2284 if (intel->gen >= 5) {
2285 mrf = param_base + 1;
2286 if (ir->shadow_comparitor) {
2287 writemask = WRITEMASK_Y;
2288 /* mlen already incremented */
2289 } else {
2290 writemask = WRITEMASK_X;
2291 inst->mlen++;
2292 }
2293 } else /* intel->gen == 4 */ {
2294 mrf = param_base;
2295 writemask = WRITEMASK_Z;
2296 }
2297 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2298 } else if (ir->op == ir_txf) {
2299 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2300 } else if (ir->op == ir_txf_ms) {
2301 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2302 sample_index));
2303 inst->mlen++;
2304
2305 /* On Gen7, there is an additional MCS parameter here after SI,
2306 * but we don't bother to emit it since it's always zero. If
2307 * we start supporting texturing from CMS surfaces, this will have
2308 * to change.
2309 */
2310 } else if (ir->op == ir_txd) {
2311 const glsl_type *type = lod_type;
2312
2313 if (intel->gen >= 5) {
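/* On Gen5+ the gradients are packed two components per MRF:
 * (dPdx.x, dPdy.x, dPdx.y, dPdy.y) in the first gradient MRF, with
 * dPdx.z and dPdy.z (plus the shadow comparison value, if any) in the next.
 */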
2314 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2315 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2316 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2317 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2318 inst->mlen++;
2319
2320 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2321 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2322 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2323 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2324 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2325 inst->mlen++;
2326
2327 if (ir->shadow_comparitor) {
2328 emit(MOV(dst_reg(MRF, param_base + 2,
2329 ir->shadow_comparitor->type, WRITEMASK_Z),
2330 shadow_comparitor));
2331 }
2332 }
2333 } else /* intel->gen == 4 */ {
2334 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2335 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2336 inst->mlen += 2;
2337 }
2338 }
2339 }
2340
2341 emit(inst);
2342
2343 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2344 * faces * layers, but the spec requires layers, so divide Z by the 6 faces.
2345 */
2346 if (ir->op == ir_txs) {
2347 glsl_type const *type = ir->sampler->type;
2348 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2349 type->sampler_array) {
2350 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2351 with_writemask(inst->dst, WRITEMASK_Z),
2352 src_reg(inst->dst), src_reg(6));
2353 }
2354 }
2355
2356 swizzle_result(ir, src_reg(inst->dst), sampler);
2357 }
2358
2359 void
2360 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2361 {
2362 int s = key->tex.swizzles[sampler];
2363
2364 this->result = src_reg(this, ir->type);
2365 dst_reg swizzled_result(this->result);
2366
2367 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2368 || s == SWIZZLE_NOOP) {
2369 emit(MOV(swizzled_result, orig_val));
2370 return;
2371 }
2372
2373 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2374 int swizzle[4];
2375
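/* Split the requested texture swizzle (e.g. (ZERO, X, Y, ONE)) into channels
 * that copy from the sampler result, channels forced to 0.0, and channels
 * forced to 1.0; each group gets its own writemasked MOV below.
 */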
2376 for (int i = 0; i < 4; i++) {
2377 switch (GET_SWZ(s, i)) {
2378 case SWIZZLE_ZERO:
2379 zero_mask |= (1 << i);
2380 break;
2381 case SWIZZLE_ONE:
2382 one_mask |= (1 << i);
2383 break;
2384 default:
2385 copy_mask |= (1 << i);
2386 swizzle[i] = GET_SWZ(s, i);
2387 break;
2388 }
2389 }
2390
2391 if (copy_mask) {
2392 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2393 swizzled_result.writemask = copy_mask;
2394 emit(MOV(swizzled_result, orig_val));
2395 }
2396
2397 if (zero_mask) {
2398 swizzled_result.writemask = zero_mask;
2399 emit(MOV(swizzled_result, src_reg(0.0f)));
2400 }
2401
2402 if (one_mask) {
2403 swizzled_result.writemask = one_mask;
2404 emit(MOV(swizzled_result, src_reg(1.0f)));
2405 }
2406 }
2407
2408 void
2409 vec4_visitor::visit(ir_return *ir)
2410 {
2411 assert(!"not reached");
2412 }
2413
2414 void
2415 vec4_visitor::visit(ir_discard *ir)
2416 {
2417 assert(!"not reached");
2418 }
2419
2420 void
2421 vec4_visitor::visit(ir_if *ir)
2422 {
2423 /* Don't point the annotation at the if statement itself, because then the
2424 * annotation would print the if along with its entire then and else blocks.
2425 */
2426 this->base_ir = ir->condition;
2427
2428 if (intel->gen == 6) {
2429 emit_if_gen6(ir);
2430 } else {
2431 uint32_t predicate;
2432 emit_bool_to_cond_code(ir->condition, &predicate);
2433 emit(IF(predicate));
2434 }
2435
2436 visit_instructions(&ir->then_instructions);
2437
2438 if (!ir->else_instructions.is_empty()) {
2439 this->base_ir = ir->condition;
2440 emit(BRW_OPCODE_ELSE);
2441
2442 visit_instructions(&ir->else_instructions);
2443 }
2444
2445 this->base_ir = ir->condition;
2446 emit(BRW_OPCODE_ENDIF);
2447 }
2448
2449 void
2450 vec4_visitor::emit_ndc_computation()
2451 {
2452 /* Get the position */
2453 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2454
2455 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2456 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2457 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2458
2459 current_annotation = "NDC";
2460 dst_reg ndc_w = ndc;
2461 ndc_w.writemask = WRITEMASK_W;
2462 src_reg pos_w = pos;
2463 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2464 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2465
2466 dst_reg ndc_xyz = ndc;
2467 ndc_xyz.writemask = WRITEMASK_XYZ;
2468
2469 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2470 }
2471
2472 void
2473 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2474 {
2475 if (intel->gen < 6 &&
2476 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2477 key->userclip_active || brw->has_negative_rhw_bug)) {
2478 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2479 dst_reg header1_w = header1;
2480 header1_w.writemask = WRITEMASK_W;
2481 GLuint i;
2482
2483 emit(MOV(header1, 0u));
2484
2485 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2486 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2487
2488 current_annotation = "Point size";
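/* Scale by 2^11 and mask to bits 8..18, storing the point width as an
 * unsigned 8.3 fixed-point value in the header dword.
 */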
2489 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2490 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2491 }
2492
2493 current_annotation = "Clipping flags";
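/* For each user clip plane, compute dot(gl_Position, plane); if the result
 * is negative the vertex is outside that plane, so set the corresponding
 * clip-flag bit in the header with a predicated OR.
 */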
2494 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2495 vec4_instruction *inst;
2496
2497 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2498 src_reg(this->userplane[i])));
2499 inst->conditional_mod = BRW_CONDITIONAL_L;
2500
2501 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2502 inst->predicate = BRW_PREDICATE_NORMAL;
2503 }
2504
2505 /* i965 clipping workaround:
2506 * 1) Test for -ve rhw
2507 * 2) If set,
2508 * set ndc = (0,0,0,0)
2509 * set ucp[6] = 1
2510 *
2511 * Later, clipping will detect ucp[6] and ensure the primitive is
2512 * clipped against all fixed planes.
2513 */
2514 if (brw->has_negative_rhw_bug) {
2515 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2516 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2517 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2518 vec4_instruction *inst;
2519 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2520 inst->predicate = BRW_PREDICATE_NORMAL;
2521 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2522 inst->predicate = BRW_PREDICATE_NORMAL;
2523 }
2524
2525 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2526 } else if (intel->gen < 6) {
2527 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2528 } else {
2529 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2530 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2531 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2532 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2533 }
2534 }
2535 }
2536
2537 void
2538 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2539 {
2540 if (intel->gen < 6) {
2541 /* Clip distance slots are set aside in gen5, but they are not used. It
2542 * is not clear whether we actually need to set aside space for them,
2543 * but the performance cost is negligible.
2544 */
2545 return;
2546 }
2547
2548 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2549 *
2550 * "If a linked set of shaders forming the vertex stage contains no
2551 * static write to gl_ClipVertex or gl_ClipDistance, but the
2552 * application has requested clipping against user clip planes through
2553 * the API, then the coordinate written to gl_Position is used for
2554 * comparison against the user clip planes."
2555 *
2556 * This function is only called if the shader didn't write to
2557 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2558 * if the user wrote to it; otherwise we use gl_Position.
2559 */
2560 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2561 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2562 clip_vertex = VARYING_SLOT_POS;
2563 }
2564
2565 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2566 ++i) {
2567 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2568 src_reg(output_reg[clip_vertex]),
2569 src_reg(this->userplane[i + offset])));
2570 }
2571 }
2572
2573 void
2574 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2575 {
2576 assert (varying < VARYING_SLOT_MAX);
2577 reg.type = output_reg[varying].type;
2578 current_annotation = output_reg_annotation[varying];
2579 /* Copy the register, saturating if necessary */
2580 vec4_instruction *inst = emit(MOV(reg,
2581 src_reg(output_reg[varying])));
2582 if ((varying == VARYING_SLOT_COL0 ||
2583 varying == VARYING_SLOT_COL1 ||
2584 varying == VARYING_SLOT_BFC0 ||
2585 varying == VARYING_SLOT_BFC1) &&
2586 key->clamp_vertex_color) {
2587 inst->saturate = true;
2588 }
2589 }
2590
2591 void
2592 vec4_visitor::emit_urb_slot(int mrf, int varying)
2593 {
2594 struct brw_reg hw_reg = brw_message_reg(mrf);
2595 dst_reg reg = dst_reg(MRF, mrf);
2596 reg.type = BRW_REGISTER_TYPE_F;
2597
2598 switch (varying) {
2599 case VARYING_SLOT_PSIZ:
2600 /* PSIZ is always in slot 0, and is coupled with other flags. */
2601 current_annotation = "indices, point width, clip flags";
2602 emit_psiz_and_flags(hw_reg);
2603 break;
2604 case BRW_VARYING_SLOT_NDC:
2605 current_annotation = "NDC";
2606 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2607 break;
2608 case BRW_VARYING_SLOT_POS_DUPLICATE:
2609 case VARYING_SLOT_POS:
2610 current_annotation = "gl_Position";
2611 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2612 break;
2613 case VARYING_SLOT_CLIP_DIST0:
2614 case VARYING_SLOT_CLIP_DIST1:
2615 if (this->key->uses_clip_distance) {
2616 emit_generic_urb_slot(reg, varying);
2617 } else {
2618 current_annotation = "user clip distances";
2619 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2620 }
2621 break;
2622 case VARYING_SLOT_EDGE:
2623 /* This is present when doing unfilled polygons. We're supposed to copy
2624 * the edge flag from the user-provided vertex array
2625 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2626 * of that attribute (starts as 1.0f). This is then used in clipping to
2627 * determine which edges should be drawn as wireframe.
2628 */
2629 current_annotation = "edge flag";
2630 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2631 glsl_type::float_type, WRITEMASK_XYZW))));
2632 break;
2633 case BRW_VARYING_SLOT_PAD:
2634 /* No need to write to this slot */
2635 break;
2636 default:
2637 emit_generic_urb_slot(reg, varying);
2638 break;
2639 }
2640 }
2641
2642 static int
2643 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2644 {
2645 struct intel_context *intel = &brw->intel;
2646
2647 if (intel->gen >= 6) {
2648 /* URB data written (does not include the message header reg) must
2649 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2650 * section 5.4.3.2.2: URB_INTERLEAVED.
2651 *
2652 * URB entries are allocated on a multiple of 1024 bits, so an
2653 * extra 128 bits written here to make the end align to 256 is
2654 * no problem.
2655 */
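/* mlen includes the single header register, so an odd total means an even,
 * 256-bit-aligned amount of URB data.
 */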
2656 if ((mlen % 2) != 1)
2657 mlen++;
2658 }
2659
2660 return mlen;
2661 }
2662
2663 void
2664 vec4_vs_visitor::emit_urb_write_header(int mrf)
2665 {
2666 /* No need to do anything for VS; an implied write to this MRF will be
2667 * performed by VS_OPCODE_URB_WRITE.
2668 */
2669 (void) mrf;
2670 }
2671
2672 vec4_instruction *
2673 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2674 {
2675 /* For VS, the URB writes end the thread. */
2676 if (complete) {
2677 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2678 emit_shader_time_end();
2679 }
2680
2681 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2682 inst->eot = complete;
2683
2684 return inst;
2685 }
2686
2687 /**
2688 * Generates the VUE payload plus the necessary URB write instructions to
2689 * output it.
2690 *
2691 * The VUE layout is documented in Volume 2a.
2692 */
2693 void
2694 vec4_visitor::emit_vertex()
2695 {
2696 /* MRF 0 is reserved for the debugger, so start with message header
2697 * in MRF 1.
2698 */
2699 int base_mrf = 1;
2700 int mrf = base_mrf;
2701 /* In the process of generating our URB write message contents, we
2702 * may need to unspill a register or load from an array. Those
2703 * reads would use MRFs 14-15.
2704 */
2705 int max_usable_mrf = 13;
2706
2707 /* The following assertion verifies that max_usable_mrf causes an
2708 * even-numbered amount of URB write data, which will meet gen6's
2709 * requirements for length alignment.
2710 */
2711 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2712
2713 /* First mrf is the g0-based message header containing URB handles and
2714 * such.
2715 */
2716 emit_urb_write_header(mrf++);
2717
2718 if (intel->gen < 6) {
2719 emit_ndc_computation();
2720 }
2721
2722 /* Set up the VUE data for the first URB write */
2723 int slot;
2724 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2725 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2726
2727 /* If this was max_usable_mrf, we can't fit anything more into this URB
2728 * WRITE.
2729 */
2730 if (mrf > max_usable_mrf) {
2731 slot++;
2732 break;
2733 }
2734 }
2735
2736 bool complete = slot >= prog_data->vue_map.num_slots;
2737 current_annotation = "URB write";
2738 vec4_instruction *inst = emit_urb_write_opcode(complete);
2739 inst->base_mrf = base_mrf;
2740 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2741
2742 /* Optional second URB write */
2743 if (!complete) {
2744 mrf = base_mrf + 1;
2745
2746 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2747 assert(mrf < max_usable_mrf);
2748
2749 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2750 }
2751
2752 current_annotation = "URB write";
2753 inst = emit_urb_write_opcode(true /* complete */);
2754 inst->base_mrf = base_mrf;
2755 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2756 /* URB destination offset. The previous write used MRFs 1-13; minus the
2757 * one header MRF, that's 12 data regs. The URB offset is in
2758 * URB row increments, and each of our MRFs is half of one of
2759 * those, since we're doing interleaved writes.
2760 */
2761 inst->offset = (max_usable_mrf - base_mrf) / 2;
2762 }
2763 }
2764
2765 void
2766 vec4_vs_visitor::emit_thread_end()
2767 {
2768 /* For VS, we always end the thread by emitting a single vertex.
2769 * emit_urb_write_opcode() will take care of setting the eot flag on the
2770 * SEND instruction.
2771 */
2772 emit_vertex();
2773 }
2774
2775 src_reg
2776 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2777 src_reg *reladdr, int reg_offset)
2778 {
2779 /* Because we store the values to scratch interleaved like our
2780 * vertex data, we need to scale the vec4 index by 2.
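* (reg_offset counts 32-byte registers while the gen6+ message offset is in
* 16-byte units, hence the factor of 2; both vertices of the interleaved
* SIMD4x2 pair share each register.)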
2781 */
2782 int message_header_scale = 2;
2783
2784 /* Pre-gen6, the message header uses byte offsets instead of vec4
2785 * (16-byte) offset units.
2786 */
2787 if (intel->gen < 6)
2788 message_header_scale *= 16;
2789
2790 if (reladdr) {
2791 src_reg index = src_reg(this, glsl_type::int_type);
2792
2793 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2794 emit_before(inst, MUL(dst_reg(index),
2795 index, src_reg(message_header_scale)));
2796
2797 return index;
2798 } else {
2799 return src_reg(reg_offset * message_header_scale);
2800 }
2801 }
2802
2803 src_reg
2804 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2805 src_reg *reladdr, int reg_offset)
2806 {
2807 if (reladdr) {
2808 src_reg index = src_reg(this, glsl_type::int_type);
2809
2810 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2811
2812 /* Pre-gen6, the message header uses byte offsets instead of vec4
2813 * (16-byte) offset units.
2814 */
2815 if (intel->gen < 6) {
2816 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2817 }
2818
2819 return index;
2820 } else {
2821 int message_header_scale = intel->gen < 6 ? 16 : 1;
2822 return src_reg(reg_offset * message_header_scale);
2823 }
2824 }
2825
2826 /**
2827 * Emits an instruction before @inst to load the value named by @orig_src
2828 * from scratch space at @base_offset to @temp.
2829 *
2830 * @base_offset is measured in 32-byte units (the size of a register).
2831 */
2832 void
2833 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2834 dst_reg temp, src_reg orig_src,
2835 int base_offset)
2836 {
2837 int reg_offset = base_offset + orig_src.reg_offset;
2838 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2839
2840 emit_before(inst, SCRATCH_READ(temp, index));
2841 }
2842
2843 /**
2844 * Emits an instruction after @inst to store the value to be written
2845 * to @orig_dst to scratch space at @base_offset, from @temp.
2846 *
2847 * @base_offset is measured in 32-byte units (the size of a register).
2848 */
2849 void
2850 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2851 {
2852 int reg_offset = base_offset + inst->dst.reg_offset;
2853 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2854
2855 /* Create a temporary register to store *inst's result in.
2856 *
2857 * We have to be careful in MOVing from our temporary result register in
2858 * the scratch write. If we swizzle from channels of the temporary that
2859 * weren't initialized, it will confuse live interval analysis, which will
2860 * make spilling fail to make progress.
2861 */
2862 src_reg temp = src_reg(this, glsl_type::vec4_type);
2863 temp.type = inst->dst.type;
2864 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2865 int swizzles[4];
2866 for (int i = 0; i < 4; i++)
2867 if (inst->dst.writemask & (1 << i))
2868 swizzles[i] = i;
2869 else
2870 swizzles[i] = first_writemask_chan;
2871 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2872 swizzles[2], swizzles[3]);
2873
2874 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2875 inst->dst.writemask));
2876 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2877 write->predicate = inst->predicate;
2878 write->ir = inst->ir;
2879 write->annotation = inst->annotation;
2880 inst->insert_after(write);
2881
2882 inst->dst.file = temp.file;
2883 inst->dst.reg = temp.reg;
2884 inst->dst.reg_offset = temp.reg_offset;
2885 inst->dst.reladdr = NULL;
2886 }
2887
2888 /**
2889 * We can't generally support array access in GRF space, because a
2890 * single instruction's destination can only span 2 contiguous
2891 * registers. So, we send all GRF arrays that get variable index
2892 * access to scratch space.
2893 */
2894 void
2895 vec4_visitor::move_grf_array_access_to_scratch()
2896 {
2897 int scratch_loc[this->virtual_grf_count];
2898
2899 for (int i = 0; i < this->virtual_grf_count; i++) {
2900 scratch_loc[i] = -1;
2901 }
2902
2903 /* First, calculate the set of virtual GRFs that need to be punted
2904 * to scratch due to having any array access on them, and where in
2905 * scratch.
2906 */
2907 foreach_list(node, &this->instructions) {
2908 vec4_instruction *inst = (vec4_instruction *)node;
2909
2910 if (inst->dst.file == GRF && inst->dst.reladdr &&
2911 scratch_loc[inst->dst.reg] == -1) {
2912 scratch_loc[inst->dst.reg] = c->last_scratch;
2913 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2914 }
2915
2916 for (int i = 0 ; i < 3; i++) {
2917 src_reg *src = &inst->src[i];
2918
2919 if (src->file == GRF && src->reladdr &&
2920 scratch_loc[src->reg] == -1) {
2921 scratch_loc[src->reg] = c->last_scratch;
2922 c->last_scratch += this->virtual_grf_sizes[src->reg];
2923 }
2924 }
2925 }
2926
2927 /* Now, for anything that will be accessed through scratch, rewrite
2928 * it to load/store. Note that this is a _safe list walk, because
2929 * we may generate a new scratch_write instruction after the one
2930 * we're processing.
2931 */
2932 foreach_list_safe(node, &this->instructions) {
2933 vec4_instruction *inst = (vec4_instruction *)node;
2934
2935 /* Set up the annotation tracking for new generated instructions. */
2936 base_ir = inst->ir;
2937 current_annotation = inst->annotation;
2938
2939 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2940 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2941 }
2942
2943 for (int i = 0 ; i < 3; i++) {
2944 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2945 continue;
2946
2947 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2948
2949 emit_scratch_read(inst, temp, inst->src[i],
2950 scratch_loc[inst->src[i].reg]);
2951
2952 inst->src[i].file = temp.file;
2953 inst->src[i].reg = temp.reg;
2954 inst->src[i].reg_offset = temp.reg_offset;
2955 inst->src[i].reladdr = NULL;
2956 }
2957 }
2958 }
2959
2960 /**
2961 * Emits an instruction before @inst to load the value named by @orig_src
2962 * from the pull constant buffer (surface) at @base_offset to @temp.
2963 */
2964 void
2965 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2966 dst_reg temp, src_reg orig_src,
2967 int base_offset)
2968 {
2969 int reg_offset = base_offset + orig_src.reg_offset;
2970 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2971 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2972 vec4_instruction *load;
2973
2974 if (intel->gen >= 7) {
2975 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2976 grf_offset.type = offset.type;
2977 emit_before(inst, MOV(grf_offset, offset));
2978
2979 load = new(mem_ctx) vec4_instruction(this,
2980 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2981 temp, index, src_reg(grf_offset));
2982 } else {
2983 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2984 temp, index, offset);
2985 load->base_mrf = 14;
2986 load->mlen = 1;
2987 }
2988 emit_before(inst, load);
2989 }
2990
2991 /**
2992 * Implements array access of uniforms by inserting a
2993 * PULL_CONSTANT_LOAD instruction.
2994 *
2995 * Unlike temporary GRF array access (where we don't support it due to
2996 * the difficulty of doing relative addressing on instruction
2997 * destinations), we could potentially do array access of uniforms
2998 * that were loaded in GRF space as push constants. In real-world
2999 * usage we've seen, though, the arrays being used are always larger
3000 * than we could load as push constants, so just always move all
3001 * uniform array access out to a pull constant buffer.
3002 */
3003 void
3004 vec4_visitor::move_uniform_array_access_to_pull_constants()
3005 {
3006 int pull_constant_loc[this->uniforms];
3007
3008 for (int i = 0; i < this->uniforms; i++) {
3009 pull_constant_loc[i] = -1;
3010 }
3011
3012 /* Walk through and find array access of uniforms. Put a copy of that
3013 * uniform in the pull constant buffer.
3014 *
3015 * Note that we don't move constant-indexed accesses to arrays. No
3016 * testing has been done of the performance impact of this choice.
3017 */
3018 foreach_list_safe(node, &this->instructions) {
3019 vec4_instruction *inst = (vec4_instruction *)node;
3020
3021 for (int i = 0 ; i < 3; i++) {
3022 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3023 continue;
3024
3025 int uniform = inst->src[i].reg;
3026
3027 /* If this array isn't already present in the pull constant buffer,
3028 * add it.
3029 */
3030 if (pull_constant_loc[uniform] == -1) {
3031 const float **values = &prog_data->param[uniform * 4];
3032
3033 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3034
3035 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3036 prog_data->pull_param[prog_data->nr_pull_params++]
3037 = values[j];
3038 }
3039 }
3040
3041 /* Set up the annotation tracking for new generated instructions. */
3042 base_ir = inst->ir;
3043 current_annotation = inst->annotation;
3044
3045 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3046
3047 emit_pull_constant_load(inst, temp, inst->src[i],
3048 pull_constant_loc[uniform]);
3049
3050 inst->src[i].file = temp.file;
3051 inst->src[i].reg = temp.reg;
3052 inst->src[i].reg_offset = temp.reg_offset;
3053 inst->src[i].reladdr = NULL;
3054 }
3055 }
3056
3057 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3058 * no need to track them as larger-than-vec4 objects. This will be
3059 * relied on in cutting out unused uniform vectors from push
3060 * constants.
3061 */
3062 split_uniform_registers();
3063 }
3064
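/* A negated UD-typed source can't be used directly everywhere, so resolve
 * the negation into a temporary with an explicit MOV and hand that back.
 */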
3065 void
3066 vec4_visitor::resolve_ud_negate(src_reg *reg)
3067 {
3068 if (reg->type != BRW_REGISTER_TYPE_UD ||
3069 !reg->negate)
3070 return;
3071
3072 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3073 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3074 *reg = temp;
3075 }
3076
3077 vec4_visitor::vec4_visitor(struct brw_context *brw,
3078 struct brw_vec4_compile *c,
3079 struct gl_program *prog,
3080 const struct brw_vec4_prog_key *key,
3081 struct brw_vec4_prog_data *prog_data,
3082 struct gl_shader_program *shader_prog,
3083 struct brw_shader *shader,
3084 void *mem_ctx,
3085 bool debug_flag)
3086 : debug_flag(debug_flag)
3087 {
3088 this->brw = brw;
3089 this->intel = &brw->intel;
3090 this->ctx = &intel->ctx;
3091 this->shader_prog = shader_prog;
3092 this->shader = shader;
3093
3094 this->mem_ctx = mem_ctx;
3095 this->failed = false;
3096
3097 this->base_ir = NULL;
3098 this->current_annotation = NULL;
3099 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3100
3101 this->c = c;
3102 this->prog = prog;
3103 this->key = key;
3104 this->prog_data = prog_data;
3105
3106 this->variable_ht = hash_table_ctor(0,
3107 hash_table_pointer_hash,
3108 hash_table_pointer_compare);
3109
3110 this->virtual_grf_def = NULL;
3111 this->virtual_grf_use = NULL;
3112 this->virtual_grf_sizes = NULL;
3113 this->virtual_grf_count = 0;
3114 this->virtual_grf_reg_map = NULL;
3115 this->virtual_grf_reg_count = 0;
3116 this->virtual_grf_array_size = 0;
3117 this->live_intervals_valid = false;
3118
3119 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3120
3121 this->uniforms = 0;
3122 }
3123
3124 vec4_visitor::~vec4_visitor()
3125 {
3126 hash_table_dtor(this->variable_ht);
3127 }
3128
3129
3130 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3131 struct brw_vs_compile *vs_compile,
3132 struct brw_vs_prog_data *vs_prog_data,
3133 struct gl_shader_program *prog,
3134 struct brw_shader *shader,
3135 void *mem_ctx)
3136 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3137 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3138 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3139 vs_compile(vs_compile),
3140 vs_prog_data(vs_prog_data)
3141 {
3142 }
3143
3144
3145 void
3146 vec4_visitor::fail(const char *format, ...)
3147 {
3148 va_list va;
3149 char *msg;
3150
3151 if (failed)
3152 return;
3153
3154 failed = true;
3155
3156 va_start(va, format);
3157 msg = ralloc_vasprintf(mem_ctx, format, va);
3158 va_end(va);
3159 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3160
3161 this->fail_msg = msg;
3162
3163 if (debug_flag) {
3164 fprintf(stderr, "%s", msg);
3165 }
3166 }
3167
3168 } /* namespace brw */