i965/vs: Add a function to fix-up uniform arguments for 3-src insts.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_3src_operand(src_reg src)
228 {
229 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
230 * able to use vertical stride of zero to replicate the vec4 uniform, like
231 *
232 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
233 *
234 * But you can't, since vertical stride is always four in three-source
235 * instructions. Instead, insert a MOV instruction to do the replication so
236 * that the three-source instruction can consume it.
237 */
238
239 /* The MOV is only needed if the source is a uniform or immediate. */
240 if (src.file != UNIFORM && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
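/* As a sketch of what this fix-up buys a caller (the MAD below is only an
 * illustrative three-source opcode, not code emitted in this function):
 * instead of the illegal
 *
 *    mad dst, src0, u1<0;4,1>:f, src2
 *
 * one would write
 *
 *    src_reg b = fix_3src_operand(uniform_b);  // MOV tmp, uniform_b
 *    emit(BRW_OPCODE_MAD, dst, src0, b, src2); // MAD reads tmp, a plain GRF
 *
 * so the three-source instruction never sees a uniform or immediate operand.
 */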
248
249 src_reg
250 vec4_visitor::fix_math_operand(src_reg src)
251 {
252 /* The gen6 math instruction ignores the source modifiers --
253 * swizzle, abs, negate, and at least some parts of the register
254 * region description.
255 *
256 * Rather than trying to enumerate all these cases, *always* expand the
257 * operand to a temp GRF for gen6.
258 *
259 * For gen7, keep the operand as-is, except if immediate, which gen7 still
260 * can't use.
261 */
262
263 if (intel->gen == 7 && src.file != IMM)
264 return src;
265
266 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
267 expanded.type = src.type;
268 emit(MOV(expanded, src));
269 return src_reg(expanded);
270 }
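/* Concretely: on gen6 every math operand is first copied into a fresh
 * temporary, so the math instruction never sees a swizzle, negate or
 * immediate directly; on gen7 only an immediate operand gets that extra MOV,
 * and GRF or uniform sources pass through unchanged.
 */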
271
272 void
273 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
274 {
275 src = fix_math_operand(src);
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src0 = fix_math_operand(src0);
328 src1 = fix_math_operand(src1);
329
330 if (dst.writemask != WRITEMASK_XYZW) {
331 /* The gen6 math instruction must be align1, so we can't do
332 * writemasks.
333 */
334 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
335 temp_dst.type = dst.type;
336
337 emit(opcode, temp_dst, src0, src1);
338
339 emit(MOV(dst, src_reg(temp_dst)));
340 } else {
341 emit(opcode, dst, src0, src1);
342 }
343 }
344
345 void
346 vec4_visitor::emit_math2_gen4(enum opcode opcode,
347 dst_reg dst, src_reg src0, src_reg src1)
348 {
349 vec4_instruction *inst = emit(opcode, dst, src0, src1);
350 inst->base_mrf = 1;
351 inst->mlen = 2;
352 }
353
354 void
355 vec4_visitor::emit_math(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 switch (opcode) {
359 case SHADER_OPCODE_POW:
360 case SHADER_OPCODE_INT_QUOTIENT:
361 case SHADER_OPCODE_INT_REMAINDER:
362 break;
363 default:
364 assert(!"not reached: unsupported binary math opcode");
365 return;
366 }
367
368 if (intel->gen >= 6) {
369 return emit_math2_gen6(opcode, dst, src0, src1);
370 } else {
371 return emit_math2_gen4(opcode, dst, src0, src1);
372 }
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (intel->gen < 7)
379 assert(!"ir_unop_pack_half_2x16 should be lowered");
380
381 assert(dst.type == BRW_REGISTER_TYPE_UD);
382 assert(src0.type == BRW_REGISTER_TYPE_F);
383
384 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
385 *
386 * Because this instruction does not have a 16-bit floating-point type,
387 * the destination data type must be Word (W).
388 *
389 * The destination must be DWord-aligned and specify a horizontal stride
390 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
391 * each destination channel and the upper word is not modified.
392 *
393 * The above restriction implies that the f32to16 instruction must use
394 * align1 mode, because only in align1 mode is it possible to specify
395 * horizontal stride. We choose here to defy the hardware docs and emit
396 * align16 instructions.
397 *
398 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
399 * instructions. I was partially successful in that the code passed all
400 * tests. However, the code was dubiously correct and fragile, and the
401 * tests were not harsh enough to probe that frailty. Not trusting the
402 * code, I chose instead to remain in align16 mode in defiance of the hw
403 * docs).
404 *
405 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
406 * simulator, emitting a f32to16 in align16 mode with UD as destination
407 * data type is safe. The behavior differs from that specified in the PRM
408 * in that the upper word of each destination channel is cleared to 0.
409 */
410
411 dst_reg tmp_dst(this, glsl_type::uvec2_type);
412 src_reg tmp_src(tmp_dst);
413
414 #if 0
415 /* Verify the undocumented behavior on which the following instructions
416 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
417 * then the result of the bit-or instruction below will be incorrect.
418 *
419 * You should inspect the disasm output in order to verify that the MOV is
420 * not optimized away.
421 */
422 emit(MOV(tmp_dst, src_reg(0x12345678u)));
423 #endif
424
425 /* Give tmp the form below, where "." means untouched.
426 *
427 * w z y x w z y x
428 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
429 *
430 * That the upper word of each write-channel be 0 is required for the
431 * following bit-shift and bit-or instructions to work. Note that this
432 * relies on the undocumented hardware behavior mentioned above.
433 */
434 tmp_dst.writemask = WRITEMASK_XY;
435 emit(F32TO16(tmp_dst, src0));
436
437 /* Give the write-channels of dst the form:
438 * 0xhhhh0000
439 */
440 tmp_src.swizzle = SWIZZLE_Y;
441 emit(SHL(dst, tmp_src, src_reg(16u)));
442
443 /* Finally, give the write-channels of dst the form of packHalf2x16's
444 * output:
445 * 0xhhhhllll
446 */
447 tmp_src.swizzle = SWIZZLE_X;
448 emit(OR(dst, src_reg(dst), tmp_src));
449 }
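/* Worked example of the packing above, with values chosen purely for
 * illustration: if src0.x half-converts to 0x3c00 (1.0) and src0.y to 0xc000
 * (-2.0), the F32TO16 leaves tmp = |.|.|0x0000c000|0x00003c00|, the SHL of
 * tmp.y by 16 writes 0xc0000000 into dst, and the final OR with tmp.x yields
 * 0xc0003c00, which matches packHalf2x16(vec2(1.0, -2.0)).
 */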
450
451 void
452 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
453 {
454 if (intel->gen < 7)
455 assert(!"ir_unop_unpack_half_2x16 should be lowered");
456
457 assert(dst.type == BRW_REGISTER_TYPE_F);
458 assert(src0.type == BRW_REGISTER_TYPE_UD);
459
460 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
461 *
462 * Because this instruction does not have a 16-bit floating-point type,
463 * the source data type must be Word (W). The destination type must be
464 * F (Float).
465 *
466 * To use W as the source data type, we must adjust horizontal strides,
467 * which is only possible in align1 mode. All my [chadv] attempts at
468 * emitting align1 instructions for unpackHalf2x16 failed to pass the
469 * Piglit tests, so I gave up.
470 *
471 * I've verified that, on gen7 hardware and the simulator, it is safe to
472 * emit f16to32 in align16 mode with UD as source data type.
473 */
474
475 dst_reg tmp_dst(this, glsl_type::uvec2_type);
476 src_reg tmp_src(tmp_dst);
477
478 tmp_dst.writemask = WRITEMASK_X;
479 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
480
481 tmp_dst.writemask = WRITEMASK_Y;
482 emit(SHR(tmp_dst, src0, src_reg(16u)));
483
484 dst.writemask = WRITEMASK_XY;
485 emit(F16TO32(dst, tmp_src));
486 }
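/* Worked example of the unpacking above, again with illustrative values: for
 * src0 = 0xc0003c00 the AND leaves tmp.x = 0x00003c00, the SHR leaves
 * tmp.y = 0x0000c000, and F16TO32 expands those to dst.xy = (1.0, -2.0),
 * matching unpackHalf2x16(0xc0003c00).
 */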
487
488 void
489 vec4_visitor::visit_instructions(const exec_list *list)
490 {
491 foreach_list(node, list) {
492 ir_instruction *ir = (ir_instruction *)node;
493
494 base_ir = ir;
495 ir->accept(this);
496 }
497 }
498
499
500 static int
501 type_size(const struct glsl_type *type)
502 {
503 unsigned int i;
504 int size;
505
506 switch (type->base_type) {
507 case GLSL_TYPE_UINT:
508 case GLSL_TYPE_INT:
509 case GLSL_TYPE_FLOAT:
510 case GLSL_TYPE_BOOL:
511 if (type->is_matrix()) {
512 return type->matrix_columns;
513 } else {
514 /* Regardless of the size of the vector, it gets a vec4. This is bad
515 * packing for things like floats, but otherwise arrays become a
516 * mess. Hopefully a later pass over the code can pack scalars
517 * down if appropriate.
518 */
519 return 1;
520 }
521 case GLSL_TYPE_ARRAY:
522 assert(type->length > 0);
523 return type_size(type->fields.array) * type->length;
524 case GLSL_TYPE_STRUCT:
525 size = 0;
526 for (i = 0; i < type->length; i++) {
527 size += type_size(type->fields.structure[i].type);
528 }
529 return size;
530 case GLSL_TYPE_SAMPLER:
531 /* Samplers take up one slot in UNIFORMS[], but they're baked in
532 * at link time.
533 */
534 return 1;
535 case GLSL_TYPE_VOID:
536 case GLSL_TYPE_ERROR:
537 case GLSL_TYPE_INTERFACE:
538 assert(0);
539 break;
540 }
541
542 return 0;
543 }
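/* For example, under the rules above: a float or a vec3 takes one slot, a
 * mat4 takes four, a vec2[3] array takes three, and a struct holding a vec4
 * plus a float[2] takes 1 + 2 = 3 slots, each slot being one vec4 of uniform
 * or register space.
 */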
544
545 int
546 vec4_visitor::virtual_grf_alloc(int size)
547 {
548 if (virtual_grf_array_size <= virtual_grf_count) {
549 if (virtual_grf_array_size == 0)
550 virtual_grf_array_size = 16;
551 else
552 virtual_grf_array_size *= 2;
553 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
554 virtual_grf_array_size);
555 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
556 virtual_grf_array_size);
557 }
558 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
559 virtual_grf_reg_count += size;
560 virtual_grf_sizes[virtual_grf_count] = size;
561 return virtual_grf_count++;
562 }
563
564 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
565 {
566 init();
567
568 this->file = GRF;
569 this->reg = v->virtual_grf_alloc(type_size(type));
570
571 if (type->is_array() || type->is_record()) {
572 this->swizzle = BRW_SWIZZLE_NOOP;
573 } else {
574 this->swizzle = swizzle_for_size(type->vector_elements);
575 }
576
577 this->type = brw_type_for_base_type(type);
578 }
579
580 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
581 {
582 init();
583
584 this->file = GRF;
585 this->reg = v->virtual_grf_alloc(type_size(type));
586
587 if (type->is_array() || type->is_record()) {
588 this->writemask = WRITEMASK_XYZW;
589 } else {
590 this->writemask = (1 << type->vector_elements) - 1;
591 }
592
593 this->type = brw_type_for_base_type(type);
594 }
595
596 /* Our support for uniforms is piggy-backed on the struct
597 * gl_fragment_program, because that's where the values actually
598 * get stored, rather than in some global gl_shader_program uniform
599 * store.
600 */
601 void
602 vec4_visitor::setup_uniform_values(ir_variable *ir)
603 {
604 int namelen = strlen(ir->name);
605
606 /* The data for our (non-builtin) uniforms is stored in a series of
607 * gl_uniform_driver_storage structs for each subcomponent that
608 * glGetUniformLocation() could name. We know it's been set up in the same
609 * order we'd walk the type, so walk the list of storage and find anything
610 * with our name, or the prefix of a component that starts with our name.
611 */
612 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
613 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
614
615 if (strncmp(ir->name, storage->name, namelen) != 0 ||
616 (storage->name[namelen] != 0 &&
617 storage->name[namelen] != '.' &&
618 storage->name[namelen] != '[')) {
619 continue;
620 }
621
622 gl_constant_value *components = storage->storage;
623 unsigned vector_count = (MAX2(storage->array_elements, 1) *
624 storage->type->matrix_columns);
625
626 for (unsigned s = 0; s < vector_count; s++) {
627 uniform_vector_size[uniforms] = storage->type->vector_elements;
628
629 int i;
630 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
631 prog_data->param[uniforms * 4 + i] = &components->f;
632 components++;
633 }
634 for (; i < 4; i++) {
635 static float zero = 0;
636 prog_data->param[uniforms * 4 + i] = &zero;
637 }
638
639 uniforms++;
640 }
641 }
642 }
643
644 void
645 vec4_visitor::setup_uniform_clipplane_values()
646 {
647 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
648
649 if (intel->gen < 6) {
650 /* Pre-Gen6, we compact clip planes. For example, if the user
651 * enables just clip planes 0, 1, and 3, we will enable clip planes
652 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
653 * plane 2. This simplifies the implementation of the Gen6 clip
654 * thread.
655 */
656 int compacted_clipplane_index = 0;
657 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
658 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
659 continue;
660
661 this->uniform_vector_size[this->uniforms] = 4;
662 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
663 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
664 for (int j = 0; j < 4; ++j) {
665 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
666 }
667 ++compacted_clipplane_index;
668 ++this->uniforms;
669 }
670 } else {
671 /* In Gen6 and later, we don't compact clip planes, because this
672 * simplifies the implementation of gl_ClipDistance.
673 */
674 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
675 this->uniform_vector_size[this->uniforms] = 4;
676 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
677 this->userplane[i].type = BRW_REGISTER_TYPE_F;
678 for (int j = 0; j < 4; ++j) {
679 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
680 }
681 ++this->uniforms;
682 }
683 }
684 }
685
686 /* Our support for builtin uniforms is even scarier than non-builtin.
687 * It sits on top of the PROG_STATE_VAR parameters that are
688 * automatically updated from GL context state.
689 */
690 void
691 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
692 {
693 const ir_state_slot *const slots = ir->state_slots;
694 assert(ir->state_slots != NULL);
695
696 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
697 /* This state reference has already been setup by ir_to_mesa,
698 * but we'll get the same index back here. We can reference
699 * ParameterValues directly, since unlike brw_fs.cpp, we never
700 * add new state references during compile.
701 */
702 int index = _mesa_add_state_reference(this->prog->Parameters,
703 (gl_state_index *)slots[i].tokens);
704 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
705
706 this->uniform_vector_size[this->uniforms] = 0;
707 /* Add each of the unique swizzled channels of the element.
708 * This will end up matching the size of the glsl_type of this field.
709 */
710 int last_swiz = -1;
711 for (unsigned int j = 0; j < 4; j++) {
712 int swiz = GET_SWZ(slots[i].swizzle, j);
713 last_swiz = swiz;
714
715 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
716 if (swiz <= last_swiz)
717 this->uniform_vector_size[this->uniforms]++;
718 }
719 this->uniforms++;
720 }
721 }
722
723 dst_reg *
724 vec4_visitor::variable_storage(ir_variable *var)
725 {
726 return (dst_reg *)hash_table_find(this->variable_ht, var);
727 }
728
729 void
730 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
731 {
732 ir_expression *expr = ir->as_expression();
733
734 *predicate = BRW_PREDICATE_NORMAL;
735
736 if (expr) {
737 src_reg op[2];
738 vec4_instruction *inst;
739
740 assert(expr->get_num_operands() <= 2);
741 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
742 expr->operands[i]->accept(this);
743 op[i] = this->result;
744
745 resolve_ud_negate(&op[i]);
746 }
747
748 switch (expr->operation) {
749 case ir_unop_logic_not:
750 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
751 inst->conditional_mod = BRW_CONDITIONAL_Z;
752 break;
753
754 case ir_binop_logic_xor:
755 inst = emit(XOR(dst_null_d(), op[0], op[1]));
756 inst->conditional_mod = BRW_CONDITIONAL_NZ;
757 break;
758
759 case ir_binop_logic_or:
760 inst = emit(OR(dst_null_d(), op[0], op[1]));
761 inst->conditional_mod = BRW_CONDITIONAL_NZ;
762 break;
763
764 case ir_binop_logic_and:
765 inst = emit(AND(dst_null_d(), op[0], op[1]));
766 inst->conditional_mod = BRW_CONDITIONAL_NZ;
767 break;
768
769 case ir_unop_f2b:
770 if (intel->gen >= 6) {
771 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
772 } else {
773 inst = emit(MOV(dst_null_f(), op[0]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 }
776 break;
777
778 case ir_unop_i2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_d(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_binop_all_equal:
788 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
789 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
790 break;
791
792 case ir_binop_any_nequal:
793 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
794 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
795 break;
796
797 case ir_unop_any:
798 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
799 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
800 break;
801
802 case ir_binop_greater:
803 case ir_binop_gequal:
804 case ir_binop_less:
805 case ir_binop_lequal:
806 case ir_binop_equal:
807 case ir_binop_nequal:
808 emit(CMP(dst_null_d(), op[0], op[1],
809 brw_conditional_for_comparison(expr->operation)));
810 break;
811
812 default:
813 assert(!"not reached");
814 break;
815 }
816 return;
817 }
818
819 ir->accept(this);
820
821 resolve_ud_negate(&this->result);
822
823 if (intel->gen >= 6) {
824 vec4_instruction *inst = emit(AND(dst_null_d(),
825 this->result, src_reg(1)));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 } else {
828 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 }
831 }
832
833 /**
834 * Emit a gen6 IF statement with the comparison folded into the IF
835 * instruction.
836 */
837 void
838 vec4_visitor::emit_if_gen6(ir_if *ir)
839 {
840 ir_expression *expr = ir->condition->as_expression();
841
842 if (expr) {
843 src_reg op[2];
844 dst_reg temp;
845
846 assert(expr->get_num_operands() <= 2);
847 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
848 expr->operands[i]->accept(this);
849 op[i] = this->result;
850 }
851
852 switch (expr->operation) {
853 case ir_unop_logic_not:
854 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
855 return;
856
857 case ir_binop_logic_xor:
858 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_binop_logic_or:
862 temp = dst_reg(this, glsl_type::bool_type);
863 emit(OR(temp, op[0], op[1]));
864 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
865 return;
866
867 case ir_binop_logic_and:
868 temp = dst_reg(this, glsl_type::bool_type);
869 emit(AND(temp, op[0], op[1]));
870 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
871 return;
872
873 case ir_unop_f2b:
874 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
875 return;
876
877 case ir_unop_i2b:
878 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 emit(IF(op[0], op[1],
888 brw_conditional_for_comparison(expr->operation)));
889 return;
890
891 case ir_binop_all_equal:
892 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
893 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
894 return;
895
896 case ir_binop_any_nequal:
897 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
898 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
899 return;
900
901 case ir_unop_any:
902 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
903 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
904 return;
905
906 default:
907 assert(!"not reached");
908 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910 }
911 return;
912 }
913
914 ir->condition->accept(this);
915
916 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
917 }
918
919 static dst_reg
920 with_writemask(dst_reg const & r, int mask)
921 {
922 dst_reg result = r;
923 result.writemask = mask;
924 return result;
925 }
926
927 void
928 vec4_vs_visitor::emit_prolog()
929 {
930 dst_reg sign_recovery_shift;
931 dst_reg normalize_factor;
932 dst_reg es3_normalize_factor;
933
934 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
935 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
936 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
937 dst_reg reg(ATTR, i);
938 dst_reg reg_d = reg;
939 reg_d.type = BRW_REGISTER_TYPE_D;
940 dst_reg reg_ud = reg;
941 reg_ud.type = BRW_REGISTER_TYPE_UD;
942
943 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
944 * come in as floating point conversions of the integer values.
945 */
946 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
947 dst_reg dst = reg;
948 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
949 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
950 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
951 }
952
953 /* Do sign recovery for 2101010 formats if required. */
954 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
955 if (sign_recovery_shift.file == BAD_FILE) {
956 /* shift constant: <22,22,22,30> */
957 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
958 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
959 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
960 }
961
962 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
963 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
964 }
965
966 /* Apply BGRA swizzle if required. */
967 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
968 src_reg temp = src_reg(reg);
969 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
970 emit(MOV(reg, temp));
971 }
972
973 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
974 /* ES 3.0 has different rules for converting signed normalized
975 * fixed-point numbers than desktop GL.
976 */
977 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
978 /* According to equation 2.2 of the ES 3.0 specification,
979 * signed normalization conversion is done by:
980 *
981 * f = c / (2^(b-1)-1)
982 */
983 if (es3_normalize_factor.file == BAD_FILE) {
984 /* mul constant: 1 / (2^(b-1) - 1) */
985 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
986 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
987 src_reg(1.0f / ((1<<9) - 1))));
988 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
989 src_reg(1.0f / ((1<<1) - 1))));
990 }
991
992 dst_reg dst = reg;
993 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
994 emit(MOV(dst, src_reg(reg_d)));
995 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
996 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
997 } else {
998 /* The following equations are from the OpenGL 3.2 specification:
999 *
1000 * 2.1 unsigned normalization
1001 * f = c/(2^n-1)
1002 *
1003 * 2.2 signed normalization
1004 * f = (2c+1)/(2^n-1)
1005 *
1006 * Both of these share a common divisor, which is represented by
1007 * "normalize_factor" in the code below.
1008 */
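/* A quick numeric check of those formulas, with an illustrative value: for a
 * 10-bit signed component c = -256, desktop GL gives
 * f = (2*(-256) + 1) / (2^10 - 1) = -511/1023, roughly -0.4995, which is
 * exactly what the 2c+1 adjustment and the 1/(2^10 - 1) normalize_factor
 * below compute; the 2-bit W component uses 1/(2^2 - 1) = 1/3 instead.
 */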
1009 if (normalize_factor.file == BAD_FILE) {
1010 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1011 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1012 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1013 src_reg(1.0f / ((1<<10) - 1))));
1014 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1015 src_reg(1.0f / ((1<<2) - 1))));
1016 }
1017
1018 dst_reg dst = reg;
1019 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1020 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1021
1022 /* For signed normalization, we want the numerator to be 2c+1. */
1023 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1024 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1025 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1026 }
1027
1028 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1029 }
1030 }
1031
1032 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1033 dst_reg dst = reg;
1034 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1035 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1036 }
1037 }
1038 }
1039 }
1040
1041
1042 dst_reg *
1043 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1044 {
1045 /* VertexID is stored by the VF as the last vertex element, but
1046 * we don't represent it with a flag in inputs_read, so we call
1047 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1048 */
1049 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1050 vs_prog_data->uses_vertexid = true;
1051
1052 switch (ir->location) {
1053 case SYSTEM_VALUE_VERTEX_ID:
1054 reg->writemask = WRITEMASK_X;
1055 break;
1056 case SYSTEM_VALUE_INSTANCE_ID:
1057 reg->writemask = WRITEMASK_Y;
1058 break;
1059 default:
1060 assert(!"not reached");
1061 break;
1062 }
1063
1064 return reg;
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_variable *ir)
1070 {
1071 dst_reg *reg = NULL;
1072
1073 if (variable_storage(ir))
1074 return;
1075
1076 switch (ir->mode) {
1077 case ir_var_shader_in:
1078 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1079 break;
1080
1081 case ir_var_shader_out:
1082 reg = new(mem_ctx) dst_reg(this, ir->type);
1083
1084 for (int i = 0; i < type_size(ir->type); i++) {
1085 output_reg[ir->location + i] = *reg;
1086 output_reg[ir->location + i].reg_offset = i;
1087 output_reg[ir->location + i].type =
1088 brw_type_for_base_type(ir->type->get_scalar_type());
1089 output_reg_annotation[ir->location + i] = ir->name;
1090 }
1091 break;
1092
1093 case ir_var_auto:
1094 case ir_var_temporary:
1095 reg = new(mem_ctx) dst_reg(this, ir->type);
1096 break;
1097
1098 case ir_var_uniform:
1099 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1100
1101 /* Thanks to the lower_ubo_reference pass, we will see only
1102 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1103 * variables, so no need for them to be in variable_ht.
1104 */
1105 if (ir->is_in_uniform_block())
1106 return;
1107
1108 /* Track how big the whole uniform variable is, in case we need to put a
1109 * copy of its data into pull constants for array access.
1110 */
1111 this->uniform_size[this->uniforms] = type_size(ir->type);
1112
1113 if (!strncmp(ir->name, "gl_", 3)) {
1114 setup_builtin_uniform_values(ir);
1115 } else {
1116 setup_uniform_values(ir);
1117 }
1118 break;
1119
1120 case ir_var_system_value:
1121 reg = make_reg_for_system_value(ir);
1122 break;
1123
1124 default:
1125 assert(!"not reached");
1126 }
1127
1128 reg->type = brw_type_for_base_type(ir->type);
1129 hash_table_insert(this->variable_ht, reg, ir);
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_loop *ir)
1134 {
1135 dst_reg counter;
1136
1137 /* We don't want debugging output to print the whole body of the
1138 * loop as the annotation.
1139 */
1140 this->base_ir = NULL;
1141
1142 if (ir->counter != NULL) {
1143 this->base_ir = ir->counter;
1144 ir->counter->accept(this);
1145 counter = *(variable_storage(ir->counter));
1146
1147 if (ir->from != NULL) {
1148 this->base_ir = ir->from;
1149 ir->from->accept(this);
1150
1151 emit(MOV(counter, this->result));
1152 }
1153 }
1154
1155 emit(BRW_OPCODE_DO);
1156
1157 if (ir->to) {
1158 this->base_ir = ir->to;
1159 ir->to->accept(this);
1160
1161 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1162 brw_conditional_for_comparison(ir->cmp)));
1163
1164 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1165 inst->predicate = BRW_PREDICATE_NORMAL;
1166 }
1167
1168 visit_instructions(&ir->body_instructions);
1169
1170
1171 if (ir->increment) {
1172 this->base_ir = ir->increment;
1173 ir->increment->accept(this);
1174 emit(ADD(counter, src_reg(counter), this->result));
1175 }
1176
1177 emit(BRW_OPCODE_WHILE);
1178 }
1179
1180 void
1181 vec4_visitor::visit(ir_loop_jump *ir)
1182 {
1183 switch (ir->mode) {
1184 case ir_loop_jump::jump_break:
1185 emit(BRW_OPCODE_BREAK);
1186 break;
1187 case ir_loop_jump::jump_continue:
1188 emit(BRW_OPCODE_CONTINUE);
1189 break;
1190 }
1191 }
1192
1193
1194 void
1195 vec4_visitor::visit(ir_function_signature *ir)
1196 {
1197 assert(0);
1198 (void)ir;
1199 }
1200
1201 void
1202 vec4_visitor::visit(ir_function *ir)
1203 {
1204 /* Ignore function bodies other than main() -- we shouldn't see calls to
1205 * them since they should all be inlined.
1206 */
1207 if (strcmp(ir->name, "main") == 0) {
1208 const ir_function_signature *sig;
1209 exec_list empty;
1210
1211 sig = ir->matching_signature(&empty);
1212
1213 assert(sig);
1214
1215 visit_instructions(&sig->body);
1216 }
1217 }
1218
1219 bool
1220 vec4_visitor::try_emit_sat(ir_expression *ir)
1221 {
1222 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1223 if (!sat_src)
1224 return false;
1225
1226 sat_src->accept(this);
1227 src_reg src = this->result;
1228
1229 this->result = src_reg(this, ir->type);
1230 vec4_instruction *inst;
1231 inst = emit(MOV(dst_reg(this->result), src));
1232 inst->saturate = true;
1233
1234 return true;
1235 }
1236
1237 void
1238 vec4_visitor::emit_bool_comparison(unsigned int op,
1239 dst_reg dst, src_reg src0, src_reg src1)
1240 {
1241 /* original gen4 does destination conversion before comparison. */
1242 if (intel->gen < 5)
1243 dst.type = src0.type;
1244
1245 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1246
1247 dst.type = BRW_REGISTER_TYPE_D;
1248 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1249 }
1250
1251 void
1252 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1253 src_reg src0, src_reg src1)
1254 {
1255 vec4_instruction *inst;
1256
1257 if (intel->gen >= 6) {
1258 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1259 inst->conditional_mod = conditionalmod;
1260 } else {
1261 emit(CMP(dst, src0, src1, conditionalmod));
1262
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265 }
1266 }
1267
1268 void
1269 vec4_visitor::visit(ir_expression *ir)
1270 {
1271 unsigned int operand;
1272 src_reg op[Elements(ir->operands)];
1273 src_reg result_src;
1274 dst_reg result_dst;
1275 vec4_instruction *inst;
1276
1277 if (try_emit_sat(ir))
1278 return;
1279
1280 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1281 this->result.file = BAD_FILE;
1282 ir->operands[operand]->accept(this);
1283 if (this->result.file == BAD_FILE) {
1284 printf("Failed to get tree for expression operand:\n");
1285 ir->operands[operand]->print();
1286 exit(1);
1287 }
1288 op[operand] = this->result;
1289
1290 /* Matrix expression operands should have been broken down to vector
1291 * operations already.
1292 */
1293 assert(!ir->operands[operand]->type->is_matrix());
1294 }
1295
1296 int vector_elements = ir->operands[0]->type->vector_elements;
1297 if (ir->operands[1]) {
1298 vector_elements = MAX2(vector_elements,
1299 ir->operands[1]->type->vector_elements);
1300 }
1301
1302 this->result.file = BAD_FILE;
1303
1304 /* Storage for our result. Ideally for an assignment we'd be using
1305 * the actual storage for the result here, instead.
1306 */
1307 result_src = src_reg(this, ir->type);
1308 /* convenience for the emit functions below. */
1309 result_dst = dst_reg(result_src);
1310 /* If nothing special happens, this is the result. */
1311 this->result = result_src;
1312 /* Limit writes to the channels that will be used by result_src later.
1313 * This does limit this temp's use as a temporary for multi-instruction
1314 * sequences.
1315 */
1316 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1317
1318 switch (ir->operation) {
1319 case ir_unop_logic_not:
1320 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1321 * the one's complement of the whole register, not just bit 0.
1322 */
1323 emit(XOR(result_dst, op[0], src_reg(1)));
1324 break;
1325 case ir_unop_neg:
1326 op[0].negate = !op[0].negate;
1327 this->result = op[0];
1328 break;
1329 case ir_unop_abs:
1330 op[0].abs = true;
1331 op[0].negate = false;
1332 this->result = op[0];
1333 break;
1334
1335 case ir_unop_sign:
1336 emit(MOV(result_dst, src_reg(0.0f)));
1337
1338 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1339 inst = emit(MOV(result_dst, src_reg(1.0f)));
1340 inst->predicate = BRW_PREDICATE_NORMAL;
1341
1342 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1343 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1344 inst->predicate = BRW_PREDICATE_NORMAL;
1345
1346 break;
1347
1348 case ir_unop_rcp:
1349 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_exp2:
1353 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1354 break;
1355 case ir_unop_log2:
1356 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1357 break;
1358 case ir_unop_exp:
1359 case ir_unop_log:
1360 assert(!"not reached: should be handled by ir_explog_to_explog2");
1361 break;
1362 case ir_unop_sin:
1363 case ir_unop_sin_reduced:
1364 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1365 break;
1366 case ir_unop_cos:
1367 case ir_unop_cos_reduced:
1368 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1369 break;
1370
1371 case ir_unop_dFdx:
1372 case ir_unop_dFdy:
1373 assert(!"derivatives not valid in vertex shader");
1374 break;
1375
1376 case ir_unop_noise:
1377 assert(!"not reached: should be handled by lower_noise");
1378 break;
1379
1380 case ir_binop_add:
1381 emit(ADD(result_dst, op[0], op[1]));
1382 break;
1383 case ir_binop_sub:
1384 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1385 break;
1386
1387 case ir_binop_mul:
1388 if (ir->type->is_integer()) {
1389 /* For integer multiplication, the MUL uses the low 16 bits
1390 * of one of the operands (src0 on gen6, src1 on gen7). The
1391 * MACH accumulates in the contribution of the upper 16 bits
1392 * of that operand.
1393 *
1394 * FINISHME: Emit just the MUL if we know an operand is small
1395 * enough.
1396 */
1397 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1398
1399 emit(MUL(acc, op[0], op[1]));
1400 emit(MACH(dst_null_d(), op[0], op[1]));
1401 emit(MOV(result_dst, src_reg(acc)));
1402 } else {
1403 emit(MUL(result_dst, op[0], op[1]));
1404 }
1405 break;
1406 case ir_binop_div:
1407 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1408 assert(ir->type->is_integer());
1409 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1410 break;
1411 case ir_binop_mod:
1412 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1413 assert(ir->type->is_integer());
1414 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1415 break;
1416
1417 case ir_binop_less:
1418 case ir_binop_greater:
1419 case ir_binop_lequal:
1420 case ir_binop_gequal:
1421 case ir_binop_equal:
1422 case ir_binop_nequal: {
1423 emit(CMP(result_dst, op[0], op[1],
1424 brw_conditional_for_comparison(ir->operation)));
1425 emit(AND(result_dst, result_src, src_reg(0x1)));
1426 break;
1427 }
1428
1429 case ir_binop_all_equal:
1430 /* "==" operator producing a scalar boolean. */
1431 if (ir->operands[0]->type->is_vector() ||
1432 ir->operands[1]->type->is_vector()) {
1433 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1434 emit(MOV(result_dst, src_reg(0)));
1435 inst = emit(MOV(result_dst, src_reg(1)));
1436 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1437 } else {
1438 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1439 emit(AND(result_dst, result_src, src_reg(0x1)));
1440 }
1441 break;
1442 case ir_binop_any_nequal:
1443 /* "!=" operator producing a scalar boolean. */
1444 if (ir->operands[0]->type->is_vector() ||
1445 ir->operands[1]->type->is_vector()) {
1446 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1447
1448 emit(MOV(result_dst, src_reg(0)));
1449 inst = emit(MOV(result_dst, src_reg(1)));
1450 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1451 } else {
1452 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1453 emit(AND(result_dst, result_src, src_reg(0x1)));
1454 }
1455 break;
1456
1457 case ir_unop_any:
1458 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1459 emit(MOV(result_dst, src_reg(0)));
1460
1461 inst = emit(MOV(result_dst, src_reg(1)));
1462 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1463 break;
1464
1465 case ir_binop_logic_xor:
1466 emit(XOR(result_dst, op[0], op[1]));
1467 break;
1468
1469 case ir_binop_logic_or:
1470 emit(OR(result_dst, op[0], op[1]));
1471 break;
1472
1473 case ir_binop_logic_and:
1474 emit(AND(result_dst, op[0], op[1]));
1475 break;
1476
1477 case ir_binop_dot:
1478 assert(ir->operands[0]->type->is_vector());
1479 assert(ir->operands[0]->type == ir->operands[1]->type);
1480 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1481 break;
1482
1483 case ir_unop_sqrt:
1484 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1485 break;
1486 case ir_unop_rsq:
1487 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1488 break;
1489
1490 case ir_unop_bitcast_i2f:
1491 case ir_unop_bitcast_u2f:
1492 this->result = op[0];
1493 this->result.type = BRW_REGISTER_TYPE_F;
1494 break;
1495
1496 case ir_unop_bitcast_f2i:
1497 this->result = op[0];
1498 this->result.type = BRW_REGISTER_TYPE_D;
1499 break;
1500
1501 case ir_unop_bitcast_f2u:
1502 this->result = op[0];
1503 this->result.type = BRW_REGISTER_TYPE_UD;
1504 break;
1505
1506 case ir_unop_i2f:
1507 case ir_unop_i2u:
1508 case ir_unop_u2i:
1509 case ir_unop_u2f:
1510 case ir_unop_b2f:
1511 case ir_unop_b2i:
1512 case ir_unop_f2i:
1513 case ir_unop_f2u:
1514 emit(MOV(result_dst, op[0]));
1515 break;
1516 case ir_unop_f2b:
1517 case ir_unop_i2b: {
1518 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1519 emit(AND(result_dst, result_src, src_reg(1)));
1520 break;
1521 }
1522
1523 case ir_unop_trunc:
1524 emit(RNDZ(result_dst, op[0]));
1525 break;
1526 case ir_unop_ceil:
1527 op[0].negate = !op[0].negate;
1528 inst = emit(RNDD(result_dst, op[0]));
1529 this->result.negate = true;
1530 break;
1531 case ir_unop_floor:
1532 inst = emit(RNDD(result_dst, op[0]));
1533 break;
1534 case ir_unop_fract:
1535 inst = emit(FRC(result_dst, op[0]));
1536 break;
1537 case ir_unop_round_even:
1538 emit(RNDE(result_dst, op[0]));
1539 break;
1540
1541 case ir_binop_min:
1542 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1543 break;
1544 case ir_binop_max:
1545 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1546 break;
1547
1548 case ir_binop_pow:
1549 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1550 break;
1551
1552 case ir_unop_bit_not:
1553 inst = emit(NOT(result_dst, op[0]));
1554 break;
1555 case ir_binop_bit_and:
1556 inst = emit(AND(result_dst, op[0], op[1]));
1557 break;
1558 case ir_binop_bit_xor:
1559 inst = emit(XOR(result_dst, op[0], op[1]));
1560 break;
1561 case ir_binop_bit_or:
1562 inst = emit(OR(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_lshift:
1566 inst = emit(SHL(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_rshift:
1570 if (ir->type->base_type == GLSL_TYPE_INT)
1571 inst = emit(ASR(result_dst, op[0], op[1]));
1572 else
1573 inst = emit(SHR(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_ubo_load: {
1577 ir_constant *uniform_block = ir->operands[0]->as_constant();
1578 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1579 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1580 src_reg offset = op[1];
1581
1582 /* Now, load the vector from that offset. */
1583 assert(ir->type->is_vector() || ir->type->is_scalar());
1584
1585 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1586 packed_consts.type = result.type;
1587 src_reg surf_index =
1588 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1589 if (const_offset_ir) {
1590 offset = src_reg(const_offset / 16);
1591 } else {
1592 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1593 }
1594
1595 vec4_instruction *pull =
1596 emit(new(mem_ctx) vec4_instruction(this,
1597 VS_OPCODE_PULL_CONSTANT_LOAD,
1598 dst_reg(packed_consts),
1599 surf_index,
1600 offset));
1601 pull->base_mrf = 14;
1602 pull->mlen = 1;
1603
1604 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1605 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1606 const_offset % 16 / 4,
1607 const_offset % 16 / 4,
1608 const_offset % 16 / 4);
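/* As an illustration of that arithmetic: a scalar float UBO member at byte
 * offset 24 pulls the vec4 at byte offset 16, and const_offset % 16 / 4 = 2
 * selects channel z, so the swizzle above becomes .zzzz on top of the
 * size-based swizzle.
 */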
1609
1610 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1611 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1612 emit(CMP(result_dst, packed_consts, src_reg(0u),
1613 BRW_CONDITIONAL_NZ));
1614 emit(AND(result_dst, result, src_reg(0x1)));
1615 } else {
1616 emit(MOV(result_dst, packed_consts));
1617 }
1618 break;
1619 }
1620
1621 case ir_triop_lrp:
1622 assert(!"not reached: should be handled by lrp_to_arith");
1623 break;
1624
1625 case ir_quadop_vector:
1626 assert(!"not reached: should be handled by lower_quadop_vector");
1627 break;
1628
1629 case ir_unop_pack_half_2x16:
1630 emit_pack_half_2x16(result_dst, op[0]);
1631 break;
1632 case ir_unop_unpack_half_2x16:
1633 emit_unpack_half_2x16(result_dst, op[0]);
1634 break;
1635 case ir_unop_pack_snorm_2x16:
1636 case ir_unop_pack_snorm_4x8:
1637 case ir_unop_pack_unorm_2x16:
1638 case ir_unop_pack_unorm_4x8:
1639 case ir_unop_unpack_snorm_2x16:
1640 case ir_unop_unpack_snorm_4x8:
1641 case ir_unop_unpack_unorm_2x16:
1642 case ir_unop_unpack_unorm_4x8:
1643 assert(!"not reached: should be handled by lower_packing_builtins");
1644 break;
1645 case ir_unop_unpack_half_2x16_split_x:
1646 case ir_unop_unpack_half_2x16_split_y:
1647 case ir_binop_pack_half_2x16_split:
1648 assert(!"not reached: should not occur in vertex shader");
1649 break;
1650 }
1651 }
1652
1653
1654 void
1655 vec4_visitor::visit(ir_swizzle *ir)
1656 {
1657 src_reg src;
1658 int i = 0;
1659 int swizzle[4];
1660
1661 /* Note that this is only swizzles in expressions, not those on the left
1662 * hand side of an assignment, which do write masking. See ir_assignment
1663 * for that.
1664 */
1665
1666 ir->val->accept(this);
1667 src = this->result;
1668 assert(src.file != BAD_FILE);
1669
1670 for (i = 0; i < ir->type->vector_elements; i++) {
1671 switch (i) {
1672 case 0:
1673 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1674 break;
1675 case 1:
1676 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1677 break;
1678 case 2:
1679 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1680 break;
1681 case 3:
1682 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1683 break;
1684 }
1685 }
1686 for (; i < 4; i++) {
1687 /* Replicate the last channel out. */
1688 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1689 }
1690
1691 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1692
1693 this->result = src;
1694 }
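/* Example of the composition above: if the value being swizzled already
 * carries src.swizzle = .yzww and the ir_swizzle asks for .xz, the loop picks
 * channels y and w, and the trailing replication turns the result into
 * .ywww.
 */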
1695
1696 void
1697 vec4_visitor::visit(ir_dereference_variable *ir)
1698 {
1699 const struct glsl_type *type = ir->type;
1700 dst_reg *reg = variable_storage(ir->var);
1701
1702 if (!reg) {
1703 fail("Failed to find variable storage for %s\n", ir->var->name);
1704 this->result = src_reg(brw_null_reg());
1705 return;
1706 }
1707
1708 this->result = src_reg(*reg);
1709
1710 /* System values get their swizzle from the dst_reg writemask */
1711 if (ir->var->mode == ir_var_system_value)
1712 return;
1713
1714 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1715 this->result.swizzle = swizzle_for_size(type->vector_elements);
1716 }
1717
1718
1719 int
1720 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1721 {
1722 /* Under normal circumstances array elements are stored consecutively, so
1723 * the stride is equal to the size of the array element.
1724 */
1725 return type_size(ir->type);
1726 }
1727
1728
1729 void
1730 vec4_visitor::visit(ir_dereference_array *ir)
1731 {
1732 ir_constant *constant_index;
1733 src_reg src;
1734 int array_stride = compute_array_stride(ir);
1735
1736 constant_index = ir->array_index->constant_expression_value();
1737
1738 ir->array->accept(this);
1739 src = this->result;
1740
1741 if (constant_index) {
1742 src.reg_offset += constant_index->value.i[0] * array_stride;
1743 } else {
1744 /* Variable index array dereference. It eats the "vec4" of the
1745 * base of the array and an index that offsets the Mesa register
1746 * index.
1747 */
1748 ir->array_index->accept(this);
1749
1750 src_reg index_reg;
1751
1752 if (array_stride == 1) {
1753 index_reg = this->result;
1754 } else {
1755 index_reg = src_reg(this, glsl_type::int_type);
1756
1757 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1758 }
1759
1760 if (src.reladdr) {
1761 src_reg temp = src_reg(this, glsl_type::int_type);
1762
1763 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1764
1765 index_reg = temp;
1766 }
1767
1768 src.reladdr = ralloc(mem_ctx, src_reg);
1769 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1770 }
1771
1772 /* If the type is smaller than a vec4, replicate the last channel out. */
1773 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1774 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1775 else
1776 src.swizzle = BRW_SWIZZLE_NOOP;
1777 src.type = brw_type_for_base_type(ir->type);
1778
1779 this->result = src;
1780 }
1781
1782 void
1783 vec4_visitor::visit(ir_dereference_record *ir)
1784 {
1785 unsigned int i;
1786 const glsl_type *struct_type = ir->record->type;
1787 int offset = 0;
1788
1789 ir->record->accept(this);
1790
1791 for (i = 0; i < struct_type->length; i++) {
1792 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1793 break;
1794 offset += type_size(struct_type->fields.structure[i].type);
1795 }
1796
1797 /* If the type is smaller than a vec4, replicate the last channel out. */
1798 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1799 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1800 else
1801 this->result.swizzle = BRW_SWIZZLE_NOOP;
1802 this->result.type = brw_type_for_base_type(ir->type);
1803
1804 this->result.reg_offset += offset;
1805 }
1806
1807 /**
1808 * We want to be careful in assignment setup to hit the actual storage
1809 * instead of potentially using a temporary like we might with the
1810 * ir_dereference handler.
1811 */
1812 static dst_reg
1813 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1814 {
1815 /* The LHS must be a dereference. If the LHS is a variable indexed array
1816 * access of a vector, it must be separated into a series of conditional moves
1817 * before reaching this point (see ir_vec_index_to_cond_assign).
1818 */
1819 assert(ir->as_dereference());
1820 ir_dereference_array *deref_array = ir->as_dereference_array();
1821 if (deref_array) {
1822 assert(!deref_array->array->type->is_vector());
1823 }
1824
1825 /* Use the rvalue deref handler for the most part. We'll ignore
1826 * swizzles in it and write swizzles using writemask, though.
1827 */
1828 ir->accept(v);
1829 return dst_reg(v->result);
1830 }
1831
1832 void
1833 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1834 const struct glsl_type *type, uint32_t predicate)
1835 {
1836 if (type->base_type == GLSL_TYPE_STRUCT) {
1837 for (unsigned int i = 0; i < type->length; i++) {
1838 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1839 }
1840 return;
1841 }
1842
1843 if (type->is_array()) {
1844 for (unsigned int i = 0; i < type->length; i++) {
1845 emit_block_move(dst, src, type->fields.array, predicate);
1846 }
1847 return;
1848 }
1849
1850 if (type->is_matrix()) {
1851 const struct glsl_type *vec_type;
1852
1853 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1854 type->vector_elements, 1);
1855
1856 for (int i = 0; i < type->matrix_columns; i++) {
1857 emit_block_move(dst, src, vec_type, predicate);
1858 }
1859 return;
1860 }
1861
1862 assert(type->is_scalar() || type->is_vector());
1863
1864 dst->type = brw_type_for_base_type(type);
1865 src->type = dst->type;
1866
1867 dst->writemask = (1 << type->vector_elements) - 1;
1868
1869 src->swizzle = swizzle_for_size(type->vector_elements);
1870
1871 vec4_instruction *inst = emit(MOV(*dst, *src));
1872 inst->predicate = predicate;
1873
1874 dst->reg_offset++;
1875 src->reg_offset++;
1876 }
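/* As an example of the recursion above: moving a mat3 emits three MOVs, one
 * per vec3 column, each advancing dst and src by one register; a struct
 * holding a vec2 and a float[2] likewise becomes three MOVs with writemasks
 * .xy, .x and .x.
 */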
1877
1878
1879 /* If the RHS processing resulted in an instruction generating a
1880 * temporary value, and it would be easy to rewrite the instruction to
1881 * generate its result right into the LHS instead, do so. This ends
1882 * up reliably removing instructions where it can be tricky to do so
1883 * later without real UD chain information.
1884 */
1885 bool
1886 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1887 dst_reg dst,
1888 src_reg src,
1889 vec4_instruction *pre_rhs_inst,
1890 vec4_instruction *last_rhs_inst)
1891 {
1892 /* This could be supported, but it would take more smarts. */
1893 if (ir->condition)
1894 return false;
1895
1896 if (pre_rhs_inst == last_rhs_inst)
1897 return false; /* No instructions generated to work with. */
1898
1899 /* Make sure the last instruction generated our source reg. */
1900 if (src.file != GRF ||
1901 src.file != last_rhs_inst->dst.file ||
1902 src.reg != last_rhs_inst->dst.reg ||
1903 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1904 src.reladdr ||
1905 src.abs ||
1906 src.negate ||
1907 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1908 return false;
1909
1910 /* Check that the last instruction fully initialized the channels
1911 * we want to use, in the order we want to use them. We could
1912 * potentially reswizzle the operands of many instructions so that
1913 * we could handle out of order channels, but don't yet.
1914 */
1915
1916 for (unsigned i = 0; i < 4; i++) {
1917 if (dst.writemask & (1 << i)) {
1918 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1919 return false;
1920
1921 if (BRW_GET_SWZ(src.swizzle, i) != i)
1922 return false;
1923 }
1924 }
1925
1926 /* Success! Rewrite the instruction. */
1927 last_rhs_inst->dst.file = dst.file;
1928 last_rhs_inst->dst.reg = dst.reg;
1929 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1930 last_rhs_inst->dst.reladdr = dst.reladdr;
1931 last_rhs_inst->dst.writemask &= dst.writemask;
1932
1933 return true;
1934 }
1935
1936 void
1937 vec4_visitor::visit(ir_assignment *ir)
1938 {
1939 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1940 uint32_t predicate = BRW_PREDICATE_NONE;
1941
1942 if (!ir->lhs->type->is_scalar() &&
1943 !ir->lhs->type->is_vector()) {
1944 ir->rhs->accept(this);
1945 src_reg src = this->result;
1946
1947 if (ir->condition) {
1948 emit_bool_to_cond_code(ir->condition, &predicate);
1949 }
1950
1951 /* emit_block_move doesn't account for swizzles in the source register.
1952 * This should be ok, since the source register is a structure, array,
1953 * or matrix, and those can't be arbitrarily swizzled. But double-check to be sure.
1954 */
1955 assert(src.swizzle ==
1956 (ir->rhs->type->is_matrix()
1957 ? swizzle_for_size(ir->rhs->type->vector_elements)
1958 : BRW_SWIZZLE_NOOP));
1959
1960 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1961 return;
1962 }
1963
1964 /* Now we're down to just a scalar/vector with writemasks. */
1965 int i;
1966
1967 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1968 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1969
1970 ir->rhs->accept(this);
1971
1972 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1973
1974 src_reg src = this->result;
1975
1976 int swizzles[4];
1977 int first_enabled_chan = 0;
1978 int src_chan = 0;
1979
1980 assert(ir->lhs->type->is_vector() ||
1981 ir->lhs->type->is_scalar());
1982 dst.writemask = ir->write_mask;
1983
1984 for (int i = 0; i < 4; i++) {
1985 if (dst.writemask & (1 << i)) {
1986 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1987 break;
1988 }
1989 }
1990
1991 /* Swizzle a small RHS vector into the channels being written.
1992 *
1993 * GLSL IR treats write_mask as dictating how many channels are
1994 * present on the RHS, while in our instructions we need to make
1995 * those channels appear in the slots of the vec4 they're written to.
1996 */
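/* For example, with "v.zw = u.xy" the write_mask covers Z and W while the
 * RHS value arrives in channels X and Y, so the swizzle built below ends
 * up roughly as (Y, Y, X, Y): channel Z reads the RHS X component, channel
 * W reads the RHS Y component, and the disabled channels simply repeat an
 * enabled one.
 */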
1997 for (int i = 0; i < 4; i++) {
1998 if (dst.writemask & (1 << i))
1999 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2000 else
2001 swizzles[i] = first_enabled_chan;
2002 }
2003 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2004 swizzles[2], swizzles[3]);
2005
2006 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2007 return;
2008 }
2009
2010 if (ir->condition) {
2011 emit_bool_to_cond_code(ir->condition, &predicate);
2012 }
2013
2014 for (i = 0; i < type_size(ir->lhs->type); i++) {
2015 vec4_instruction *inst = emit(MOV(dst, src));
2016 inst->predicate = predicate;
2017
2018 dst.reg_offset++;
2019 src.reg_offset++;
2020 }
2021 }
2022
2023 void
2024 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2025 {
2026 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2027 foreach_list(node, &ir->components) {
2028 ir_constant *field_value = (ir_constant *)node;
2029
2030 emit_constant_values(dst, field_value);
2031 }
2032 return;
2033 }
2034
2035 if (ir->type->is_array()) {
2036 for (unsigned int i = 0; i < ir->type->length; i++) {
2037 emit_constant_values(dst, ir->array_elements[i]);
2038 }
2039 return;
2040 }
2041
2042 if (ir->type->is_matrix()) {
2043 for (int i = 0; i < ir->type->matrix_columns; i++) {
2044 float *vec = &ir->value.f[i * ir->type->vector_elements];
2045
2046 for (int j = 0; j < ir->type->vector_elements; j++) {
2047 dst->writemask = 1 << j;
2048 dst->type = BRW_REGISTER_TYPE_F;
2049
2050 emit(MOV(*dst, src_reg(vec[j])));
2051 }
2052 dst->reg_offset++;
2053 }
2054 return;
2055 }
2056
2057 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2058
2059 for (int i = 0; i < ir->type->vector_elements; i++) {
2060 if (!(remaining_writemask & (1 << i)))
2061 continue;
2062
2063 dst->writemask = 1 << i;
2064 dst->type = brw_type_for_base_type(ir->type);
2065
2066 /* Find other components that match the one we're about to
2067 * write. Emits fewer instructions for things like vec4(0.5,
2068 * 1.5, 1.5, 1.5).
2069 */
2070 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2071 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2072 if (ir->value.b[i] == ir->value.b[j])
2073 dst->writemask |= (1 << j);
2074 } else {
2075 /* u, i, and f storage all line up, so no need for a
2076 * switch case for comparing each type.
2077 */
2078 if (ir->value.u[i] == ir->value.u[j])
2079 dst->writemask |= (1 << j);
2080 }
2081 }
2082
2083 switch (ir->type->base_type) {
2084 case GLSL_TYPE_FLOAT:
2085 emit(MOV(*dst, src_reg(ir->value.f[i])));
2086 break;
2087 case GLSL_TYPE_INT:
2088 emit(MOV(*dst, src_reg(ir->value.i[i])));
2089 break;
2090 case GLSL_TYPE_UINT:
2091 emit(MOV(*dst, src_reg(ir->value.u[i])));
2092 break;
2093 case GLSL_TYPE_BOOL:
2094 emit(MOV(*dst, src_reg(ir->value.b[i])));
2095 break;
2096 default:
2097 assert(!"Non-float/uint/int/bool constant");
2098 break;
2099 }
2100
2101 remaining_writemask &= ~dst->writemask;
2102 }
2103 dst->reg_offset++;
2104 }
2105
2106 void
2107 vec4_visitor::visit(ir_constant *ir)
2108 {
2109 dst_reg dst = dst_reg(this, ir->type);
2110 this->result = src_reg(dst);
2111
2112 emit_constant_values(&dst, ir);
2113 }
2114
2115 void
2116 vec4_visitor::visit(ir_call *ir)
2117 {
2118 assert(!"not reached");
2119 }
2120
2121 void
2122 vec4_visitor::visit(ir_texture *ir)
2123 {
2124 int sampler =
2125 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2126
2127 /* Should be lowered by do_lower_texture_projection */
2128 assert(!ir->projector);
2129
2130 /* Generate code to compute all the subexpression trees. This has to be
2131 * done before loading any values into MRFs for the sampler message since
2132 * generating these values may involve SEND messages that need the MRFs.
2133 */
2134 src_reg coordinate;
2135 if (ir->coordinate) {
2136 ir->coordinate->accept(this);
2137 coordinate = this->result;
2138 }
2139
2140 src_reg shadow_comparitor;
2141 if (ir->shadow_comparitor) {
2142 ir->shadow_comparitor->accept(this);
2143 shadow_comparitor = this->result;
2144 }
2145
2146 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2147 src_reg lod, dPdx, dPdy, sample_index;
2148 switch (ir->op) {
2149 case ir_tex:
2150 lod = src_reg(0.0f);
2151 lod_type = glsl_type::float_type;
2152 break;
2153 case ir_txf:
2154 case ir_txl:
2155 case ir_txs:
2156 ir->lod_info.lod->accept(this);
2157 lod = this->result;
2158 lod_type = ir->lod_info.lod->type;
2159 break;
2160 case ir_txf_ms:
2161 ir->lod_info.sample_index->accept(this);
2162 sample_index = this->result;
2163 sample_index_type = ir->lod_info.sample_index->type;
2164 break;
2165 case ir_txd:
2166 ir->lod_info.grad.dPdx->accept(this);
2167 dPdx = this->result;
2168
2169 ir->lod_info.grad.dPdy->accept(this);
2170 dPdy = this->result;
2171
2172 lod_type = ir->lod_info.grad.dPdx->type;
2173 break;
2174 case ir_txb:
2175 case ir_lod:
2176 break;
2177 }
2178
2179 vec4_instruction *inst = NULL;
2180 switch (ir->op) {
2181 case ir_tex:
2182 case ir_txl:
2183 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2184 break;
2185 case ir_txd:
2186 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2187 break;
2188 case ir_txf:
2189 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2190 break;
2191 case ir_txf_ms:
2192 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2193 break;
2194 case ir_txs:
2195 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2196 break;
2197 case ir_txb:
2198 assert(!"TXB is not valid for vertex shaders.");
2199 break;
2200 case ir_lod:
2201 assert(!"LOD is not valid for vertex shaders.");
2202 break;
2203 }
2204
2205 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2206
2207 /* Texel offsets go in the message header; Gen4 also requires headers. */
2208 inst->header_present = use_texture_offset || intel->gen < 5;
2209 inst->base_mrf = 2;
2210 inst->mlen = inst->header_present + 1; /* always at least one */
2211 inst->sampler = sampler;
2212 inst->dst = dst_reg(this, ir->type);
2213 inst->dst.writemask = WRITEMASK_XYZW;
2214 inst->shadow_compare = ir->shadow_comparitor != NULL;
2215
2216 if (use_texture_offset)
2217 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2218
2219 /* MRF for the first parameter */
2220 int param_base = inst->base_mrf + inst->header_present;
2221
2222 if (ir->op == ir_txs) {
2223 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2224 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2225 } else {
2226 int i, coord_mask = 0, zero_mask = 0;
2227 /* Load the coordinate */
2228 /* FINISHME: gl_clamp_mask and saturate */
2229 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2230 coord_mask |= (1 << i);
2231 for (; i < 4; i++)
2232 zero_mask |= (1 << i);
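/* For a vec2 coordinate, for instance, coord_mask ends up as XY and
 * zero_mask as ZW; the unused channels are written with 0 below.
 */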
2233
2234 if (ir->offset && ir->op == ir_txf) {
2235 /* It appears that the ld instruction used for txf does its
2236 * address bounds check before adding in the offset. To work
2237 * around this, just add the integer offset to the integer
2238 * texel coordinate, and don't put the offset in the header.
2239 */
2240 ir_constant *offset = ir->offset->as_constant();
2241 assert(offset);
2242
2243 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2244 src_reg src = coordinate;
2245 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2246 BRW_GET_SWZ(src.swizzle, j),
2247 BRW_GET_SWZ(src.swizzle, j),
2248 BRW_GET_SWZ(src.swizzle, j));
2249 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2250 src, offset->value.i[j]));
2251 }
2252 } else {
2253 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2254 coordinate));
2255 }
2256 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2257 src_reg(0)));
2258 /* Load the shadow comparator */
2259 if (ir->shadow_comparitor) {
2260 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2261 WRITEMASK_X),
2262 shadow_comparitor));
2263 inst->mlen++;
2264 }
2265
2266 /* Load the LOD info */
2267 if (ir->op == ir_tex || ir->op == ir_txl) {
2268 int mrf, writemask;
2269 if (intel->gen >= 5) {
2270 mrf = param_base + 1;
2271 if (ir->shadow_comparitor) {
2272 writemask = WRITEMASK_Y;
2273 /* mlen already incremented */
2274 } else {
2275 writemask = WRITEMASK_X;
2276 inst->mlen++;
2277 }
2278 } else /* intel->gen == 4 */ {
2279 mrf = param_base;
2280 writemask = WRITEMASK_Z;
2281 }
2282 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2283 } else if (ir->op == ir_txf) {
2284 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2285 } else if (ir->op == ir_txf_ms) {
2286 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2287 sample_index));
2288 inst->mlen++;
2289
2290 /* On Gen7, there is an additional MCS parameter here after the sample
2291 * index (SI), but we don't bother to emit it since it's always zero. If
2292 * we start supporting texturing from CMS surfaces, this will have
2293 * to change.
2294 */
2295 } else if (ir->op == ir_txd) {
2296 const glsl_type *type = lod_type;
2297
2298 if (intel->gen >= 5) {
2299 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2300 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2301 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2302 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2303 inst->mlen++;
2304
2305 if (ir->type->vector_elements == 3) {
2306 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2307 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2308 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2309 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2310 inst->mlen++;
2311 }
2312 } else /* intel->gen == 4 */ {
2313 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2314 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2315 inst->mlen += 2;
2316 }
2317 }
2318 }
2319
2320 emit(inst);
2321
2322 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2323 * faces * layers, but the spec requires just layers.
2324 */
2325 if (ir->op == ir_txs) {
2326 glsl_type const *type = ir->sampler->type;
2327 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2328 type->sampler_array) {
2329 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2330 with_writemask(inst->dst, WRITEMASK_Z),
2331 src_reg(inst->dst), src_reg(6));
2332 }
2333 }
2334
2335 swizzle_result(ir, src_reg(inst->dst), sampler);
2336 }
2337
2338 void
2339 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2340 {
2341 int s = key->tex.swizzles[sampler];
2342
2343 this->result = src_reg(this, ir->type);
2344 dst_reg swizzled_result(this->result);
2345
2346 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2347 || s == SWIZZLE_NOOP) {
2348 emit(MOV(swizzled_result, orig_val));
2349 return;
2350 }
2351
2352 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2353 int swizzle[4];
2354
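/* Split the application-requested swizzle into channels copied from the
 * texture result, channels forced to 0.0, and channels forced to 1.0.
 * For example, a swizzle of (R, R, R, ONE) -- as might be used to emulate
 * a LUMINANCE texture -- copies X into XYZ and writes 1.0 into W.
 */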
2355 for (int i = 0; i < 4; i++) {
2356 switch (GET_SWZ(s, i)) {
2357 case SWIZZLE_ZERO:
2358 zero_mask |= (1 << i);
2359 break;
2360 case SWIZZLE_ONE:
2361 one_mask |= (1 << i);
2362 break;
2363 default:
2364 copy_mask |= (1 << i);
2365 swizzle[i] = GET_SWZ(s, i);
2366 break;
2367 }
2368 }
2369
2370 if (copy_mask) {
2371 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2372 swizzled_result.writemask = copy_mask;
2373 emit(MOV(swizzled_result, orig_val));
2374 }
2375
2376 if (zero_mask) {
2377 swizzled_result.writemask = zero_mask;
2378 emit(MOV(swizzled_result, src_reg(0.0f)));
2379 }
2380
2381 if (one_mask) {
2382 swizzled_result.writemask = one_mask;
2383 emit(MOV(swizzled_result, src_reg(1.0f)));
2384 }
2385 }
2386
2387 void
2388 vec4_visitor::visit(ir_return *ir)
2389 {
2390 assert(!"not reached");
2391 }
2392
2393 void
2394 vec4_visitor::visit(ir_discard *ir)
2395 {
2396 assert(!"not reached");
2397 }
2398
2399 void
2400 vec4_visitor::visit(ir_if *ir)
2401 {
2402 /* Don't point the annotation at the if statement, because then the whole
2403 * statement, including the then and else blocks, gets printed.
2404 */
2405 this->base_ir = ir->condition;
2406
2407 if (intel->gen == 6) {
2408 emit_if_gen6(ir);
2409 } else {
2410 uint32_t predicate;
2411 emit_bool_to_cond_code(ir->condition, &predicate);
2412 emit(IF(predicate));
2413 }
2414
2415 visit_instructions(&ir->then_instructions);
2416
2417 if (!ir->else_instructions.is_empty()) {
2418 this->base_ir = ir->condition;
2419 emit(BRW_OPCODE_ELSE);
2420
2421 visit_instructions(&ir->else_instructions);
2422 }
2423
2424 this->base_ir = ir->condition;
2425 emit(BRW_OPCODE_ENDIF);
2426 }
2427
2428 void
2429 vec4_visitor::emit_ndc_computation()
2430 {
2431 /* Get the position */
2432 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2433
2434 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2435 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2436 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2437
2438 current_annotation = "NDC";
2439 dst_reg ndc_w = ndc;
2440 ndc_w.writemask = WRITEMASK_W;
2441 src_reg pos_w = pos;
2442 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2443 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2444
2445 dst_reg ndc_xyz = ndc;
2446 ndc_xyz.writemask = WRITEMASK_XYZ;
2447
2448 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2449 }
2450
2451 void
2452 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2453 {
2454 if (intel->gen < 6 &&
2455 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2456 key->userclip_active || brw->has_negative_rhw_bug)) {
2457 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2458 dst_reg header1_w = header1;
2459 header1_w.writemask = WRITEMASK_W;
2460 GLuint i;
2461
2462 emit(MOV(header1, 0u));
2463
2464 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2465 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2466
2467 current_annotation = "Point size";
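/* Scale the point size by 2^11 and keep bits 8..18 of the result, which
 * presumably matches the fixed-point point-width field of the header
 * dword on these generations.
 */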
2468 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2469 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2470 }
2471
2472 current_annotation = "Clipping flags";
2473 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2474 vec4_instruction *inst;
2475
2476 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2477 src_reg(this->userplane[i])));
2478 inst->conditional_mod = BRW_CONDITIONAL_L;
2479
2480 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2481 inst->predicate = BRW_PREDICATE_NORMAL;
2482 }
2483
2484 /* i965 clipping workaround:
2485 * 1) Test for a negative RHW coordinate.
2486 * 2) If the test passes,
2487 * set ndc = (0,0,0,0)
2488 * set ucp[6] = 1
2489 *
2490 * Later, clipping will detect ucp[6] and ensure the primitive is
2491 * clipped against all fixed planes.
2492 */
2493 if (brw->has_negative_rhw_bug) {
2494 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2495 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2496 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2497 vec4_instruction *inst;
2498 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2499 inst->predicate = BRW_PREDICATE_NORMAL;
2500 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2501 inst->predicate = BRW_PREDICATE_NORMAL;
2502 }
2503
2504 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2505 } else if (intel->gen < 6) {
2506 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2507 } else {
2508 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2509 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2510 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2511 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2512 }
2513 }
2514 }
2515
2516 void
2517 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2518 {
2519 if (intel->gen < 6) {
2520 /* Clip distance slots are set aside in gen5, but they are not used. It
2521 * is not clear whether we actually need to set aside space for them,
2522 * but the performance cost is negligible.
2523 */
2524 return;
2525 }
2526
2527 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2528 *
2529 * "If a linked set of shaders forming the vertex stage contains no
2530 * static write to gl_ClipVertex or gl_ClipDistance, but the
2531 * application has requested clipping against user clip planes through
2532 * the API, then the coordinate written to gl_Position is used for
2533 * comparison against the user clip planes."
2534 *
2535 * This function is only called if the shader didn't write to
2536 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2537 * if the user wrote to it; otherwise we use gl_Position.
2538 */
2539 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2540 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2541 clip_vertex = VARYING_SLOT_POS;
2542 }
2543
2544 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2545 ++i) {
2546 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2547 src_reg(output_reg[clip_vertex]),
2548 src_reg(this->userplane[i + offset])));
2549 }
2550 }
2551
2552 void
2553 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2554 {
2555 assert(varying < VARYING_SLOT_MAX);
2556 reg.type = output_reg[varying].type;
2557 current_annotation = output_reg_annotation[varying];
2558 /* Copy the register, saturating if necessary */
2559 vec4_instruction *inst = emit(MOV(reg,
2560 src_reg(output_reg[varying])));
2561 if ((varying == VARYING_SLOT_COL0 ||
2562 varying == VARYING_SLOT_COL1 ||
2563 varying == VARYING_SLOT_BFC0 ||
2564 varying == VARYING_SLOT_BFC1) &&
2565 key->clamp_vertex_color) {
2566 inst->saturate = true;
2567 }
2568 }
2569
2570 void
2571 vec4_visitor::emit_urb_slot(int mrf, int varying)
2572 {
2573 struct brw_reg hw_reg = brw_message_reg(mrf);
2574 dst_reg reg = dst_reg(MRF, mrf);
2575 reg.type = BRW_REGISTER_TYPE_F;
2576
2577 switch (varying) {
2578 case VARYING_SLOT_PSIZ:
2579 /* PSIZ is always in slot 0, and is coupled with other flags. */
2580 current_annotation = "indices, point width, clip flags";
2581 emit_psiz_and_flags(hw_reg);
2582 break;
2583 case BRW_VARYING_SLOT_NDC:
2584 current_annotation = "NDC";
2585 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2586 break;
2587 case BRW_VARYING_SLOT_POS_DUPLICATE:
2588 case VARYING_SLOT_POS:
2589 current_annotation = "gl_Position";
2590 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2591 break;
2592 case VARYING_SLOT_CLIP_DIST0:
2593 case VARYING_SLOT_CLIP_DIST1:
2594 if (this->key->uses_clip_distance) {
2595 emit_generic_urb_slot(reg, varying);
2596 } else {
2597 current_annotation = "user clip distances";
2598 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2599 }
2600 break;
2601 case VARYING_SLOT_EDGE:
2602 /* This is present when doing unfilled polygons. We're supposed to copy
2603 * the edge flag from the user-provided vertex array
2604 * (glEdgeFlagPointer); otherwise we copy the current value of that
2605 * attribute (which starts as 1.0f). This is then used in clipping to
2606 * determine which edges should be drawn as wireframe.
2607 */
2608 current_annotation = "edge flag";
2609 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2610 glsl_type::float_type, WRITEMASK_XYZW))));
2611 break;
2612 case BRW_VARYING_SLOT_PAD:
2613 /* No need to write to this slot */
2614 break;
2615 default:
2616 emit_generic_urb_slot(reg, varying);
2617 break;
2618 }
2619 }
2620
2621 static int
2622 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2623 {
2624 struct intel_context *intel = &brw->intel;
2625
2626 if (intel->gen >= 6) {
2627 /* URB data written (does not include the message header reg) must
2628 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2629 * section 5.4.3.2.2: URB_INTERLEAVED.
2630 *
2631 * URB entries are allocated on a multiple of 1024 bits, so an
2632 * extra 128 bits written here to make the end align to 256 is
2633 * no problem.
2634 */
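/* Note that mlen includes the header register, so the data length is
 * mlen - 1; rounding mlen up to the next odd value keeps that data length
 * even. For example, an mlen of 6 (header plus 5 data regs) gets bumped
 * to 7 so that 6 data registers are written.
 */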
2635 if ((mlen % 2) != 1)
2636 mlen++;
2637 }
2638
2639 return mlen;
2640 }
2641
2642 void
2643 vec4_vs_visitor::emit_urb_write_header(int mrf)
2644 {
2645 /* No need to do anything for VS; an implied write to this MRF will be
2646 * performed by VS_OPCODE_URB_WRITE.
2647 */
2648 (void) mrf;
2649 }
2650
2651 vec4_instruction *
2652 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2653 {
2654 /* For VS, the URB writes end the thread. */
2655 if (complete) {
2656 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2657 emit_shader_time_end();
2658 }
2659
2660 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2661 inst->eot = complete;
2662
2663 return inst;
2664 }
2665
2666 /**
2667 * Generates the VUE payload plus the necessary URB write instructions to
2668 * output it.
2669 *
2670 * The VUE layout is documented in Volume 2a.
2671 */
2672 void
2673 vec4_visitor::emit_vertex()
2674 {
2675 /* MRF 0 is reserved for the debugger, so start with message header
2676 * in MRF 1.
2677 */
2678 int base_mrf = 1;
2679 int mrf = base_mrf;
2680 /* In the process of generating our URB write message contents, we
2681 * may need to unspill a register or load from an array. Those
2682 * reads would use MRFs 14-15.
2683 */
2684 int max_usable_mrf = 13;
2685
2686 /* The following assertion verifies that max_usable_mrf causes an
2687 * even-numbered amount of URB write data, which will meet gen6's
2688 * requirements for length alignment.
2689 */
2690 assert((max_usable_mrf - base_mrf) % 2 == 0);
2691
2692 /* First mrf is the g0-based message header containing URB handles and
2693 * such.
2694 */
2695 emit_urb_write_header(mrf++);
2696
2697 if (intel->gen < 6) {
2698 emit_ndc_computation();
2699 }
2700
2701 /* Set up the VUE data for the first URB write */
2702 int slot;
2703 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2704 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2705
2706 /* If this was max_usable_mrf, we can't fit anything more into this URB
2707 * WRITE.
2708 */
2709 if (mrf > max_usable_mrf) {
2710 slot++;
2711 break;
2712 }
2713 }
2714
2715 bool complete = slot >= prog_data->vue_map.num_slots;
2716 current_annotation = "URB write";
2717 vec4_instruction *inst = emit_urb_write_opcode(complete);
2718 inst->base_mrf = base_mrf;
2719 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2720
2721 /* Optional second URB write */
2722 if (!complete) {
2723 mrf = base_mrf + 1;
2724
2725 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2726 assert(mrf < max_usable_mrf);
2727
2728 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2729 }
2730
2731 current_annotation = "URB write";
2732 inst = emit_urb_write_opcode(true /* complete */);
2733 inst->base_mrf = base_mrf;
2734 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2735 /* URB destination offset. The previous write used MRFs 1-13, which
2736 * minus the one header MRF leaves 12 data regs. URB offset is in
2737 * URB row increments, and each of our MRFs is half of one of
2738 * those, since we're doing interleaved writes.
2739 */
2740 inst->offset = (max_usable_mrf - base_mrf) / 2;
2741 }
2742 }
2743
2744 void
2745 vec4_vs_visitor::emit_thread_end()
2746 {
2747 /* For VS, we always end the thread by emitting a single vertex.
2748 * emit_urb_write_opcode() will take care of setting the eot flag on the
2749 * SEND instruction.
2750 */
2751 emit_vertex();
2752 }
2753
2754 src_reg
2755 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2756 src_reg *reladdr, int reg_offset)
2757 {
2758 /* Because we store the values to scratch interleaved like our
2759 * vertex data, we need to scale the vec4 index by 2.
2760 */
2761 int message_header_scale = 2;
2762
2763 /* Pre-gen6, the message header uses byte offsets instead of vec4
2764 * (16-byte) offset units.
2765 */
2766 if (intel->gen < 6)
2767 message_header_scale *= 16;
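/* For example, a reg_offset of 3 with no reladdr becomes an immediate
 * offset of 6 on gen6+ (interleaved vec4 slots), or 96 on older
 * generations (byte units).
 */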
2768
2769 if (reladdr) {
2770 src_reg index = src_reg(this, glsl_type::int_type);
2771
2772 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2773 emit_before(inst, MUL(dst_reg(index),
2774 index, src_reg(message_header_scale)));
2775
2776 return index;
2777 } else {
2778 return src_reg(reg_offset * message_header_scale);
2779 }
2780 }
2781
2782 src_reg
2783 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2784 src_reg *reladdr, int reg_offset)
2785 {
2786 if (reladdr) {
2787 src_reg index = src_reg(this, glsl_type::int_type);
2788
2789 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2790
2791 /* Pre-gen6, the message header uses byte offsets instead of vec4
2792 * (16-byte) offset units.
2793 */
2794 if (intel->gen < 6) {
2795 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2796 }
2797
2798 return index;
2799 } else {
2800 int message_header_scale = intel->gen < 6 ? 16 : 1;
2801 return src_reg(reg_offset * message_header_scale);
2802 }
2803 }
2804
2805 /**
2806 * Emits an instruction before @inst to load the value named by @orig_src
2807 * from scratch space at @base_offset to @temp.
2808 *
2809 * @base_offset is measured in 32-byte units (the size of a register).
2810 */
2811 void
2812 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2813 dst_reg temp, src_reg orig_src,
2814 int base_offset)
2815 {
2816 int reg_offset = base_offset + orig_src.reg_offset;
2817 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2818
2819 emit_before(inst, SCRATCH_READ(temp, index));
2820 }
2821
2822 /**
2823 * Emits an instruction after @inst to store the value to be written
2824 * to @orig_dst to scratch space at @base_offset, from @temp.
2825 *
2826 * @base_offset is measured in 32-byte units (the size of a register).
2827 */
2828 void
2829 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2830 {
2831 int reg_offset = base_offset + inst->dst.reg_offset;
2832 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2833
2834 /* Create a temporary register to store *inst's result in.
2835 *
2836 * We have to be careful in MOVing from our temporary result register in
2837 * the scratch write. If we swizzle from channels of the temporary that
2838 * weren't initialized, it will confuse live interval analysis, which will
2839 * make spilling fail to make progress.
2840 */
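/* For example, assuming a destination writemask of YW, the swizzle built
 * below is (Y, Y, Y, W): the disabled X and Z channels just re-read the
 * first enabled channel instead of an uninitialized one.
 */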
2841 src_reg temp = src_reg(this, glsl_type::vec4_type);
2842 temp.type = inst->dst.type;
2843 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2844 int swizzles[4];
2845 for (int i = 0; i < 4; i++)
2846 if (inst->dst.writemask & (1 << i))
2847 swizzles[i] = i;
2848 else
2849 swizzles[i] = first_writemask_chan;
2850 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2851 swizzles[2], swizzles[3]);
2852
2853 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2854 inst->dst.writemask));
2855 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2856 write->predicate = inst->predicate;
2857 write->ir = inst->ir;
2858 write->annotation = inst->annotation;
2859 inst->insert_after(write);
2860
2861 inst->dst.file = temp.file;
2862 inst->dst.reg = temp.reg;
2863 inst->dst.reg_offset = temp.reg_offset;
2864 inst->dst.reladdr = NULL;
2865 }
2866
2867 /**
2868 * We can't generally support array access in GRF space, because a
2869 * single instruction's destination can only span 2 contiguous
2870 * registers. So, we send all GRF arrays that get variable index
2871 * access to scratch space.
2872 */
2873 void
2874 vec4_visitor::move_grf_array_access_to_scratch()
2875 {
2876 int scratch_loc[this->virtual_grf_count];
2877
2878 for (int i = 0; i < this->virtual_grf_count; i++) {
2879 scratch_loc[i] = -1;
2880 }
2881
2882 /* First, calculate the set of virtual GRFs that need to be punted
2883 * to scratch due to having any array access on them, and where in
2884 * scratch each one will live.
2885 */
2886 foreach_list(node, &this->instructions) {
2887 vec4_instruction *inst = (vec4_instruction *)node;
2888
2889 if (inst->dst.file == GRF && inst->dst.reladdr &&
2890 scratch_loc[inst->dst.reg] == -1) {
2891 scratch_loc[inst->dst.reg] = c->last_scratch;
2892 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2893 }
2894
2895 for (int i = 0 ; i < 3; i++) {
2896 src_reg *src = &inst->src[i];
2897
2898 if (src->file == GRF && src->reladdr &&
2899 scratch_loc[src->reg] == -1) {
2900 scratch_loc[src->reg] = c->last_scratch;
2901 c->last_scratch += this->virtual_grf_sizes[src->reg];
2902 }
2903 }
2904 }
2905
2906 /* Now, for anything that will be accessed through scratch, rewrite
2907 * it to load/store. Note that this is a _safe list walk, because
2908 * we may generate a new scratch_write instruction after the one
2909 * we're processing.
2910 */
2911 foreach_list_safe(node, &this->instructions) {
2912 vec4_instruction *inst = (vec4_instruction *)node;
2913
2914 /* Set up the annotation tracking for newly generated instructions. */
2915 base_ir = inst->ir;
2916 current_annotation = inst->annotation;
2917
2918 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2919 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2920 }
2921
2922 for (int i = 0 ; i < 3; i++) {
2923 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2924 continue;
2925
2926 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2927
2928 emit_scratch_read(inst, temp, inst->src[i],
2929 scratch_loc[inst->src[i].reg]);
2930
2931 inst->src[i].file = temp.file;
2932 inst->src[i].reg = temp.reg;
2933 inst->src[i].reg_offset = temp.reg_offset;
2934 inst->src[i].reladdr = NULL;
2935 }
2936 }
2937 }
2938
2939 /**
2940 * Emits an instruction before @inst to load the value named by @orig_src
2941 * from the pull constant buffer (surface) at @base_offset to @temp.
2942 */
2943 void
2944 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2945 dst_reg temp, src_reg orig_src,
2946 int base_offset)
2947 {
2948 int reg_offset = base_offset + orig_src.reg_offset;
2949 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2950 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2951 vec4_instruction *load;
2952
2953 if (intel->gen >= 7) {
2954 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2955 grf_offset.type = offset.type;
2956 emit_before(inst, MOV(grf_offset, offset));
2957
2958 load = new(mem_ctx) vec4_instruction(this,
2959 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2960 temp, index, src_reg(grf_offset));
2961 } else {
2962 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2963 temp, index, offset);
2964 load->base_mrf = 14;
2965 load->mlen = 1;
2966 }
2967 emit_before(inst, load);
2968 }
2969
2970 /**
2971 * Implements array access of uniforms by inserting a
2972 * PULL_CONSTANT_LOAD instruction.
2973 *
2974 * Unlike temporary GRF array access (where we don't support it due to
2975 * the difficulty of doing relative addressing on instruction
2976 * destinations), we could potentially do array access of uniforms
2977 * that were loaded in GRF space as push constants. In real-world
2978 * usage we've seen, though, the arrays being used are always larger
2979 * than we could load as push constants, so just always move all
2980 * uniform array access out to a pull constant buffer.
2981 */
2982 void
2983 vec4_visitor::move_uniform_array_access_to_pull_constants()
2984 {
2985 int pull_constant_loc[this->uniforms];
2986
2987 for (int i = 0; i < this->uniforms; i++) {
2988 pull_constant_loc[i] = -1;
2989 }
2990
2991 /* Walk through and find array access of uniforms. Put a copy of that
2992 * uniform in the pull constant buffer.
2993 *
2994 * Note that we don't move constant-indexed accesses to arrays. No
2995 * testing has been done of the performance impact of this choice.
2996 */
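/* So for a GLSL access like "u[i]" with a dynamically computed index, the
 * whole array u is copied into pull_param and the operand is rewritten to
 * read a temporary filled by a pull constant load, while a constant-indexed
 * access like "u[2]" stays in push constant space.
 */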
2997 foreach_list_safe(node, &this->instructions) {
2998 vec4_instruction *inst = (vec4_instruction *)node;
2999
3000 for (int i = 0 ; i < 3; i++) {
3001 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3002 continue;
3003
3004 int uniform = inst->src[i].reg;
3005
3006 /* If this array isn't already present in the pull constant buffer,
3007 * add it.
3008 */
3009 if (pull_constant_loc[uniform] == -1) {
3010 const float **values = &prog_data->param[uniform * 4];
3011
3012 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3013
3014 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3015 prog_data->pull_param[prog_data->nr_pull_params++]
3016 = values[j];
3017 }
3018 }
3019
3020 /* Set up the annotation tracking for newly generated instructions. */
3021 base_ir = inst->ir;
3022 current_annotation = inst->annotation;
3023
3024 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3025
3026 emit_pull_constant_load(inst, temp, inst->src[i],
3027 pull_constant_loc[uniform]);
3028
3029 inst->src[i].file = temp.file;
3030 inst->src[i].reg = temp.reg;
3031 inst->src[i].reg_offset = temp.reg_offset;
3032 inst->src[i].reladdr = NULL;
3033 }
3034 }
3035
3036 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3037 * no need to track them as larger-than-vec4 objects. This will be
3038 * relied on in cutting out unused uniform vectors from push
3039 * constants.
3040 */
3041 split_uniform_registers();
3042 }
3043
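/* Presumably the negate source modifier cannot be relied on for unsigned
 * (UD) operands in every context, so the negation is resolved here: the MOV
 * below applies the modifier into a temporary, and the caller then uses
 * that temporary without any modifier.
 */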
3044 void
3045 vec4_visitor::resolve_ud_negate(src_reg *reg)
3046 {
3047 if (reg->type != BRW_REGISTER_TYPE_UD ||
3048 !reg->negate)
3049 return;
3050
3051 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3052 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3053 *reg = temp;
3054 }
3055
3056 vec4_visitor::vec4_visitor(struct brw_context *brw,
3057 struct brw_vec4_compile *c,
3058 struct gl_program *prog,
3059 const struct brw_vec4_prog_key *key,
3060 struct brw_vec4_prog_data *prog_data,
3061 struct gl_shader_program *shader_prog,
3062 struct brw_shader *shader,
3063 void *mem_ctx,
3064 bool debug_flag)
3065 : debug_flag(debug_flag)
3066 {
3067 this->brw = brw;
3068 this->intel = &brw->intel;
3069 this->ctx = &intel->ctx;
3070 this->shader_prog = shader_prog;
3071 this->shader = shader;
3072
3073 this->mem_ctx = mem_ctx;
3074 this->failed = false;
3075
3076 this->base_ir = NULL;
3077 this->current_annotation = NULL;
3078 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3079
3080 this->c = c;
3081 this->prog = prog;
3082 this->key = key;
3083 this->prog_data = prog_data;
3084
3085 this->variable_ht = hash_table_ctor(0,
3086 hash_table_pointer_hash,
3087 hash_table_pointer_compare);
3088
3089 this->virtual_grf_def = NULL;
3090 this->virtual_grf_use = NULL;
3091 this->virtual_grf_sizes = NULL;
3092 this->virtual_grf_count = 0;
3093 this->virtual_grf_reg_map = NULL;
3094 this->virtual_grf_reg_count = 0;
3095 this->virtual_grf_array_size = 0;
3096 this->live_intervals_valid = false;
3097
3098 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3099
3100 this->uniforms = 0;
3101 }
3102
3103 vec4_visitor::~vec4_visitor()
3104 {
3105 hash_table_dtor(this->variable_ht);
3106 }
3107
3108
3109 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3110 struct brw_vs_compile *vs_compile,
3111 struct brw_vs_prog_data *vs_prog_data,
3112 struct gl_shader_program *prog,
3113 struct brw_shader *shader,
3114 void *mem_ctx)
3115 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3116 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3117 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3118 vs_compile(vs_compile),
3119 vs_prog_data(vs_prog_data)
3120 {
3121 }
3122
3123
3124 void
3125 vec4_visitor::fail(const char *format, ...)
3126 {
3127 va_list va;
3128 char *msg;
3129
3130 if (failed)
3131 return;
3132
3133 failed = true;
3134
3135 va_start(va, format);
3136 msg = ralloc_vasprintf(mem_ctx, format, va);
3137 va_end(va);
3138 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3139
3140 this->fail_msg = msg;
3141
3142 if (debug_flag) {
3143 fprintf(stderr, "%s", msg);
3144 }
3145 }
3146
3147 } /* namespace brw */