i965/vs: Add virtual function make_reg_for_system_value().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
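/* The ALU1/ALU2 macros below only construct a vec4_instruction; unlike the
 * emit() overloads above they do not append it to the instruction list, so
 * callers are expected to wrap them, e.g. emit(MOV(dst, src)) or
 * emit(ADD(dst, src0, src1)).
 */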
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* The original gen4 hardware does type conversion to the destination
172 * type before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
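/* Emit a dot product of the first "elements" components: elements == 2, 3
 * and 4 map to DP2, DP3 and DP4 respectively, so e.g. emit_dp(dst, a, b, 3)
 * produces a BRW_OPCODE_DP3.
 */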
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
387
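/* For illustration: packing vec2(1.0, -2.0), whose half-float bit patterns
 * are 0x3C00 and 0xC000, the F32TO16 below leaves tmp = {0x00003C00,
 * 0x0000C000, ...}; the SHL then puts 0xC0000000 in dst and the final OR
 * produces 0xC0003C00, i.e. y in the high word and x in the low word, as
 * packHalf2x16() requires.
 */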
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
451
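/* For illustration: unpacking 0xC0003C00, the AND below leaves tmp.x =
 * 0x3C00 and the SHR leaves tmp.y = 0xC000; F16TO32 then converts those
 * half-float bit patterns into (1.0, -2.0), mirroring the pack example
 * above.
 */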
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
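/* Returns how many vec4 registers a GLSL type occupies in this backend:
 * scalars and vectors round up to one slot, matrices take one slot per
 * column, and arrays/structs are the sum of their members, so e.g. a mat3
 * is 3 and float[10] is 10.
 */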
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of size of vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 case GLSL_TYPE_INTERFACE:
515 assert(0);
516 break;
517 }
518
519 return 0;
520 }
521
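/* Allocate a new virtual GRF of "size" vec4s, growing the bookkeeping
 * arrays as needed, and return its index. For example, a variable of type
 * mat4 (type_size() == 4) reserves four consecutive vec4 slots behind one
 * virtual register number.
 */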
522 int
523 vec4_visitor::virtual_grf_alloc(int size)
524 {
525 if (virtual_grf_array_size <= virtual_grf_count) {
526 if (virtual_grf_array_size == 0)
527 virtual_grf_array_size = 16;
528 else
529 virtual_grf_array_size *= 2;
530 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
531 virtual_grf_array_size);
532 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
533 virtual_grf_array_size);
534 }
535 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
536 virtual_grf_reg_count += size;
537 virtual_grf_sizes[virtual_grf_count] = size;
538 return virtual_grf_count++;
539 }
540
541 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
542 {
543 init();
544
545 this->file = GRF;
546 this->reg = v->virtual_grf_alloc(type_size(type));
547
548 if (type->is_array() || type->is_record()) {
549 this->swizzle = BRW_SWIZZLE_NOOP;
550 } else {
551 this->swizzle = swizzle_for_size(type->vector_elements);
552 }
553
554 this->type = brw_type_for_base_type(type);
555 }
556
557 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
558 {
559 init();
560
561 this->file = GRF;
562 this->reg = v->virtual_grf_alloc(type_size(type));
563
564 if (type->is_array() || type->is_record()) {
565 this->writemask = WRITEMASK_XYZW;
566 } else {
567 this->writemask = (1 << type->vector_elements) - 1;
568 }
569
570 this->type = brw_type_for_base_type(type);
571 }
572
573 /* Our support for uniforms is piggy-backed on the struct
574 * gl_fragment_program, because that's where the values actually
575 * get stored, rather than in some global gl_shader_program uniform
576 * store.
577 */
578 void
579 vec4_visitor::setup_uniform_values(ir_variable *ir)
580 {
581 int namelen = strlen(ir->name);
582
583 /* The data for our (non-builtin) uniforms is stored in a series of
584 * gl_uniform_driver_storage structs for each subcomponent that
585 * glGetUniformLocation() could name. We know it's been set up in the same
586 * order we'd walk the type, so walk the list of storage and find anything
587 * with our name, or the prefix of a component that starts with our name.
588 */
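/* For example, a "uniform mat2 m[2]" shows up as one storage entry with
 * array_elements == 2 and matrix_columns == 2, so vector_count below is 4
 * and four vec4 uniform slots get filled, each using two components and
 * padding the remaining two with zero.
 */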
589 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
590 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
591
592 if (strncmp(ir->name, storage->name, namelen) != 0 ||
593 (storage->name[namelen] != 0 &&
594 storage->name[namelen] != '.' &&
595 storage->name[namelen] != '[')) {
596 continue;
597 }
598
599 gl_constant_value *components = storage->storage;
600 unsigned vector_count = (MAX2(storage->array_elements, 1) *
601 storage->type->matrix_columns);
602
603 for (unsigned s = 0; s < vector_count; s++) {
604 uniform_vector_size[uniforms] = storage->type->vector_elements;
605
606 int i;
607 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
608 prog_data->base.param[uniforms * 4 + i] = &components->f;
609 components++;
610 }
611 for (; i < 4; i++) {
612 static float zero = 0;
613 prog_data->base.param[uniforms * 4 + i] = &zero;
614 }
615
616 uniforms++;
617 }
618 }
619 }
620
621 void
622 vec4_visitor::setup_uniform_clipplane_values()
623 {
624 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
625
626 if (intel->gen < 6) {
627 /* Pre-Gen6, we compact clip planes. For example, if the user
628 * enables just clip planes 0, 1, and 3, we will enable clip planes
629 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
630 * plane 2. This simplifies the implementation of the Gen6 clip
631 * thread.
632 */
633 int compacted_clipplane_index = 0;
634 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
635 if (!(c->key.base.userclip_planes_enabled_gen_4_5 & (1 << i)))
636 continue;
637
638 this->uniform_vector_size[this->uniforms] = 4;
639 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
640 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
641 for (int j = 0; j < 4; ++j) {
642 prog_data->base.param[this->uniforms * 4 + j] = &clip_planes[i][j];
643 }
644 ++compacted_clipplane_index;
645 ++this->uniforms;
646 }
647 } else {
648 /* In Gen6 and later, we don't compact clip planes, because this
649 * simplifies the implementation of gl_ClipDistance.
650 */
651 for (int i = 0; i < c->key.base.nr_userclip_plane_consts; ++i) {
652 this->uniform_vector_size[this->uniforms] = 4;
653 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
654 this->userplane[i].type = BRW_REGISTER_TYPE_F;
655 for (int j = 0; j < 4; ++j) {
656 prog_data->base.param[this->uniforms * 4 + j] = &clip_planes[i][j];
657 }
658 ++this->uniforms;
659 }
660 }
661 }
662
663 /* Our support for builtin uniforms is even scarier than non-builtin.
664 * It sits on top of the PROG_STATE_VAR parameters that are
665 * automatically updated from GL context state.
666 */
667 void
668 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
669 {
670 const ir_state_slot *const slots = ir->state_slots;
671 assert(ir->state_slots != NULL);
672
673 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
674 /* This state reference has already been set up by ir_to_mesa,
675 * but we'll get the same index back here. We can reference
676 * ParameterValues directly, since unlike brw_fs.cpp, we never
677 * add new state references during compile.
678 */
679 int index = _mesa_add_state_reference(this->prog->Parameters,
680 (gl_state_index *)slots[i].tokens);
681 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
682
683 this->uniform_vector_size[this->uniforms] = 0;
684 /* Add each of the unique swizzled channels of the element.
685 * This will end up matching the size of the glsl_type of this field.
686 */
687 int last_swiz = -1;
688 for (unsigned int j = 0; j < 4; j++) {
689 int swiz = GET_SWZ(slots[i].swizzle, j);
690 last_swiz = swiz;
691
692 prog_data->base.param[this->uniforms * 4 + j] = &values[swiz];
693 if (swiz <= last_swiz)
694 this->uniform_vector_size[this->uniforms]++;
695 }
696 this->uniforms++;
697 }
698 }
699
700 dst_reg *
701 vec4_visitor::variable_storage(ir_variable *var)
702 {
703 return (dst_reg *)hash_table_find(this->variable_ht, var);
704 }
705
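/* Evaluate a boolean rvalue and emit whatever sets the flag register so
 * that a following instruction can be predicated on it; *predicate is set
 * to the predicate mode the caller should use (e.g. ALIGN16_ALL4H for
 * ir_binop_all_equal, plain BRW_PREDICATE_NORMAL for ordinary comparisons).
 */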
706 void
707 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr) {
714 src_reg op[2];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 2);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (intel->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (intel->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 default:
790 assert(!"not reached");
791 break;
792 }
793 return;
794 }
795
796 ir->accept(this);
797
798 resolve_ud_negate(&this->result);
799
800 if (intel->gen >= 6) {
801 vec4_instruction *inst = emit(AND(dst_null_d(),
802 this->result, src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 } else {
805 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 }
808 }
809
810 /**
811 * Emit a gen6 IF statement with the comparison folded into the IF
812 * instruction.
813 */
814 void
815 vec4_visitor::emit_if_gen6(ir_if *ir)
816 {
817 ir_expression *expr = ir->condition->as_expression();
818
819 if (expr) {
820 src_reg op[2];
821 dst_reg temp;
822
823 assert(expr->get_num_operands() <= 2);
824 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
825 expr->operands[i]->accept(this);
826 op[i] = this->result;
827 }
828
829 switch (expr->operation) {
830 case ir_unop_logic_not:
831 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
832 return;
833
834 case ir_binop_logic_xor:
835 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
836 return;
837
838 case ir_binop_logic_or:
839 temp = dst_reg(this, glsl_type::bool_type);
840 emit(OR(temp, op[0], op[1]));
841 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
842 return;
843
844 case ir_binop_logic_and:
845 temp = dst_reg(this, glsl_type::bool_type);
846 emit(AND(temp, op[0], op[1]));
847 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_f2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_unop_i2b:
855 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 return;
857
858 case ir_binop_greater:
859 case ir_binop_gequal:
860 case ir_binop_less:
861 case ir_binop_lequal:
862 case ir_binop_equal:
863 case ir_binop_nequal:
864 emit(IF(op[0], op[1],
865 brw_conditional_for_comparison(expr->operation)));
866 return;
867
868 case ir_binop_all_equal:
869 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
870 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
871 return;
872
873 case ir_binop_any_nequal:
874 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
876 return;
877
878 case ir_unop_any:
879 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
881 return;
882
883 default:
884 assert(!"not reached");
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887 }
888 return;
889 }
890
891 ir->condition->accept(this);
892
893 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
894 }
895
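/* Convenience helper: returns a copy of "r" with its writemask replaced,
 * e.g. with_writemask(reg, WRITEMASK_XYZ) writes only the xyz channels
 * while leaving the original dst_reg untouched.
 */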
896 static dst_reg
897 with_writemask(dst_reg const & r, int mask)
898 {
899 dst_reg result = r;
900 result.writemask = mask;
901 return result;
902 }
903
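/* Emit fix-up code at the top of the VS for vertex attribute formats the
 * fixed-function vertex fetcher can't produce directly: GL_FIXED rescaling,
 * sign recovery for 2_10_10_10 formats, BGRA swizzling, and (un)signed
 * normalization or plain integer-to-float conversion, all driven by the
 * per-attribute wa_flags in the key.
 */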
904 void
905 vec4_vs_visitor::emit_prolog()
906 {
907 dst_reg sign_recovery_shift;
908 dst_reg normalize_factor;
909 dst_reg es3_normalize_factor;
910
911 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
912 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
913 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
914 dst_reg reg(ATTR, i);
915 dst_reg reg_d = reg;
916 reg_d.type = BRW_REGISTER_TYPE_D;
917 dst_reg reg_ud = reg;
918 reg_ud.type = BRW_REGISTER_TYPE_UD;
919
920 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
921 * come in as floating point conversions of the integer values.
922 */
923 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
924 dst_reg dst = reg;
925 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
926 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
927 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
928 }
929
930 /* Do sign recovery for 2101010 formats if required. */
931 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
932 if (sign_recovery_shift.file == BAD_FILE) {
933 /* shift constant: <22,22,22,30> */
934 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
936 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
937 }
938
939 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
940 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
941 }
942
943 /* Apply BGRA swizzle if required. */
944 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
945 src_reg temp = src_reg(reg);
946 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
947 emit(MOV(reg, temp));
948 }
949
950 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
951 /* ES 3.0 has different rules for converting signed normalized
952 * fixed-point numbers than desktop GL.
953 */
954 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
955 /* According to equation 2.2 of the ES 3.0 specification,
956 * signed normalization conversion is done by:
957 *
958 * f = c / (2^(b-1)-1)
959 */
960 if (es3_normalize_factor.file == BAD_FILE) {
961 /* mul constant: 1 / (2^(b-1) - 1) */
962 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
963 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
964 src_reg(1.0f / ((1<<9) - 1))));
965 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
966 src_reg(1.0f / ((1<<1) - 1))));
967 }
968
969 dst_reg dst = reg;
970 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
971 emit(MOV(dst, src_reg(reg_d)));
972 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
973 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
974 } else {
975 /* The following equations are from the OpenGL 3.2 specification:
976 *
977 * 2.1 unsigned normalization
978 * f = c/(2^n-1)
979 *
980 * 2.2 signed normalization
981 * f = (2c+1)/(2^n-1)
982 *
983 * Both of these share a common divisor, which is represented by
984 * "normalize_factor" in the code below.
985 */
986 if (normalize_factor.file == BAD_FILE) {
987 /* 1 / (2^b - 1) for b=<10,10,10,2> */
988 normalize_factor = dst_reg(this, glsl_type::vec4_type);
989 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
990 src_reg(1.0f / ((1<<10) - 1))));
991 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
992 src_reg(1.0f / ((1<<2) - 1))));
993 }
994
995 dst_reg dst = reg;
996 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
997 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
998
999 /* For signed normalization, we want the numerator to be 2c+1. */
1000 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1001 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1002 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1003 }
1004
1005 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1006 }
1007 }
1008
1009 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1010 dst_reg dst = reg;
1011 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1012 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1013 }
1014 }
1015 }
1016 }
1017
1018
1019 dst_reg *
1020 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1021 {
1022 /* VertexID is stored by the VF as the last vertex element, but
1023 * we don't represent it with a flag in inputs_read, so we call
1024 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1025 */
1026 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1027 prog_data->uses_vertexid = true;
1028
1029 switch (ir->location) {
1030 case SYSTEM_VALUE_VERTEX_ID:
1031 reg->writemask = WRITEMASK_X;
1032 break;
1033 case SYSTEM_VALUE_INSTANCE_ID:
1034 reg->writemask = WRITEMASK_Y;
1035 break;
1036 default:
1037 assert(!"not reached");
1038 break;
1039 }
1040
1041 return reg;
1042 }
1043
1044
1045 void
1046 vec4_visitor::visit(ir_variable *ir)
1047 {
1048 dst_reg *reg = NULL;
1049
1050 if (variable_storage(ir))
1051 return;
1052
1053 switch (ir->mode) {
1054 case ir_var_shader_in:
1055 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1056 break;
1057
1058 case ir_var_shader_out:
1059 reg = new(mem_ctx) dst_reg(this, ir->type);
1060
1061 for (int i = 0; i < type_size(ir->type); i++) {
1062 output_reg[ir->location + i] = *reg;
1063 output_reg[ir->location + i].reg_offset = i;
1064 output_reg[ir->location + i].type =
1065 brw_type_for_base_type(ir->type->get_scalar_type());
1066 output_reg_annotation[ir->location + i] = ir->name;
1067 }
1068 break;
1069
1070 case ir_var_auto:
1071 case ir_var_temporary:
1072 reg = new(mem_ctx) dst_reg(this, ir->type);
1073 break;
1074
1075 case ir_var_uniform:
1076 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1077
1078 /* Thanks to the lower_ubo_reference pass, we will see only
1079 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1080 * variables, so no need for them to be in variable_ht.
1081 */
1082 if (ir->is_in_uniform_block())
1083 return;
1084
1085 /* Track how big the whole uniform variable is, in case we need to put a
1086 * copy of its data into pull constants for array access.
1087 */
1088 this->uniform_size[this->uniforms] = type_size(ir->type);
1089
1090 if (!strncmp(ir->name, "gl_", 3)) {
1091 setup_builtin_uniform_values(ir);
1092 } else {
1093 setup_uniform_values(ir);
1094 }
1095 break;
1096
1097 case ir_var_system_value:
1098 reg = make_reg_for_system_value(ir);
1099 break;
1100
1101 default:
1102 assert(!"not reached");
1103 }
1104
1105 reg->type = brw_type_for_base_type(ir->type);
1106 hash_table_insert(this->variable_ht, reg, ir);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop *ir)
1111 {
1112 dst_reg counter;
1113
1114 /* We don't want debugging output to print the whole body of the
1115 * loop as the annotation.
1116 */
1117 this->base_ir = NULL;
1118
1119 if (ir->counter != NULL) {
1120 this->base_ir = ir->counter;
1121 ir->counter->accept(this);
1122 counter = *(variable_storage(ir->counter));
1123
1124 if (ir->from != NULL) {
1125 this->base_ir = ir->from;
1126 ir->from->accept(this);
1127
1128 emit(MOV(counter, this->result));
1129 }
1130 }
1131
1132 emit(BRW_OPCODE_DO);
1133
1134 if (ir->to) {
1135 this->base_ir = ir->to;
1136 ir->to->accept(this);
1137
1138 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1139 brw_conditional_for_comparison(ir->cmp)));
1140
1141 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1142 inst->predicate = BRW_PREDICATE_NORMAL;
1143 }
1144
1145 visit_instructions(&ir->body_instructions);
1146
1147
1148 if (ir->increment) {
1149 this->base_ir = ir->increment;
1150 ir->increment->accept(this);
1151 emit(ADD(counter, src_reg(counter), this->result));
1152 }
1153
1154 emit(BRW_OPCODE_WHILE);
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_loop_jump *ir)
1159 {
1160 switch (ir->mode) {
1161 case ir_loop_jump::jump_break:
1162 emit(BRW_OPCODE_BREAK);
1163 break;
1164 case ir_loop_jump::jump_continue:
1165 emit(BRW_OPCODE_CONTINUE);
1166 break;
1167 }
1168 }
1169
1170
1171 void
1172 vec4_visitor::visit(ir_function_signature *ir)
1173 {
1174 assert(0);
1175 (void)ir;
1176 }
1177
1178 void
1179 vec4_visitor::visit(ir_function *ir)
1180 {
1181 /* Ignore function bodies other than main() -- we shouldn't see calls to
1182 * them since they should all be inlined.
1183 */
1184 if (strcmp(ir->name, "main") == 0) {
1185 const ir_function_signature *sig;
1186 exec_list empty;
1187
1188 sig = ir->matching_signature(&empty);
1189
1190 assert(sig);
1191
1192 visit_instructions(&sig->body);
1193 }
1194 }
1195
1196 bool
1197 vec4_visitor::try_emit_sat(ir_expression *ir)
1198 {
1199 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1200 if (!sat_src)
1201 return false;
1202
1203 sat_src->accept(this);
1204 src_reg src = this->result;
1205
1206 this->result = src_reg(this, ir->type);
1207 vec4_instruction *inst;
1208 inst = emit(MOV(dst_reg(this->result), src));
1209 inst->saturate = true;
1210
1211 return true;
1212 }
1213
1214 void
1215 vec4_visitor::emit_bool_comparison(unsigned int op,
1216 dst_reg dst, src_reg src0, src_reg src1)
1217 {
1218 /* original gen4 does destination conversion before comparison. */
1219 if (intel->gen < 5)
1220 dst.type = src0.type;
1221
1222 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1223
1224 dst.type = BRW_REGISTER_TYPE_D;
1225 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1226 }
1227
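/* Emit a min/max: BRW_CONDITIONAL_L selects the smaller operand and
 * BRW_CONDITIONAL_G the larger. On gen6+ this is a single SEL with a
 * conditional mod; older hardware needs an explicit CMP followed by a
 * predicated SEL.
 */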
1228 void
1229 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1230 src_reg src0, src_reg src1)
1231 {
1232 vec4_instruction *inst;
1233
1234 if (intel->gen >= 6) {
1235 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1236 inst->conditional_mod = conditionalmod;
1237 } else {
1238 emit(CMP(dst, src0, src1, conditionalmod));
1239
1240 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1241 inst->predicate = BRW_PREDICATE_NORMAL;
1242 }
1243 }
1244
1245 void
1246 vec4_visitor::visit(ir_expression *ir)
1247 {
1248 unsigned int operand;
1249 src_reg op[Elements(ir->operands)];
1250 src_reg result_src;
1251 dst_reg result_dst;
1252 vec4_instruction *inst;
1253
1254 if (try_emit_sat(ir))
1255 return;
1256
1257 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1258 this->result.file = BAD_FILE;
1259 ir->operands[operand]->accept(this);
1260 if (this->result.file == BAD_FILE) {
1261 printf("Failed to get tree for expression operand:\n");
1262 ir->operands[operand]->print();
1263 exit(1);
1264 }
1265 op[operand] = this->result;
1266
1267 /* Matrix expression operands should have been broken down to vector
1268 * operations already.
1269 */
1270 assert(!ir->operands[operand]->type->is_matrix());
1271 }
1272
1273 int vector_elements = ir->operands[0]->type->vector_elements;
1274 if (ir->operands[1]) {
1275 vector_elements = MAX2(vector_elements,
1276 ir->operands[1]->type->vector_elements);
1277 }
1278
1279 this->result.file = BAD_FILE;
1280
1281 /* Storage for our result. Ideally for an assignment we'd be using
1282 * the actual storage for the result here, instead.
1283 */
1284 result_src = src_reg(this, ir->type);
1285 /* convenience for the emit functions below. */
1286 result_dst = dst_reg(result_src);
1287 /* If nothing special happens, this is the result. */
1288 this->result = result_src;
1289 /* Limit writes to the channels that will be used by result_src later.
1290 * This does limit this temp's use as a temporary for multi-instruction
1291 * sequences.
1292 */
1293 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1294
1295 switch (ir->operation) {
1296 case ir_unop_logic_not:
1297 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1298 * the one's complement of the whole register, not just bit 0.
1299 */
1300 emit(XOR(result_dst, op[0], src_reg(1)));
1301 break;
1302 case ir_unop_neg:
1303 op[0].negate = !op[0].negate;
1304 this->result = op[0];
1305 break;
1306 case ir_unop_abs:
1307 op[0].abs = true;
1308 op[0].negate = false;
1309 this->result = op[0];
1310 break;
1311
1312 case ir_unop_sign:
1313 emit(MOV(result_dst, src_reg(0.0f)));
1314
1315 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1316 inst = emit(MOV(result_dst, src_reg(1.0f)));
1317 inst->predicate = BRW_PREDICATE_NORMAL;
1318
1319 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1320 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1321 inst->predicate = BRW_PREDICATE_NORMAL;
1322
1323 break;
1324
1325 case ir_unop_rcp:
1326 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1327 break;
1328
1329 case ir_unop_exp2:
1330 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1331 break;
1332 case ir_unop_log2:
1333 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1334 break;
1335 case ir_unop_exp:
1336 case ir_unop_log:
1337 assert(!"not reached: should be handled by ir_explog_to_explog2");
1338 break;
1339 case ir_unop_sin:
1340 case ir_unop_sin_reduced:
1341 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1342 break;
1343 case ir_unop_cos:
1344 case ir_unop_cos_reduced:
1345 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1346 break;
1347
1348 case ir_unop_dFdx:
1349 case ir_unop_dFdy:
1350 assert(!"derivatives not valid in vertex shader");
1351 break;
1352
1353 case ir_unop_noise:
1354 assert(!"not reached: should be handled by lower_noise");
1355 break;
1356
1357 case ir_binop_add:
1358 emit(ADD(result_dst, op[0], op[1]));
1359 break;
1360 case ir_binop_sub:
1361 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1362 break;
1363
1364 case ir_binop_mul:
1365 if (ir->type->is_integer()) {
1366 /* For integer multiplication, the MUL uses the low 16 bits
1367 * of one of the operands (src0 on gen6, src1 on gen7). The
1368 * MACH accumulates in the contribution of the upper 16 bits
1369 * of that operand.
1370 *
1371 * FINISHME: Emit just the MUL if we know an operand is small
1372 * enough.
1373 */
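/* Roughly: MUL into the accumulator forms the partial product from the
 * low 16 bits, MACH folds in the high-16 contribution (its own result is
 * discarded via the null destination), and the final MOV copies the
 * completed low 32 bits of the product out of the accumulator.
 */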
1374 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1375
1376 emit(MUL(acc, op[0], op[1]));
1377 emit(MACH(dst_null_d(), op[0], op[1]));
1378 emit(MOV(result_dst, src_reg(acc)));
1379 } else {
1380 emit(MUL(result_dst, op[0], op[1]));
1381 }
1382 break;
1383 case ir_binop_div:
1384 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1385 assert(ir->type->is_integer());
1386 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1387 break;
1388 case ir_binop_mod:
1389 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1390 assert(ir->type->is_integer());
1391 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1392 break;
1393
1394 case ir_binop_less:
1395 case ir_binop_greater:
1396 case ir_binop_lequal:
1397 case ir_binop_gequal:
1398 case ir_binop_equal:
1399 case ir_binop_nequal: {
1400 emit(CMP(result_dst, op[0], op[1],
1401 brw_conditional_for_comparison(ir->operation)));
1402 emit(AND(result_dst, result_src, src_reg(0x1)));
1403 break;
1404 }
1405
1406 case ir_binop_all_equal:
1407 /* "==" operator producing a scalar boolean. */
1408 if (ir->operands[0]->type->is_vector() ||
1409 ir->operands[1]->type->is_vector()) {
1410 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1411 emit(MOV(result_dst, src_reg(0)));
1412 inst = emit(MOV(result_dst, src_reg(1)));
1413 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1414 } else {
1415 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1416 emit(AND(result_dst, result_src, src_reg(0x1)));
1417 }
1418 break;
1419 case ir_binop_any_nequal:
1420 /* "!=" operator producing a scalar boolean. */
1421 if (ir->operands[0]->type->is_vector() ||
1422 ir->operands[1]->type->is_vector()) {
1423 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1424
1425 emit(MOV(result_dst, src_reg(0)));
1426 inst = emit(MOV(result_dst, src_reg(1)));
1427 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1428 } else {
1429 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1430 emit(AND(result_dst, result_src, src_reg(0x1)));
1431 }
1432 break;
1433
1434 case ir_unop_any:
1435 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1436 emit(MOV(result_dst, src_reg(0)));
1437
1438 inst = emit(MOV(result_dst, src_reg(1)));
1439 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1440 break;
1441
1442 case ir_binop_logic_xor:
1443 emit(XOR(result_dst, op[0], op[1]));
1444 break;
1445
1446 case ir_binop_logic_or:
1447 emit(OR(result_dst, op[0], op[1]));
1448 break;
1449
1450 case ir_binop_logic_and:
1451 emit(AND(result_dst, op[0], op[1]));
1452 break;
1453
1454 case ir_binop_dot:
1455 assert(ir->operands[0]->type->is_vector());
1456 assert(ir->operands[0]->type == ir->operands[1]->type);
1457 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1458 break;
1459
1460 case ir_unop_sqrt:
1461 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1462 break;
1463 case ir_unop_rsq:
1464 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1465 break;
1466
1467 case ir_unop_bitcast_i2f:
1468 case ir_unop_bitcast_u2f:
1469 this->result = op[0];
1470 this->result.type = BRW_REGISTER_TYPE_F;
1471 break;
1472
1473 case ir_unop_bitcast_f2i:
1474 this->result = op[0];
1475 this->result.type = BRW_REGISTER_TYPE_D;
1476 break;
1477
1478 case ir_unop_bitcast_f2u:
1479 this->result = op[0];
1480 this->result.type = BRW_REGISTER_TYPE_UD;
1481 break;
1482
1483 case ir_unop_i2f:
1484 case ir_unop_i2u:
1485 case ir_unop_u2i:
1486 case ir_unop_u2f:
1487 case ir_unop_b2f:
1488 case ir_unop_b2i:
1489 case ir_unop_f2i:
1490 case ir_unop_f2u:
1491 emit(MOV(result_dst, op[0]));
1492 break;
1493 case ir_unop_f2b:
1494 case ir_unop_i2b: {
1495 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1496 emit(AND(result_dst, result_src, src_reg(1)));
1497 break;
1498 }
1499
1500 case ir_unop_trunc:
1501 emit(RNDZ(result_dst, op[0]));
1502 break;
1503 case ir_unop_ceil:
1504 op[0].negate = !op[0].negate;
1505 inst = emit(RNDD(result_dst, op[0]));
1506 this->result.negate = true;
1507 break;
1508 case ir_unop_floor:
1509 inst = emit(RNDD(result_dst, op[0]));
1510 break;
1511 case ir_unop_fract:
1512 inst = emit(FRC(result_dst, op[0]));
1513 break;
1514 case ir_unop_round_even:
1515 emit(RNDE(result_dst, op[0]));
1516 break;
1517
1518 case ir_binop_min:
1519 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1520 break;
1521 case ir_binop_max:
1522 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1523 break;
1524
1525 case ir_binop_pow:
1526 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1527 break;
1528
1529 case ir_unop_bit_not:
1530 inst = emit(NOT(result_dst, op[0]));
1531 break;
1532 case ir_binop_bit_and:
1533 inst = emit(AND(result_dst, op[0], op[1]));
1534 break;
1535 case ir_binop_bit_xor:
1536 inst = emit(XOR(result_dst, op[0], op[1]));
1537 break;
1538 case ir_binop_bit_or:
1539 inst = emit(OR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_lshift:
1543 inst = emit(SHL(result_dst, op[0], op[1]));
1544 break;
1545
1546 case ir_binop_rshift:
1547 if (ir->type->base_type == GLSL_TYPE_INT)
1548 inst = emit(ASR(result_dst, op[0], op[1]));
1549 else
1550 inst = emit(SHR(result_dst, op[0], op[1]));
1551 break;
1552
1553 case ir_binop_ubo_load: {
1554 ir_constant *uniform_block = ir->operands[0]->as_constant();
1555 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1556 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1557 src_reg offset = op[1];
1558
1559 /* Now, load the vector from that offset. */
1560 assert(ir->type->is_vector() || ir->type->is_scalar());
1561
1562 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1563 packed_consts.type = result.type;
1564 src_reg surf_index =
1565 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1566 if (const_offset_ir) {
1567 offset = src_reg(const_offset / 16);
1568 } else {
1569 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1570 }
1571
1572 vec4_instruction *pull =
1573 emit(new(mem_ctx) vec4_instruction(this,
1574 VS_OPCODE_PULL_CONSTANT_LOAD,
1575 dst_reg(packed_consts),
1576 surf_index,
1577 offset));
1578 pull->base_mrf = 14;
1579 pull->mlen = 1;
1580
1581 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1582 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1583 const_offset % 16 / 4,
1584 const_offset % 16 / 4,
1585 const_offset % 16 / 4);
1586
1587 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1588 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1589 emit(CMP(result_dst, packed_consts, src_reg(0u),
1590 BRW_CONDITIONAL_NZ));
1591 emit(AND(result_dst, result, src_reg(0x1)));
1592 } else {
1593 emit(MOV(result_dst, packed_consts));
1594 }
1595 break;
1596 }
1597
1598 case ir_triop_lrp:
1599 assert(!"not reached: should be handled by lrp_to_arith");
1600 break;
1601
1602 case ir_quadop_vector:
1603 assert(!"not reached: should be handled by lower_quadop_vector");
1604 break;
1605
1606 case ir_unop_pack_half_2x16:
1607 emit_pack_half_2x16(result_dst, op[0]);
1608 break;
1609 case ir_unop_unpack_half_2x16:
1610 emit_unpack_half_2x16(result_dst, op[0]);
1611 break;
1612 case ir_unop_pack_snorm_2x16:
1613 case ir_unop_pack_snorm_4x8:
1614 case ir_unop_pack_unorm_2x16:
1615 case ir_unop_pack_unorm_4x8:
1616 case ir_unop_unpack_snorm_2x16:
1617 case ir_unop_unpack_snorm_4x8:
1618 case ir_unop_unpack_unorm_2x16:
1619 case ir_unop_unpack_unorm_4x8:
1620 assert(!"not reached: should be handled by lower_packing_builtins");
1621 break;
1622 case ir_unop_unpack_half_2x16_split_x:
1623 case ir_unop_unpack_half_2x16_split_y:
1624 case ir_binop_pack_half_2x16_split:
1625 assert(!"not reached: should not occur in vertex shader");
1626 break;
1627 }
1628 }
1629
1630
1631 void
1632 vec4_visitor::visit(ir_swizzle *ir)
1633 {
1634 src_reg src;
1635 int i = 0;
1636 int swizzle[4];
1637
1638 /* Note that this is only swizzles in expressions, not those on the left
1639 * hand side of an assignment, which do write masking. See ir_assignment
1640 * for that.
1641 */
1642
1643 ir->val->accept(this);
1644 src = this->result;
1645 assert(src.file != BAD_FILE);
1646
1647 for (i = 0; i < ir->type->vector_elements; i++) {
1648 switch (i) {
1649 case 0:
1650 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1651 break;
1652 case 1:
1653 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1654 break;
1655 case 2:
1656 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1657 break;
1658 case 3:
1659 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1660 break;
1661 }
1662 }
1663 for (; i < 4; i++) {
1664 /* Replicate the last channel out. */
1665 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1666 }
1667
1668 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1669
1670 this->result = src;
1671 }
1672
1673 void
1674 vec4_visitor::visit(ir_dereference_variable *ir)
1675 {
1676 const struct glsl_type *type = ir->type;
1677 dst_reg *reg = variable_storage(ir->var);
1678
1679 if (!reg) {
1680 fail("Failed to find variable storage for %s\n", ir->var->name);
1681 this->result = src_reg(brw_null_reg());
1682 return;
1683 }
1684
1685 this->result = src_reg(*reg);
1686
1687 /* System values get their swizzle from the dst_reg writemask */
1688 if (ir->var->mode == ir_var_system_value)
1689 return;
1690
1691 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1692 this->result.swizzle = swizzle_for_size(type->vector_elements);
1693 }
1694
1695 void
1696 vec4_visitor::visit(ir_dereference_array *ir)
1697 {
1698 ir_constant *constant_index;
1699 src_reg src;
1700 int element_size = type_size(ir->type);
1701
1702 constant_index = ir->array_index->constant_expression_value();
1703
1704 ir->array->accept(this);
1705 src = this->result;
1706
1707 if (constant_index) {
1708 src.reg_offset += constant_index->value.i[0] * element_size;
1709 } else {
1710 /* Variable index array dereference. It eats the "vec4" of the
1711 * base of the array and an index that offsets the Mesa register
1712 * index.
1713 */
1714 ir->array_index->accept(this);
1715
1716 src_reg index_reg;
1717
1718 if (element_size == 1) {
1719 index_reg = this->result;
1720 } else {
1721 index_reg = src_reg(this, glsl_type::int_type);
1722
1723 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1724 }
1725
1726 if (src.reladdr) {
1727 src_reg temp = src_reg(this, glsl_type::int_type);
1728
1729 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1730
1731 index_reg = temp;
1732 }
1733
1734 src.reladdr = ralloc(mem_ctx, src_reg);
1735 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1736 }
1737
1738 /* If the type is smaller than a vec4, replicate the last channel out. */
1739 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1740 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1741 else
1742 src.swizzle = BRW_SWIZZLE_NOOP;
1743 src.type = brw_type_for_base_type(ir->type);
1744
1745 this->result = src;
1746 }
1747
1748 void
1749 vec4_visitor::visit(ir_dereference_record *ir)
1750 {
1751 unsigned int i;
1752 const glsl_type *struct_type = ir->record->type;
1753 int offset = 0;
1754
1755 ir->record->accept(this);
1756
1757 for (i = 0; i < struct_type->length; i++) {
1758 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1759 break;
1760 offset += type_size(struct_type->fields.structure[i].type);
1761 }
1762
1763 /* If the type is smaller than a vec4, replicate the last channel out. */
1764 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1765 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1766 else
1767 this->result.swizzle = BRW_SWIZZLE_NOOP;
1768 this->result.type = brw_type_for_base_type(ir->type);
1769
1770 this->result.reg_offset += offset;
1771 }
1772
1773 /**
1774 * We want to be careful in assignment setup to hit the actual storage
1775 * instead of potentially using a temporary like we might with the
1776 * ir_dereference handler.
1777 */
1778 static dst_reg
1779 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1780 {
1781 /* The LHS must be a dereference. If the LHS is a variable indexed array
1782 * access of a vector, it must be separated into a series of conditional moves
1783 * before reaching this point (see ir_vec_index_to_cond_assign).
1784 */
1785 assert(ir->as_dereference());
1786 ir_dereference_array *deref_array = ir->as_dereference_array();
1787 if (deref_array) {
1788 assert(!deref_array->array->type->is_vector());
1789 }
1790
1791 /* Use the rvalue deref handler for the most part. We'll ignore
1792 * swizzles in it and write swizzles using writemask, though.
1793 */
1794 ir->accept(v);
1795 return dst_reg(v->result);
1796 }
1797
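/* Copy a whole value that may span several registers: structs, arrays and
 * matrices are decomposed recursively until we reach scalars/vectors, and
 * each step emits one (optionally predicated) MOV and advances both
 * reg_offsets. Copying a mat3, for instance, ends up as three vec3 MOVs.
 */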
1798 void
1799 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1800 const struct glsl_type *type, uint32_t predicate)
1801 {
1802 if (type->base_type == GLSL_TYPE_STRUCT) {
1803 for (unsigned int i = 0; i < type->length; i++) {
1804 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1805 }
1806 return;
1807 }
1808
1809 if (type->is_array()) {
1810 for (unsigned int i = 0; i < type->length; i++) {
1811 emit_block_move(dst, src, type->fields.array, predicate);
1812 }
1813 return;
1814 }
1815
1816 if (type->is_matrix()) {
1817 const struct glsl_type *vec_type;
1818
1819 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1820 type->vector_elements, 1);
1821
1822 for (int i = 0; i < type->matrix_columns; i++) {
1823 emit_block_move(dst, src, vec_type, predicate);
1824 }
1825 return;
1826 }
1827
1828 assert(type->is_scalar() || type->is_vector());
1829
1830 dst->type = brw_type_for_base_type(type);
1831 src->type = dst->type;
1832
1833 dst->writemask = (1 << type->vector_elements) - 1;
1834
1835 src->swizzle = swizzle_for_size(type->vector_elements);
1836
1837 vec4_instruction *inst = emit(MOV(*dst, *src));
1838 inst->predicate = predicate;
1839
1840 dst->reg_offset++;
1841 src->reg_offset++;
1842 }
1843
1844
1845 /* If the RHS processing resulted in an instruction generating a
1846 * temporary value, and it would be easy to rewrite the instruction to
1847 * generate its result right into the LHS instead, do so. This ends
1848 * up reliably removing instructions where it can be tricky to do so
1849 * later without real UD chain information.
1850 */
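/* For example, "ADD temp, a, b" followed by what would be "MOV dst, temp"
 * gets rewritten so the ADD writes dst directly and the MOV is never
 * emitted.
 */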
1851 bool
1852 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1853 dst_reg dst,
1854 src_reg src,
1855 vec4_instruction *pre_rhs_inst,
1856 vec4_instruction *last_rhs_inst)
1857 {
1858 /* This could be supported, but it would take more smarts. */
1859 if (ir->condition)
1860 return false;
1861
1862 if (pre_rhs_inst == last_rhs_inst)
1863 return false; /* No instructions generated to work with. */
1864
1865 /* Make sure the last instruction generated our source reg. */
1866 if (src.file != GRF ||
1867 src.file != last_rhs_inst->dst.file ||
1868 src.reg != last_rhs_inst->dst.reg ||
1869 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1870 src.reladdr ||
1871 src.abs ||
1872 src.negate ||
1873 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1874 return false;
1875
1876 /* Check that the last instruction fully initialized the channels
1877 * we want to use, in the order we want to use them. We could
1878 * potentially reswizzle the operands of many instructions so that
1879 * we could handle out of order channels, but don't yet.
1880 */
1881
1882 for (unsigned i = 0; i < 4; i++) {
1883 if (dst.writemask & (1 << i)) {
1884 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1885 return false;
1886
1887 if (BRW_GET_SWZ(src.swizzle, i) != i)
1888 return false;
1889 }
1890 }
1891
1892 /* Success! Rewrite the instruction. */
1893 last_rhs_inst->dst.file = dst.file;
1894 last_rhs_inst->dst.reg = dst.reg;
1895 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1896 last_rhs_inst->dst.reladdr = dst.reladdr;
1897 last_rhs_inst->dst.writemask &= dst.writemask;
1898
1899 return true;
1900 }
1901
1902 void
1903 vec4_visitor::visit(ir_assignment *ir)
1904 {
1905 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1906 uint32_t predicate = BRW_PREDICATE_NONE;
1907
1908 if (!ir->lhs->type->is_scalar() &&
1909 !ir->lhs->type->is_vector()) {
1910 ir->rhs->accept(this);
1911 src_reg src = this->result;
1912
1913 if (ir->condition) {
1914 emit_bool_to_cond_code(ir->condition, &predicate);
1915 }
1916
1917 /* emit_block_move doesn't account for swizzles in the source register.
1918 * This should be ok, since the source register is a structure or an
1919 * array, and those can't be swizzled. But double-check to be sure.
1920 */
1921 assert(src.swizzle ==
1922 (ir->rhs->type->is_matrix()
1923 ? swizzle_for_size(ir->rhs->type->vector_elements)
1924 : BRW_SWIZZLE_NOOP));
1925
1926 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1927 return;
1928 }
1929
1930 /* Now we're down to just a scalar/vector with writemasks. */
1931 int i;
1932
1933 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1934 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1935
1936 ir->rhs->accept(this);
1937
1938 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1939
1940 src_reg src = this->result;
1941
1942 int swizzles[4];
1943 int first_enabled_chan = 0;
1944 int src_chan = 0;
1945
1946 assert(ir->lhs->type->is_vector() ||
1947 ir->lhs->type->is_scalar());
1948 dst.writemask = ir->write_mask;
1949
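   /* Find the swizzle component of the first written channel; it is used
    * below to fill the swizzle slots of channels that aren't written.
    */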
1950 for (int i = 0; i < 4; i++) {
1951 if (dst.writemask & (1 << i)) {
1952 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1953 break;
1954 }
1955 }
1956
1957 /* Swizzle a small RHS vector into the channels being written.
1958 *
1959    * GLSL IR treats write_mask as dictating how many channels are present
1960    * on the RHS, while our instructions need those channels to appear in
1961    * the slots of the vec4 they're written to.
1962 */
1963 for (int i = 0; i < 4; i++) {
1964 if (dst.writemask & (1 << i))
1965 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1966 else
1967 swizzles[i] = first_enabled_chan;
1968 }
1969 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1970 swizzles[2], swizzles[3]);
1971
1972 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1973 return;
1974 }
1975
1976 if (ir->condition) {
1977 emit_bool_to_cond_code(ir->condition, &predicate);
1978 }
1979
1980 for (i = 0; i < type_size(ir->lhs->type); i++) {
1981 vec4_instruction *inst = emit(MOV(dst, src));
1982 inst->predicate = predicate;
1983
1984 dst.reg_offset++;
1985 src.reg_offset++;
1986 }
1987 }
1988
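/**
 * Emit immediate MOVs for a constant, recursing through structures, arrays,
 * and matrices; dst->reg_offset advances by one per vec4 written, and
 * components with identical values are merged into a single MOV.
 */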
1989 void
1990 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1991 {
1992 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1993 foreach_list(node, &ir->components) {
1994 ir_constant *field_value = (ir_constant *)node;
1995
1996 emit_constant_values(dst, field_value);
1997 }
1998 return;
1999 }
2000
2001 if (ir->type->is_array()) {
2002 for (unsigned int i = 0; i < ir->type->length; i++) {
2003 emit_constant_values(dst, ir->array_elements[i]);
2004 }
2005 return;
2006 }
2007
2008 if (ir->type->is_matrix()) {
2009 for (int i = 0; i < ir->type->matrix_columns; i++) {
2010 float *vec = &ir->value.f[i * ir->type->vector_elements];
2011
2012 for (int j = 0; j < ir->type->vector_elements; j++) {
2013 dst->writemask = 1 << j;
2014 dst->type = BRW_REGISTER_TYPE_F;
2015
2016 emit(MOV(*dst, src_reg(vec[j])));
2017 }
2018 dst->reg_offset++;
2019 }
2020 return;
2021 }
2022
2023 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2024
2025 for (int i = 0; i < ir->type->vector_elements; i++) {
2026 if (!(remaining_writemask & (1 << i)))
2027 continue;
2028
2029 dst->writemask = 1 << i;
2030 dst->type = brw_type_for_base_type(ir->type);
2031
2032 /* Find other components that match the one we're about to
2033 * write. Emits fewer instructions for things like vec4(0.5,
2034 * 1.5, 1.5, 1.5).
2035 */
2036 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2037 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2038 if (ir->value.b[i] == ir->value.b[j])
2039 dst->writemask |= (1 << j);
2040 } else {
2041 /* u, i, and f storage all line up, so no need for a
2042 * switch case for comparing each type.
2043 */
2044 if (ir->value.u[i] == ir->value.u[j])
2045 dst->writemask |= (1 << j);
2046 }
2047 }
2048
2049 switch (ir->type->base_type) {
2050 case GLSL_TYPE_FLOAT:
2051 emit(MOV(*dst, src_reg(ir->value.f[i])));
2052 break;
2053 case GLSL_TYPE_INT:
2054 emit(MOV(*dst, src_reg(ir->value.i[i])));
2055 break;
2056 case GLSL_TYPE_UINT:
2057 emit(MOV(*dst, src_reg(ir->value.u[i])));
2058 break;
2059 case GLSL_TYPE_BOOL:
2060 emit(MOV(*dst, src_reg(ir->value.b[i])));
2061 break;
2062 default:
2063 assert(!"Non-float/uint/int/bool constant");
2064 break;
2065 }
2066
2067 remaining_writemask &= ~dst->writemask;
2068 }
2069 dst->reg_offset++;
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_constant *ir)
2074 {
2075 dst_reg dst = dst_reg(this, ir->type);
2076 this->result = src_reg(dst);
2077
2078 emit_constant_values(&dst, ir);
2079 }
2080
2081 void
2082 vec4_visitor::visit(ir_call *ir)
2083 {
2084 assert(!"not reached");
2085 }
2086
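/**
 * Visit a texture operation: evaluate the operands, build the sampler
 * message payload in MRFs starting at base_mrf, emit the sampler send, and
 * then apply the key's texture swizzle to the result (swizzle_result).
 */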
2087 void
2088 vec4_visitor::visit(ir_texture *ir)
2089 {
2090 int sampler =
2091 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2092
2093 /* Should be lowered by do_lower_texture_projection */
2094 assert(!ir->projector);
2095
2096 /* Generate code to compute all the subexpression trees. This has to be
2097 * done before loading any values into MRFs for the sampler message since
2098 * generating these values may involve SEND messages that need the MRFs.
2099 */
2100 src_reg coordinate;
2101 if (ir->coordinate) {
2102 ir->coordinate->accept(this);
2103 coordinate = this->result;
2104 }
2105
2106 src_reg shadow_comparitor;
2107 if (ir->shadow_comparitor) {
2108 ir->shadow_comparitor->accept(this);
2109 shadow_comparitor = this->result;
2110 }
2111
2112 const glsl_type *lod_type, *sample_index_type;
2113 src_reg lod, dPdx, dPdy, sample_index;
2114 switch (ir->op) {
2115 case ir_tex:
2116 lod = src_reg(0.0f);
2117 lod_type = glsl_type::float_type;
2118 break;
2119 case ir_txf:
2120 case ir_txl:
2121 case ir_txs:
2122 ir->lod_info.lod->accept(this);
2123 lod = this->result;
2124 lod_type = ir->lod_info.lod->type;
2125 break;
2126 case ir_txf_ms:
2127 ir->lod_info.sample_index->accept(this);
2128 sample_index = this->result;
2129 sample_index_type = ir->lod_info.sample_index->type;
2130 break;
2131 case ir_txd:
2132 ir->lod_info.grad.dPdx->accept(this);
2133 dPdx = this->result;
2134
2135 ir->lod_info.grad.dPdy->accept(this);
2136 dPdy = this->result;
2137
2138 lod_type = ir->lod_info.grad.dPdx->type;
2139 break;
2140 case ir_txb:
2141 case ir_lod:
2142 break;
2143 }
2144
2145 vec4_instruction *inst = NULL;
2146 switch (ir->op) {
2147 case ir_tex:
2148 case ir_txl:
2149 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2150 break;
2151 case ir_txd:
2152 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2153 break;
2154 case ir_txf:
2155 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2156 break;
2157 case ir_txf_ms:
2158 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2159 break;
2160 case ir_txs:
2161 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2162 break;
2163 case ir_txb:
2164 assert(!"TXB is not valid for vertex shaders.");
2165 break;
2166 case ir_lod:
2167 assert(!"LOD is not valid for vertex shaders.");
2168 break;
2169 }
2170
2171 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2172
2173 /* Texel offsets go in the message header; Gen4 also requires headers. */
2174 inst->header_present = use_texture_offset || intel->gen < 5;
2175 inst->base_mrf = 2;
2176 inst->mlen = inst->header_present + 1; /* always at least one */
2177 inst->sampler = sampler;
2178 inst->dst = dst_reg(this, ir->type);
2179 inst->dst.writemask = WRITEMASK_XYZW;
2180 inst->shadow_compare = ir->shadow_comparitor != NULL;
2181
2182 if (use_texture_offset)
2183 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2184
2185 /* MRF for the first parameter */
2186 int param_base = inst->base_mrf + inst->header_present;
2187
2188 if (ir->op == ir_txs) {
2189 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2190 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2191 } else {
2192 int i, coord_mask = 0, zero_mask = 0;
2193 /* Load the coordinate */
2194 /* FINISHME: gl_clamp_mask and saturate */
2195 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2196 coord_mask |= (1 << i);
2197 for (; i < 4; i++)
2198 zero_mask |= (1 << i);
2199
2200 if (ir->offset && ir->op == ir_txf) {
2201 /* It appears that the ld instruction used for txf does its
2202 * address bounds check before adding in the offset. To work
2203 * around this, just add the integer offset to the integer
2204 * texel coordinate, and don't put the offset in the header.
2205 */
2206 ir_constant *offset = ir->offset->as_constant();
2207 assert(offset);
2208
2209 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2210 src_reg src = coordinate;
2211 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2212 BRW_GET_SWZ(src.swizzle, j),
2213 BRW_GET_SWZ(src.swizzle, j),
2214 BRW_GET_SWZ(src.swizzle, j));
2215 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2216 src, offset->value.i[j]));
2217 }
2218 } else {
2219 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2220 coordinate));
2221 }
2222 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2223 src_reg(0)));
2224 /* Load the shadow comparitor */
2225 if (ir->shadow_comparitor) {
2226 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2227 WRITEMASK_X),
2228 shadow_comparitor));
2229 inst->mlen++;
2230 }
2231
2232 /* Load the LOD info */
2233 if (ir->op == ir_tex || ir->op == ir_txl) {
2234 int mrf, writemask;
2235 if (intel->gen >= 5) {
2236 mrf = param_base + 1;
2237 if (ir->shadow_comparitor) {
2238 writemask = WRITEMASK_Y;
2239 /* mlen already incremented */
2240 } else {
2241 writemask = WRITEMASK_X;
2242 inst->mlen++;
2243 }
2244 } else /* intel->gen == 4 */ {
2245 mrf = param_base;
2246 writemask = WRITEMASK_Z;
2247 }
2248 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2249 } else if (ir->op == ir_txf) {
2250 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2251 } else if (ir->op == ir_txf_ms) {
2252 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2253 sample_index));
2254 inst->mlen++;
2255
2256      /* On Gen7 there is an additional MCS parameter here after SI, but
2257       * we don't bother to emit it since it's always zero.  If we start
2258       * supporting texturing from CMS surfaces, this will have to
2259       * change.
2260       */
2261 } else if (ir->op == ir_txd) {
2262 const glsl_type *type = lod_type;
2263
2264 if (intel->gen >= 5) {
2265 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2266 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2267 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2268 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2269 inst->mlen++;
2270
2271 if (ir->type->vector_elements == 3) {
2272 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2273 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2274 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2275 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2276 inst->mlen++;
2277 }
2278 } else /* intel->gen == 4 */ {
2279 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2280 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2281 inst->mlen += 2;
2282 }
2283 }
2284 }
2285
2286 emit(inst);
2287
2288    /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2289     * faces * layers; the spec requires just layers.
2290     */
2291 if (ir->op == ir_txs) {
2292 glsl_type const *type = ir->sampler->type;
2293 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2294 type->sampler_array) {
2295 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2296 with_writemask(inst->dst, WRITEMASK_Z),
2297 src_reg(inst->dst), src_reg(6));
2298 }
2299 }
2300
2301 swizzle_result(ir, src_reg(inst->dst), sampler);
2302 }
2303
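/**
 * Apply the per-sampler texture swizzle from the program key to the raw
 * sampler result: copied channels get a swizzled MOV, and SWIZZLE_ZERO /
 * SWIZZLE_ONE channels are written as 0.0f / 1.0f.
 */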
2304 void
2305 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2306 {
2307 int s = c->key.base.tex.swizzles[sampler];
2308
2309 this->result = src_reg(this, ir->type);
2310 dst_reg swizzled_result(this->result);
2311
2312 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2313 || s == SWIZZLE_NOOP) {
2314 emit(MOV(swizzled_result, orig_val));
2315 return;
2316 }
2317
2318 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2319 int swizzle[4];
2320
2321 for (int i = 0; i < 4; i++) {
2322 switch (GET_SWZ(s, i)) {
2323 case SWIZZLE_ZERO:
2324 zero_mask |= (1 << i);
2325 break;
2326 case SWIZZLE_ONE:
2327 one_mask |= (1 << i);
2328 break;
2329 default:
2330 copy_mask |= (1 << i);
2331 swizzle[i] = GET_SWZ(s, i);
2332 break;
2333 }
2334 }
2335
2336 if (copy_mask) {
2337 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2338 swizzled_result.writemask = copy_mask;
2339 emit(MOV(swizzled_result, orig_val));
2340 }
2341
2342 if (zero_mask) {
2343 swizzled_result.writemask = zero_mask;
2344 emit(MOV(swizzled_result, src_reg(0.0f)));
2345 }
2346
2347 if (one_mask) {
2348 swizzled_result.writemask = one_mask;
2349 emit(MOV(swizzled_result, src_reg(1.0f)));
2350 }
2351 }
2352
2353 void
2354 vec4_visitor::visit(ir_return *ir)
2355 {
2356 assert(!"not reached");
2357 }
2358
2359 void
2360 vec4_visitor::visit(ir_discard *ir)
2361 {
2362 assert(!"not reached");
2363 }
2364
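/**
 * Emit control flow for an if statement: Gen6 uses emit_if_gen6(), while
 * other generations compute a predicate from the condition and emit a
 * predicated IF.
 */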
2365 void
2366 vec4_visitor::visit(ir_if *ir)
2367 {
2368 /* Don't point the annotation at the if statement, because then it plus
2369 * the then and else blocks get printed.
2370 */
2371 this->base_ir = ir->condition;
2372
2373 if (intel->gen == 6) {
2374 emit_if_gen6(ir);
2375 } else {
2376 uint32_t predicate;
2377 emit_bool_to_cond_code(ir->condition, &predicate);
2378 emit(IF(predicate));
2379 }
2380
2381 visit_instructions(&ir->then_instructions);
2382
2383 if (!ir->else_instructions.is_empty()) {
2384 this->base_ir = ir->condition;
2385 emit(BRW_OPCODE_ELSE);
2386
2387 visit_instructions(&ir->else_instructions);
2388 }
2389
2390 this->base_ir = ir->condition;
2391 emit(BRW_OPCODE_ENDIF);
2392 }
2393
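/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position; only used
 * on Gen4/5 (see emit_thread_end, which calls this when intel->gen < 6).
 */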
2394 void
2395 vec4_visitor::emit_ndc_computation()
2396 {
2397 /* Get the position */
2398 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2399
2400 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2401 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2402 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2403
2404 current_annotation = "NDC";
2405 dst_reg ndc_w = ndc;
2406 ndc_w.writemask = WRITEMASK_W;
2407 src_reg pos_w = pos;
2408 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2409 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2410
2411 dst_reg ndc_xyz = ndc;
2412 ndc_xyz.writemask = WRITEMASK_XYZ;
2413
2414 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2415 }
2416
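/**
 * Write the VUE header slot holding the point size and clip flags.  On
 * pre-Gen6 this packs the point size, user clip plane flags, and the
 * negative-RHW workaround bit into one register; on Gen6+ only the point
 * size (if written) lands in the W channel.
 */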
2417 void
2418 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2419 {
2420 if (intel->gen < 6 &&
2421 ((prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2422 c->key.base.userclip_active || brw->has_negative_rhw_bug)) {
2423 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2424 dst_reg header1_w = header1;
2425 header1_w.writemask = WRITEMASK_W;
2426 GLuint i;
2427
2428 emit(MOV(header1, 0u));
2429
2430 if (prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) {
2431 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2432
2433 current_annotation = "Point size";
2434 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2435 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2436 }
2437
2438 current_annotation = "Clipping flags";
2439 for (i = 0; i < c->key.base.nr_userclip_plane_consts; i++) {
2440 vec4_instruction *inst;
2441
2442 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2443 src_reg(this->userplane[i])));
2444 inst->conditional_mod = BRW_CONDITIONAL_L;
2445
2446 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2447 inst->predicate = BRW_PREDICATE_NORMAL;
2448 }
2449
2450       /* i965 clipping workaround:
2451        * 1) Test for a negative RHW (1/w).
2452        * 2) If it is negative:
2453        *      set ndc = (0, 0, 0, 0)
2454        *      set ucp[6] = 1
2455        *
2456        * Later, clipping will detect ucp[6] and ensure the primitive is
2457        * clipped against all fixed planes.
2458        */
2459 if (brw->has_negative_rhw_bug) {
2460 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2461 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2462 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2463 vec4_instruction *inst;
2464 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2465 inst->predicate = BRW_PREDICATE_NORMAL;
2466 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2467 inst->predicate = BRW_PREDICATE_NORMAL;
2468 }
2469
2470 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2471 } else if (intel->gen < 6) {
2472 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2473 } else {
2474 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2475 if (prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) {
2476 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2477 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2478 }
2479 }
2480 }
2481
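/**
 * Write up to four clip distances into @reg, starting at plane @offset, by
 * taking DP4s of the clip vertex (gl_ClipVertex if written, otherwise
 * gl_Position) with the user clip planes.
 */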
2482 void
2483 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2484 {
2485 if (intel->gen < 6) {
2486 /* Clip distance slots are set aside in gen5, but they are not used. It
2487 * is not clear whether we actually need to set aside space for them,
2488 * but the performance cost is negligible.
2489 */
2490 return;
2491 }
2492
2493 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2494 *
2495 * "If a linked set of shaders forming the vertex stage contains no
2496 * static write to gl_ClipVertex or gl_ClipDistance, but the
2497 * application has requested clipping against user clip planes through
2498 * the API, then the coordinate written to gl_Position is used for
2499 * comparison against the user clip planes."
2500 *
2501 * This function is only called if the shader didn't write to
2502 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2503 * if the user wrote to it; otherwise we use gl_Position.
2504 */
2505 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2506 if (!(prog_data->base.vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2507 clip_vertex = VARYING_SLOT_POS;
2508 }
2509
2510 for (int i = 0; i + offset < c->key.base.nr_userclip_plane_consts && i < 4;
2511 ++i) {
2512 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2513 src_reg(output_reg[clip_vertex]),
2514 src_reg(this->userplane[i + offset])));
2515 }
2516 }
2517
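/**
 * Copy a generic varying from its output register into the URB slot,
 * saturating the front/back color varyings when vertex color clamping is
 * enabled.
 */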
2518 void
2519 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2520 {
2521    assert(varying < VARYING_SLOT_MAX);
2522 reg.type = output_reg[varying].type;
2523 current_annotation = output_reg_annotation[varying];
2524 /* Copy the register, saturating if necessary */
2525 vec4_instruction *inst = emit(MOV(reg,
2526 src_reg(output_reg[varying])));
2527 if ((varying == VARYING_SLOT_COL0 ||
2528 varying == VARYING_SLOT_COL1 ||
2529 varying == VARYING_SLOT_BFC0 ||
2530 varying == VARYING_SLOT_BFC1) &&
2531 c->key.base.clamp_vertex_color) {
2532 inst->saturate = true;
2533 }
2534 }
2535
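/**
 * Write one VUE slot to MRF @mrf, handling the slots with special packing
 * (PSIZ/flags, NDC, position, clip distances, edge flag) and falling back
 * to emit_generic_urb_slot() for everything else.
 */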
2536 void
2537 vec4_visitor::emit_urb_slot(int mrf, int varying)
2538 {
2539 struct brw_reg hw_reg = brw_message_reg(mrf);
2540 dst_reg reg = dst_reg(MRF, mrf);
2541 reg.type = BRW_REGISTER_TYPE_F;
2542
2543 switch (varying) {
2544 case VARYING_SLOT_PSIZ:
2545 /* PSIZ is always in slot 0, and is coupled with other flags. */
2546 current_annotation = "indices, point width, clip flags";
2547 emit_psiz_and_flags(hw_reg);
2548 break;
2549 case BRW_VARYING_SLOT_NDC:
2550 current_annotation = "NDC";
2551 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2552 break;
2553 case BRW_VARYING_SLOT_POS_DUPLICATE:
2554 case VARYING_SLOT_POS:
2555 current_annotation = "gl_Position";
2556 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2557 break;
2558 case VARYING_SLOT_CLIP_DIST0:
2559 case VARYING_SLOT_CLIP_DIST1:
2560 if (this->c->key.base.uses_clip_distance) {
2561 emit_generic_urb_slot(reg, varying);
2562 } else {
2563 current_annotation = "user clip distances";
2564 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2565 }
2566 break;
2567 case VARYING_SLOT_EDGE:
2568       /* This is present when doing unfilled polygons.  We're supposed to copy
2569        * the edge flag from the user-provided vertex array
2570        * (glEdgeFlagPointer); otherwise we copy the current value of that
2571        * attribute (which starts as 1.0f).  Clipping then uses this value to
2572        * determine which edges should be drawn as wireframe.
2573        */
2574 current_annotation = "edge flag";
2575 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2576 glsl_type::float_type, WRITEMASK_XYZW))));
2577 break;
2578 case BRW_VARYING_SLOT_PAD:
2579 /* No need to write to this slot */
2580 break;
2581 default:
2582 emit_generic_urb_slot(reg, varying);
2583 break;
2584 }
2585 }
2586
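/**
 * Pad an interleaved URB write's message length on Gen6+: mlen includes the
 * one header register, so the data length (mlen - 1) must stay a multiple
 * of two registers; bumping an even mlen to odd keeps it that way.
 */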
2587 static int
2588 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2589 {
2590 struct intel_context *intel = &brw->intel;
2591
2592 if (intel->gen >= 6) {
2593 /* URB data written (does not include the message header reg) must
2594 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2595 * section 5.4.3.2.2: URB_INTERLEAVED.
2596 *
2597        * URB entries are allocated in multiples of 1024 bits, so an
2598 * extra 128 bits written here to make the end align to 256 is
2599 * no problem.
2600 */
2601 if ((mlen % 2) != 1)
2602 mlen++;
2603 }
2604
2605 return mlen;
2606 }
2607
2608 /**
2609 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2610 * complete the VS thread.
2611 *
2612 * The VUE layout is documented in Volume 2a.
2613 */
2614 void
2615 vec4_vs_visitor::emit_thread_end()
2616 {
2617 /* MRF 0 is reserved for the debugger, so start with message header
2618 * in MRF 1.
2619 */
2620 int base_mrf = 1;
2621 int mrf = base_mrf;
2622 /* In the process of generating our URB write message contents, we
2623 * may need to unspill a register or load from an array. Those
2624 * reads would use MRFs 14-15.
2625 */
2626 int max_usable_mrf = 13;
2627
2628    /* The following assertion verifies that max_usable_mrf causes an
2629     * even number of URB write data registers, which meets Gen6's
2630     * requirements for length alignment.
2631 */
2632    assert((max_usable_mrf - base_mrf) % 2 == 0);
2633
2634 /* First mrf is the g0-based message header containing URB handles and such,
2635 * which is implied in VS_OPCODE_URB_WRITE.
2636 */
2637 mrf++;
2638
2639 if (intel->gen < 6) {
2640 emit_ndc_computation();
2641 }
2642
2643 /* Set up the VUE data for the first URB write */
2644 int slot;
2645 for (slot = 0; slot < prog_data->base.vue_map.num_slots; ++slot) {
2646 emit_urb_slot(mrf++, prog_data->base.vue_map.slot_to_varying[slot]);
2647
2648 /* If this was max_usable_mrf, we can't fit anything more into this URB
2649 * WRITE.
2650 */
2651 if (mrf > max_usable_mrf) {
2652 slot++;
2653 break;
2654 }
2655 }
2656
2657 bool eot = slot >= prog_data->base.vue_map.num_slots;
2658 if (eot) {
2659 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2660 emit_shader_time_end();
2661 }
2662 current_annotation = "URB write";
2663 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2664 inst->base_mrf = base_mrf;
2665 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2666 inst->eot = eot;
2667
2668 /* Optional second URB write */
2669 if (!inst->eot) {
2670 mrf = base_mrf + 1;
2671
2672 for (; slot < prog_data->base.vue_map.num_slots; ++slot) {
2673 assert(mrf < max_usable_mrf);
2674
2675 emit_urb_slot(mrf++, prog_data->base.vue_map.slot_to_varying[slot]);
2676 }
2677
2678 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2679 emit_shader_time_end();
2680
2681 current_annotation = "URB write";
2682 inst = emit(VS_OPCODE_URB_WRITE);
2683 inst->base_mrf = base_mrf;
2684 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2685 inst->eot = true;
2686 /* URB destination offset. In the previous write, we got MRFs
2687 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2688 * URB row increments, and each of our MRFs is half of one of
2689 * those, since we're doing interleaved writes.
2690 */
2691 inst->offset = (max_usable_mrf - base_mrf) / 2;
2692 }
2693 }
2694
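/**
 * Compute the scratch-space offset for @reg_offset, emitting the address
 * math before @inst when the access uses a relative (reladdr) index, or
 * returning an immediate offset otherwise.
 */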
2695 src_reg
2696 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2697 src_reg *reladdr, int reg_offset)
2698 {
2699 /* Because we store the values to scratch interleaved like our
2700 * vertex data, we need to scale the vec4 index by 2.
2701 */
2702 int message_header_scale = 2;
2703
2704 /* Pre-gen6, the message header uses byte offsets instead of vec4
2705 * (16-byte) offset units.
2706 */
2707 if (intel->gen < 6)
2708 message_header_scale *= 16;
2709
2710 if (reladdr) {
2711 src_reg index = src_reg(this, glsl_type::int_type);
2712
2713 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2714 emit_before(inst, MUL(dst_reg(index),
2715 index, src_reg(message_header_scale)));
2716
2717 return index;
2718 } else {
2719 return src_reg(reg_offset * message_header_scale);
2720 }
2721 }
2722
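/**
 * Like get_scratch_offset(), but for pull constant loads: no interleaving
 * scale is applied, and pre-Gen6 the offset is in byte units rather than
 * vec4 units.
 */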
2723 src_reg
2724 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2725 src_reg *reladdr, int reg_offset)
2726 {
2727 if (reladdr) {
2728 src_reg index = src_reg(this, glsl_type::int_type);
2729
2730 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2731
2732 /* Pre-gen6, the message header uses byte offsets instead of vec4
2733 * (16-byte) offset units.
2734 */
2735 if (intel->gen < 6) {
2736 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2737 }
2738
2739 return index;
2740 } else {
2741 int message_header_scale = intel->gen < 6 ? 16 : 1;
2742 return src_reg(reg_offset * message_header_scale);
2743 }
2744 }
2745
2746 /**
2747 * Emits an instruction before @inst to load the value named by @orig_src
2748 * from scratch space at @base_offset to @temp.
2749 *
2750 * @base_offset is measured in 32-byte units (the size of a register).
2751 */
2752 void
2753 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2754 dst_reg temp, src_reg orig_src,
2755 int base_offset)
2756 {
2757 int reg_offset = base_offset + orig_src.reg_offset;
2758 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2759
2760 emit_before(inst, SCRATCH_READ(temp, index));
2761 }
2762
2763 /**
2764 * Emits an instruction after @inst to store the value to be written
2765 * to @orig_dst to scratch space at @base_offset, from @temp.
2766 *
2767 * @base_offset is measured in 32-byte units (the size of a register).
2768 */
2769 void
2770 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2771 {
2772 int reg_offset = base_offset + inst->dst.reg_offset;
2773 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2774
2775 /* Create a temporary register to store *inst's result in.
2776 *
2777 * We have to be careful in MOVing from our temporary result register in
2778 * the scratch write. If we swizzle from channels of the temporary that
2779 * weren't initialized, it will confuse live interval analysis, which will
2780 * make spilling fail to make progress.
2781 */
2782 src_reg temp = src_reg(this, glsl_type::vec4_type);
2783 temp.type = inst->dst.type;
2784 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2785 int swizzles[4];
2786 for (int i = 0; i < 4; i++)
2787 if (inst->dst.writemask & (1 << i))
2788 swizzles[i] = i;
2789 else
2790 swizzles[i] = first_writemask_chan;
2791 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2792 swizzles[2], swizzles[3]);
2793
2794 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2795 inst->dst.writemask));
2796 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2797 write->predicate = inst->predicate;
2798 write->ir = inst->ir;
2799 write->annotation = inst->annotation;
2800 inst->insert_after(write);
2801
2802 inst->dst.file = temp.file;
2803 inst->dst.reg = temp.reg;
2804 inst->dst.reg_offset = temp.reg_offset;
2805 inst->dst.reladdr = NULL;
2806 }
2807
2808 /**
2809 * We can't generally support array access in GRF space, because a
2810 * single instruction's destination can only span 2 contiguous
2811 * registers. So, we send all GRF arrays that get variable index
2812 * access to scratch space.
2813 */
2814 void
2815 vec4_visitor::move_grf_array_access_to_scratch()
2816 {
2817 int scratch_loc[this->virtual_grf_count];
2818
2819 for (int i = 0; i < this->virtual_grf_count; i++) {
2820 scratch_loc[i] = -1;
2821 }
2822
2823 /* First, calculate the set of virtual GRFs that need to be punted
2824 * to scratch due to having any array access on them, and where in
2825 * scratch.
2826 */
2827 foreach_list(node, &this->instructions) {
2828 vec4_instruction *inst = (vec4_instruction *)node;
2829
2830 if (inst->dst.file == GRF && inst->dst.reladdr &&
2831 scratch_loc[inst->dst.reg] == -1) {
2832 scratch_loc[inst->dst.reg] = c->base.last_scratch;
2833 c->base.last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2834 }
2835
2836 for (int i = 0 ; i < 3; i++) {
2837 src_reg *src = &inst->src[i];
2838
2839 if (src->file == GRF && src->reladdr &&
2840 scratch_loc[src->reg] == -1) {
2841 scratch_loc[src->reg] = c->base.last_scratch;
2842 c->base.last_scratch += this->virtual_grf_sizes[src->reg];
2843 }
2844 }
2845 }
2846
2847 /* Now, for anything that will be accessed through scratch, rewrite
2848 * it to load/store. Note that this is a _safe list walk, because
2849 * we may generate a new scratch_write instruction after the one
2850 * we're processing.
2851 */
2852 foreach_list_safe(node, &this->instructions) {
2853 vec4_instruction *inst = (vec4_instruction *)node;
2854
2855       /* Set up the annotation tracking for newly generated instructions. */
2856 base_ir = inst->ir;
2857 current_annotation = inst->annotation;
2858
2859 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2860 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2861 }
2862
2863 for (int i = 0 ; i < 3; i++) {
2864 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2865 continue;
2866
2867 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2868
2869 emit_scratch_read(inst, temp, inst->src[i],
2870 scratch_loc[inst->src[i].reg]);
2871
2872 inst->src[i].file = temp.file;
2873 inst->src[i].reg = temp.reg;
2874 inst->src[i].reg_offset = temp.reg_offset;
2875 inst->src[i].reladdr = NULL;
2876 }
2877 }
2878 }
2879
2880 /**
2881 * Emits an instruction before @inst to load the value named by @orig_src
2882 * from the pull constant buffer (surface) at @base_offset to @temp.
2883 */
2884 void
2885 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2886 dst_reg temp, src_reg orig_src,
2887 int base_offset)
2888 {
2889 int reg_offset = base_offset + orig_src.reg_offset;
2890 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2891 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2892 vec4_instruction *load;
2893
2894 if (intel->gen >= 7) {
2895 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2896 grf_offset.type = offset.type;
2897 emit_before(inst, MOV(grf_offset, offset));
2898
2899 load = new(mem_ctx) vec4_instruction(this,
2900 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2901 temp, index, src_reg(grf_offset));
2902 } else {
2903 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2904 temp, index, offset);
2905 load->base_mrf = 14;
2906 load->mlen = 1;
2907 }
2908 emit_before(inst, load);
2909 }
2910
2911 /**
2912 * Implements array access of uniforms by inserting a
2913 * PULL_CONSTANT_LOAD instruction.
2914 *
2915  * Unlike temporary GRF array access (which we don't support, due to
2916  * the difficulty of doing relative addressing on instruction
2917 * destinations), we could potentially do array access of uniforms
2918 * that were loaded in GRF space as push constants. In real-world
2919 * usage we've seen, though, the arrays being used are always larger
2920 * than we could load as push constants, so just always move all
2921 * uniform array access out to a pull constant buffer.
2922 */
2923 void
2924 vec4_visitor::move_uniform_array_access_to_pull_constants()
2925 {
2926 int pull_constant_loc[this->uniforms];
2927
2928 for (int i = 0; i < this->uniforms; i++) {
2929 pull_constant_loc[i] = -1;
2930 }
2931
2932 /* Walk through and find array access of uniforms. Put a copy of that
2933 * uniform in the pull constant buffer.
2934 *
2935 * Note that we don't move constant-indexed accesses to arrays. No
2936 * testing has been done of the performance impact of this choice.
2937 */
2938 foreach_list_safe(node, &this->instructions) {
2939 vec4_instruction *inst = (vec4_instruction *)node;
2940
2941 for (int i = 0 ; i < 3; i++) {
2942 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2943 continue;
2944
2945 int uniform = inst->src[i].reg;
2946
2947 /* If this array isn't already present in the pull constant buffer,
2948 * add it.
2949 */
2950 if (pull_constant_loc[uniform] == -1) {
2951 const float **values = &prog_data->base.param[uniform * 4];
2952
2953 pull_constant_loc[uniform] = prog_data->base.nr_pull_params / 4;
2954
2955 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2956 prog_data->base.pull_param[prog_data->base.nr_pull_params++]
2957 = values[j];
2958 }
2959 }
2960
2961          /* Set up the annotation tracking for newly generated instructions. */
2962 base_ir = inst->ir;
2963 current_annotation = inst->annotation;
2964
2965 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2966
2967 emit_pull_constant_load(inst, temp, inst->src[i],
2968 pull_constant_loc[uniform]);
2969
2970 inst->src[i].file = temp.file;
2971 inst->src[i].reg = temp.reg;
2972 inst->src[i].reg_offset = temp.reg_offset;
2973 inst->src[i].reladdr = NULL;
2974 }
2975 }
2976
2977 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2978 * no need to track them as larger-than-vec4 objects. This will be
2979 * relied on in cutting out unused uniform vectors from push
2980 * constants.
2981 */
2982 split_uniform_registers();
2983 }
2984
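/**
 * If @reg is an unsigned (UD) source with the negate modifier set, resolve
 * the negation through a MOV into a temporary so that later uses don't
 * carry a negate on a UD register.
 */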
2985 void
2986 vec4_visitor::resolve_ud_negate(src_reg *reg)
2987 {
2988 if (reg->type != BRW_REGISTER_TYPE_UD ||
2989 !reg->negate)
2990 return;
2991
2992 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2993 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2994 *reg = temp;
2995 }
2996
2997 vec4_visitor::vec4_visitor(struct brw_context *brw,
2998 struct brw_vs_compile *c,
2999 struct brw_vs_prog_data *prog_data,
3000 struct gl_shader_program *shader_prog,
3001 struct brw_shader *shader,
3002 void *mem_ctx)
3003 {
3004 this->c = c;
3005 this->brw = brw;
3006 this->intel = &brw->intel;
3007 this->ctx = &intel->ctx;
3008 this->shader_prog = shader_prog;
3009 this->shader = shader;
3010
3011 this->mem_ctx = mem_ctx;
3012 this->failed = false;
3013
3014 this->base_ir = NULL;
3015 this->current_annotation = NULL;
3016 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3017
3019 this->prog = &c->vp->program.Base;
3020 this->prog_data = prog_data;
3021
3022 this->variable_ht = hash_table_ctor(0,
3023 hash_table_pointer_hash,
3024 hash_table_pointer_compare);
3025
3026 this->virtual_grf_def = NULL;
3027 this->virtual_grf_use = NULL;
3028 this->virtual_grf_sizes = NULL;
3029 this->virtual_grf_count = 0;
3030 this->virtual_grf_reg_map = NULL;
3031 this->virtual_grf_reg_count = 0;
3032 this->virtual_grf_array_size = 0;
3033 this->live_intervals_valid = false;
3034
3035 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3036
3037 this->uniforms = 0;
3038 }
3039
3040 vec4_visitor::~vec4_visitor()
3041 {
3042 hash_table_dtor(this->variable_ht);
3043 }
3044
3045
3046 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3047 struct brw_vs_compile *c,
3048 struct brw_vs_prog_data *prog_data,
3049 struct gl_shader_program *prog,
3050 struct brw_shader *shader,
3051 void *mem_ctx)
3052 : vec4_visitor(brw, c, prog_data, prog, shader, mem_ctx)
3053 {
3054 }
3055
3056
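/**
 * Mark the compile as failed and record the (first) failure message,
 * printing it when DEBUG_VS is set in INTEL_DEBUG.
 */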
3057 void
3058 vec4_visitor::fail(const char *format, ...)
3059 {
3060 va_list va;
3061 char *msg;
3062
3063 if (failed)
3064 return;
3065
3066 failed = true;
3067
3068 va_start(va, format);
3069 msg = ralloc_vasprintf(mem_ctx, format, va);
3070 va_end(va);
3071 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3072
3073 this->fail_msg = msg;
3074
3075 if (INTEL_DEBUG & DEBUG_VS) {
3076 fprintf(stderr, "%s", msg);
3077 }
3078 }
3079
3080 } /* namespace brw */