i965/vs: split brw_vs_compile into generic and VS-specific parts.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172     * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
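   /* Illustrative example (not from the PRM): packHalf2x16(vec2(0.5, 1.0))
    * yields 0x3C003800, since half(0.5) = 0x3800 lands in the low word and
    * half(1.0) = 0x3C00 in the high word; the F32TO16/SHL/OR sequence below
    * assembles exactly that layout.
    */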
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
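   /* Illustrative example: unpackHalf2x16(0x3C003800u) yields vec2(0.5, 1.0);
    * the AND below extracts the low word (0x3800 -> 0.5) into X, the SHR
    * extracts the high word (0x3C00 -> 1.0) into Y, and F16TO32 converts both.
    */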
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
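/* Returns the number of vec4 slots occupied by a GLSL type in register or
 * uniform storage.  For example, a mat3 takes 3 slots (one per column),
 * float[4] takes 4 (one per element), and a struct holding a vec3 and a mat4
 * takes 1 + 4 = 5.
 */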
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of size of vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 case GLSL_TYPE_INTERFACE:
515 assert(0);
516 break;
517 }
518
519 return 0;
520 }
521
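/* Allocates a virtual GRF of the given size in vec4 registers, growing the
 * size and register-map bookkeeping arrays as needed, and returns the index
 * of the newly allocated virtual GRF.
 */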
522 int
523 vec4_visitor::virtual_grf_alloc(int size)
524 {
525 if (virtual_grf_array_size <= virtual_grf_count) {
526 if (virtual_grf_array_size == 0)
527 virtual_grf_array_size = 16;
528 else
529 virtual_grf_array_size *= 2;
530 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
531 virtual_grf_array_size);
532 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
533 virtual_grf_array_size);
534 }
535 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
536 virtual_grf_reg_count += size;
537 virtual_grf_sizes[virtual_grf_count] = size;
538 return virtual_grf_count++;
539 }
540
541 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
542 {
543 init();
544
545 this->file = GRF;
546 this->reg = v->virtual_grf_alloc(type_size(type));
547
548 if (type->is_array() || type->is_record()) {
549 this->swizzle = BRW_SWIZZLE_NOOP;
550 } else {
551 this->swizzle = swizzle_for_size(type->vector_elements);
552 }
553
554 this->type = brw_type_for_base_type(type);
555 }
556
557 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
558 {
559 init();
560
561 this->file = GRF;
562 this->reg = v->virtual_grf_alloc(type_size(type));
563
564 if (type->is_array() || type->is_record()) {
565 this->writemask = WRITEMASK_XYZW;
566 } else {
567 this->writemask = (1 << type->vector_elements) - 1;
568 }
569
570 this->type = brw_type_for_base_type(type);
571 }
572
573 /* Our support for uniforms is piggy-backed on the struct
574 * gl_fragment_program, because that's where the values actually
575 * get stored, rather than in some global gl_shader_program uniform
576 * store.
577 */
578 void
579 vec4_visitor::setup_uniform_values(ir_variable *ir)
580 {
581 int namelen = strlen(ir->name);
582
583 /* The data for our (non-builtin) uniforms is stored in a series of
584 * gl_uniform_driver_storage structs for each subcomponent that
585 * glGetUniformLocation() could name. We know it's been set up in the same
586 * order we'd walk the type, so walk the list of storage and find anything
587 * with our name, or the prefix of a component that starts with our name.
588 */
589 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
590 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
591
592 if (strncmp(ir->name, storage->name, namelen) != 0 ||
593 (storage->name[namelen] != 0 &&
594 storage->name[namelen] != '.' &&
595 storage->name[namelen] != '[')) {
596 continue;
597 }
598
599 gl_constant_value *components = storage->storage;
600 unsigned vector_count = (MAX2(storage->array_elements, 1) *
601 storage->type->matrix_columns);
602
603 for (unsigned s = 0; s < vector_count; s++) {
604 uniform_vector_size[uniforms] = storage->type->vector_elements;
605
606 int i;
607 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
608 prog_data->param[uniforms * 4 + i] = &components->f;
609 components++;
610 }
611 for (; i < 4; i++) {
612 static float zero = 0;
613 prog_data->param[uniforms * 4 + i] = &zero;
614 }
615
616 uniforms++;
617 }
618 }
619 }
620
621 void
622 vec4_visitor::setup_uniform_clipplane_values()
623 {
624 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
625
626 if (intel->gen < 6) {
627 /* Pre-Gen6, we compact clip planes. For example, if the user
628 * enables just clip planes 0, 1, and 3, we will enable clip planes
629 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
630 * plane 2. This simplifies the implementation of the Gen6 clip
631 * thread.
632 */
633 int compacted_clipplane_index = 0;
634 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
635 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
636 continue;
637
638 this->uniform_vector_size[this->uniforms] = 4;
639 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
640 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
641 for (int j = 0; j < 4; ++j) {
642 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
643 }
644 ++compacted_clipplane_index;
645 ++this->uniforms;
646 }
647 } else {
648 /* In Gen6 and later, we don't compact clip planes, because this
649 * simplifies the implementation of gl_ClipDistance.
650 */
651 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
652 this->uniform_vector_size[this->uniforms] = 4;
653 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
654 this->userplane[i].type = BRW_REGISTER_TYPE_F;
655 for (int j = 0; j < 4; ++j) {
656 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
657 }
658 ++this->uniforms;
659 }
660 }
661 }
662
663 /* Our support for builtin uniforms is even scarier than non-builtin.
664 * It sits on top of the PROG_STATE_VAR parameters that are
665 * automatically updated from GL context state.
666 */
667 void
668 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
669 {
670 const ir_state_slot *const slots = ir->state_slots;
671 assert(ir->state_slots != NULL);
672
673 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
674 /* This state reference has already been setup by ir_to_mesa,
675 * but we'll get the same index back here. We can reference
676 * ParameterValues directly, since unlike brw_fs.cpp, we never
677 * add new state references during compile.
678 */
679 int index = _mesa_add_state_reference(this->prog->Parameters,
680 (gl_state_index *)slots[i].tokens);
681 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
682
683 this->uniform_vector_size[this->uniforms] = 0;
684 /* Add each of the unique swizzled channels of the element.
685 * This will end up matching the size of the glsl_type of this field.
686 */
687 int last_swiz = -1;
688 for (unsigned int j = 0; j < 4; j++) {
689 int swiz = GET_SWZ(slots[i].swizzle, j);
690 last_swiz = swiz;
691
692 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
693 if (swiz <= last_swiz)
694 this->uniform_vector_size[this->uniforms]++;
695 }
696 this->uniforms++;
697 }
698 }
699
700 dst_reg *
701 vec4_visitor::variable_storage(ir_variable *var)
702 {
703 return (dst_reg *)hash_table_find(this->variable_ht, var);
704 }
705
706 void
707 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr) {
714 src_reg op[2];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 2);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (intel->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (intel->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 default:
790 assert(!"not reached");
791 break;
792 }
793 return;
794 }
795
796 ir->accept(this);
797
798 resolve_ud_negate(&this->result);
799
800 if (intel->gen >= 6) {
801 vec4_instruction *inst = emit(AND(dst_null_d(),
802 this->result, src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 } else {
805 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 }
808 }
809
810 /**
811 * Emit a gen6 IF statement with the comparison folded into the IF
812 * instruction.
813 */
814 void
815 vec4_visitor::emit_if_gen6(ir_if *ir)
816 {
817 ir_expression *expr = ir->condition->as_expression();
818
819 if (expr) {
820 src_reg op[2];
821 dst_reg temp;
822
823 assert(expr->get_num_operands() <= 2);
824 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
825 expr->operands[i]->accept(this);
826 op[i] = this->result;
827 }
828
829 switch (expr->operation) {
830 case ir_unop_logic_not:
831 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
832 return;
833
834 case ir_binop_logic_xor:
835 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
836 return;
837
838 case ir_binop_logic_or:
839 temp = dst_reg(this, glsl_type::bool_type);
840 emit(OR(temp, op[0], op[1]));
841 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
842 return;
843
844 case ir_binop_logic_and:
845 temp = dst_reg(this, glsl_type::bool_type);
846 emit(AND(temp, op[0], op[1]));
847 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_f2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_unop_i2b:
855 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 return;
857
858 case ir_binop_greater:
859 case ir_binop_gequal:
860 case ir_binop_less:
861 case ir_binop_lequal:
862 case ir_binop_equal:
863 case ir_binop_nequal:
864 emit(IF(op[0], op[1],
865 brw_conditional_for_comparison(expr->operation)));
866 return;
867
868 case ir_binop_all_equal:
869 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
870 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
871 return;
872
873 case ir_binop_any_nequal:
874 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
876 return;
877
878 case ir_unop_any:
879 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
881 return;
882
883 default:
884 assert(!"not reached");
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887 }
888 return;
889 }
890
891 ir->condition->accept(this);
892
893 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
894 }
895
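/* Returns a copy of the given dst_reg with its writemask replaced; the
 * original register is left untouched.
 */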
896 static dst_reg
897 with_writemask(dst_reg const & r, int mask)
898 {
899 dst_reg result = r;
900 result.writemask = mask;
901 return result;
902 }
903
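/* Emits workaround code at the top of the shader for attributes that need it:
 * GL_FIXED rescaling, 2_10_10_10 sign recovery, BGRA channel swizzling,
 * normalization, and integer-to-float conversion, as selected by the
 * per-attribute workaround flags in the program key.
 */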
904 void
905 vec4_visitor::emit_attribute_fixups()
906 {
907 dst_reg sign_recovery_shift;
908 dst_reg normalize_factor;
909 dst_reg es3_normalize_factor;
910
911 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
912 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
913 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
914 dst_reg reg(ATTR, i);
915 dst_reg reg_d = reg;
916 reg_d.type = BRW_REGISTER_TYPE_D;
917 dst_reg reg_ud = reg;
918 reg_ud.type = BRW_REGISTER_TYPE_UD;
919
920 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
921 * come in as floating point conversions of the integer values.
922 */
923 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
924 dst_reg dst = reg;
925 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
926 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
927 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
928 }
929
930 /* Do sign recovery for 2101010 formats if required. */
931 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
932 if (sign_recovery_shift.file == BAD_FILE) {
933 /* shift constant: <22,22,22,30> */
934 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
936 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
937 }
938
939 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
940 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
941 }
942
943 /* Apply BGRA swizzle if required. */
944 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
945 src_reg temp = src_reg(reg);
946 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
947 emit(MOV(reg, temp));
948 }
949
950 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
951 /* ES 3.0 has different rules for converting signed normalized
952 * fixed-point numbers than desktop GL.
953 */
954 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
955 /* According to equation 2.2 of the ES 3.0 specification,
956 * signed normalization conversion is done by:
957 *
958 * f = c / (2^(b-1)-1)
959 */
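               /* For the 2_10_10_10 formats handled here, b is 10 for the XYZ
                * channels and 2 for W, so the divisors below are 2^9 - 1 = 511
                * and 2^1 - 1 = 1; e.g. c = -511 in a 10-bit channel maps to
                * -1.0, and the clamp afterwards keeps c = -512 from dipping
                * below -1.0.
                */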
960 if (es3_normalize_factor.file == BAD_FILE) {
961 /* mul constant: 1 / (2^(b-1) - 1) */
962 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
963 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
964 src_reg(1.0f / ((1<<9) - 1))));
965 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
966 src_reg(1.0f / ((1<<1) - 1))));
967 }
968
969 dst_reg dst = reg;
970 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
971 emit(MOV(dst, src_reg(reg_d)));
972 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
973 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
974 } else {
975 /* The following equations are from the OpenGL 3.2 specification:
976 *
977 * 2.1 unsigned normalization
978 * f = c/(2^n-1)
979 *
980 * 2.2 signed normalization
981 * f = (2c+1)/(2^n-1)
982 *
983 * Both of these share a common divisor, which is represented by
984 * "normalize_factor" in the code below.
985 */
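            /* Here n is 10 for the XYZ channels and 2 for W, so the shared
             * divisor below is 1/1023 for XYZ and 1/3 for W; e.g. an unsigned
             * 10-bit c = 1023 normalizes to exactly 1.0.
             */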
986 if (normalize_factor.file == BAD_FILE) {
987 /* 1 / (2^b - 1) for b=<10,10,10,2> */
988 normalize_factor = dst_reg(this, glsl_type::vec4_type);
989 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
990 src_reg(1.0f / ((1<<10) - 1))));
991 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
992 src_reg(1.0f / ((1<<2) - 1))));
993 }
994
995 dst_reg dst = reg;
996 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
997 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
998
999 /* For signed normalization, we want the numerator to be 2c+1. */
1000 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1001 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1002 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1003 }
1004
1005 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1006 }
1007 }
1008
1009 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1010 dst_reg dst = reg;
1011 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1012 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1013 }
1014 }
1015 }
1016 }
1017
1018 void
1019 vec4_visitor::visit(ir_variable *ir)
1020 {
1021 dst_reg *reg = NULL;
1022
1023 if (variable_storage(ir))
1024 return;
1025
1026 switch (ir->mode) {
1027 case ir_var_shader_in:
1028 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1029 break;
1030
1031 case ir_var_shader_out:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033
1034 for (int i = 0; i < type_size(ir->type); i++) {
1035 output_reg[ir->location + i] = *reg;
1036 output_reg[ir->location + i].reg_offset = i;
1037 output_reg[ir->location + i].type =
1038 brw_type_for_base_type(ir->type->get_scalar_type());
1039 output_reg_annotation[ir->location + i] = ir->name;
1040 }
1041 break;
1042
1043 case ir_var_auto:
1044 case ir_var_temporary:
1045 reg = new(mem_ctx) dst_reg(this, ir->type);
1046 break;
1047
1048 case ir_var_uniform:
1049 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1050
1051 /* Thanks to the lower_ubo_reference pass, we will see only
1052 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1053 * variables, so no need for them to be in variable_ht.
1054 */
1055 if (ir->is_in_uniform_block())
1056 return;
1057
1058 /* Track how big the whole uniform variable is, in case we need to put a
1059 * copy of its data into pull constants for array access.
1060 */
1061 this->uniform_size[this->uniforms] = type_size(ir->type);
1062
1063 if (!strncmp(ir->name, "gl_", 3)) {
1064 setup_builtin_uniform_values(ir);
1065 } else {
1066 setup_uniform_values(ir);
1067 }
1068 break;
1069
1070 case ir_var_system_value:
1071 /* VertexID is stored by the VF as the last vertex element, but
1072 * we don't represent it with a flag in inputs_read, so we call
1073 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1074 */
1075 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1076 prog_data->uses_vertexid = true;
1077
1078 switch (ir->location) {
1079 case SYSTEM_VALUE_VERTEX_ID:
1080 reg->writemask = WRITEMASK_X;
1081 break;
1082 case SYSTEM_VALUE_INSTANCE_ID:
1083 reg->writemask = WRITEMASK_Y;
1084 break;
1085 default:
1086 assert(!"not reached");
1087 break;
1088 }
1089 break;
1090
1091 default:
1092 assert(!"not reached");
1093 }
1094
1095 reg->type = brw_type_for_base_type(ir->type);
1096 hash_table_insert(this->variable_ht, reg, ir);
1097 }
1098
1099 void
1100 vec4_visitor::visit(ir_loop *ir)
1101 {
1102 dst_reg counter;
1103
1104 /* We don't want debugging output to print the whole body of the
1105 * loop as the annotation.
1106 */
1107 this->base_ir = NULL;
1108
1109 if (ir->counter != NULL) {
1110 this->base_ir = ir->counter;
1111 ir->counter->accept(this);
1112 counter = *(variable_storage(ir->counter));
1113
1114 if (ir->from != NULL) {
1115 this->base_ir = ir->from;
1116 ir->from->accept(this);
1117
1118 emit(MOV(counter, this->result));
1119 }
1120 }
1121
1122 emit(BRW_OPCODE_DO);
1123
1124 if (ir->to) {
1125 this->base_ir = ir->to;
1126 ir->to->accept(this);
1127
1128 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1129 brw_conditional_for_comparison(ir->cmp)));
1130
1131 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1132 inst->predicate = BRW_PREDICATE_NORMAL;
1133 }
1134
1135 visit_instructions(&ir->body_instructions);
1136
1137
1138 if (ir->increment) {
1139 this->base_ir = ir->increment;
1140 ir->increment->accept(this);
1141 emit(ADD(counter, src_reg(counter), this->result));
1142 }
1143
1144 emit(BRW_OPCODE_WHILE);
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_loop_jump *ir)
1149 {
1150 switch (ir->mode) {
1151 case ir_loop_jump::jump_break:
1152 emit(BRW_OPCODE_BREAK);
1153 break;
1154 case ir_loop_jump::jump_continue:
1155 emit(BRW_OPCODE_CONTINUE);
1156 break;
1157 }
1158 }
1159
1160
1161 void
1162 vec4_visitor::visit(ir_function_signature *ir)
1163 {
1164 assert(0);
1165 (void)ir;
1166 }
1167
1168 void
1169 vec4_visitor::visit(ir_function *ir)
1170 {
1171 /* Ignore function bodies other than main() -- we shouldn't see calls to
1172 * them since they should all be inlined.
1173 */
1174 if (strcmp(ir->name, "main") == 0) {
1175 const ir_function_signature *sig;
1176 exec_list empty;
1177
1178 sig = ir->matching_signature(&empty);
1179
1180 assert(sig);
1181
1182 visit_instructions(&sig->body);
1183 }
1184 }
1185
1186 bool
1187 vec4_visitor::try_emit_sat(ir_expression *ir)
1188 {
1189 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1190 if (!sat_src)
1191 return false;
1192
1193 sat_src->accept(this);
1194 src_reg src = this->result;
1195
1196 this->result = src_reg(this, ir->type);
1197 vec4_instruction *inst;
1198 inst = emit(MOV(dst_reg(this->result), src));
1199 inst->saturate = true;
1200
1201 return true;
1202 }
1203
1204 void
1205 vec4_visitor::emit_bool_comparison(unsigned int op,
1206 dst_reg dst, src_reg src0, src_reg src1)
1207 {
1208 /* original gen4 does destination conversion before comparison. */
1209 if (intel->gen < 5)
1210 dst.type = src0.type;
1211
1212 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1213
1214 dst.type = BRW_REGISTER_TYPE_D;
1215 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1216 }
1217
1218 void
1219 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1220 src_reg src0, src_reg src1)
1221 {
1222 vec4_instruction *inst;
1223
1224 if (intel->gen >= 6) {
1225 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1226 inst->conditional_mod = conditionalmod;
1227 } else {
1228 emit(CMP(dst, src0, src1, conditionalmod));
1229
1230 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1231 inst->predicate = BRW_PREDICATE_NORMAL;
1232 }
1233 }
1234
1235 void
1236 vec4_visitor::visit(ir_expression *ir)
1237 {
1238 unsigned int operand;
1239 src_reg op[Elements(ir->operands)];
1240 src_reg result_src;
1241 dst_reg result_dst;
1242 vec4_instruction *inst;
1243
1244 if (try_emit_sat(ir))
1245 return;
1246
1247 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1248 this->result.file = BAD_FILE;
1249 ir->operands[operand]->accept(this);
1250 if (this->result.file == BAD_FILE) {
1251 printf("Failed to get tree for expression operand:\n");
1252 ir->operands[operand]->print();
1253 exit(1);
1254 }
1255 op[operand] = this->result;
1256
1257 /* Matrix expression operands should have been broken down to vector
1258 * operations already.
1259 */
1260 assert(!ir->operands[operand]->type->is_matrix());
1261 }
1262
1263 int vector_elements = ir->operands[0]->type->vector_elements;
1264 if (ir->operands[1]) {
1265 vector_elements = MAX2(vector_elements,
1266 ir->operands[1]->type->vector_elements);
1267 }
1268
1269 this->result.file = BAD_FILE;
1270
1271 /* Storage for our result. Ideally for an assignment we'd be using
1272 * the actual storage for the result here, instead.
1273 */
1274 result_src = src_reg(this, ir->type);
1275 /* convenience for the emit functions below. */
1276 result_dst = dst_reg(result_src);
1277 /* If nothing special happens, this is the result. */
1278 this->result = result_src;
1279 /* Limit writes to the channels that will be used by result_src later.
1280 * This does limit this temp's use as a temporary for multi-instruction
1281 * sequences.
1282 */
1283 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1284
1285 switch (ir->operation) {
1286 case ir_unop_logic_not:
1287      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
1288       * one's complement of the whole register, not just bit 0.
1289 */
1290 emit(XOR(result_dst, op[0], src_reg(1)));
1291 break;
1292 case ir_unop_neg:
1293 op[0].negate = !op[0].negate;
1294 this->result = op[0];
1295 break;
1296 case ir_unop_abs:
1297 op[0].abs = true;
1298 op[0].negate = false;
1299 this->result = op[0];
1300 break;
1301
1302 case ir_unop_sign:
1303 emit(MOV(result_dst, src_reg(0.0f)));
1304
1305 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1306 inst = emit(MOV(result_dst, src_reg(1.0f)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308
1309 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1310 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 break;
1314
1315 case ir_unop_rcp:
1316 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1317 break;
1318
1319 case ir_unop_exp2:
1320 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1321 break;
1322 case ir_unop_log2:
1323 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1324 break;
1325 case ir_unop_exp:
1326 case ir_unop_log:
1327 assert(!"not reached: should be handled by ir_explog_to_explog2");
1328 break;
1329 case ir_unop_sin:
1330 case ir_unop_sin_reduced:
1331 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1332 break;
1333 case ir_unop_cos:
1334 case ir_unop_cos_reduced:
1335 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_dFdx:
1339 case ir_unop_dFdy:
1340 assert(!"derivatives not valid in vertex shader");
1341 break;
1342
1343 case ir_unop_noise:
1344 assert(!"not reached: should be handled by lower_noise");
1345 break;
1346
1347 case ir_binop_add:
1348 emit(ADD(result_dst, op[0], op[1]));
1349 break;
1350 case ir_binop_sub:
1351 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1352 break;
1353
1354 case ir_binop_mul:
1355 if (ir->type->is_integer()) {
1356 /* For integer multiplication, the MUL uses the low 16 bits
1357 * of one of the operands (src0 on gen6, src1 on gen7). The
1358 * MACH accumulates in the contribution of the upper 16 bits
1359 * of that operand.
1360 *
1361 * FINISHME: Emit just the MUL if we know an operand is small
1362 * enough.
1363 */
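         /* The sequence below therefore is: MUL writes the partial product
          * into the accumulator, MACH (whose own destination is discarded to
          * null) folds in the upper-16-bit contribution, and the final MOV
          * copies the accumulated low 32 bits of the product into the result.
          */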
1364 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1365
1366 emit(MUL(acc, op[0], op[1]));
1367 emit(MACH(dst_null_d(), op[0], op[1]));
1368 emit(MOV(result_dst, src_reg(acc)));
1369 } else {
1370 emit(MUL(result_dst, op[0], op[1]));
1371 }
1372 break;
1373 case ir_binop_div:
1374 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1375 assert(ir->type->is_integer());
1376 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1377 break;
1378 case ir_binop_mod:
1379 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1380 assert(ir->type->is_integer());
1381 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1382 break;
1383
1384 case ir_binop_less:
1385 case ir_binop_greater:
1386 case ir_binop_lequal:
1387 case ir_binop_gequal:
1388 case ir_binop_equal:
1389 case ir_binop_nequal: {
1390 emit(CMP(result_dst, op[0], op[1],
1391 brw_conditional_for_comparison(ir->operation)));
1392 emit(AND(result_dst, result_src, src_reg(0x1)));
1393 break;
1394 }
1395
1396 case ir_binop_all_equal:
1397 /* "==" operator producing a scalar boolean. */
1398 if (ir->operands[0]->type->is_vector() ||
1399 ir->operands[1]->type->is_vector()) {
1400 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1401 emit(MOV(result_dst, src_reg(0)));
1402 inst = emit(MOV(result_dst, src_reg(1)));
1403 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1404 } else {
1405 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1406 emit(AND(result_dst, result_src, src_reg(0x1)));
1407 }
1408 break;
1409 case ir_binop_any_nequal:
1410 /* "!=" operator producing a scalar boolean. */
1411 if (ir->operands[0]->type->is_vector() ||
1412 ir->operands[1]->type->is_vector()) {
1413 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1414
1415 emit(MOV(result_dst, src_reg(0)));
1416 inst = emit(MOV(result_dst, src_reg(1)));
1417 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1418 } else {
1419 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1420 emit(AND(result_dst, result_src, src_reg(0x1)));
1421 }
1422 break;
1423
1424 case ir_unop_any:
1425 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1426 emit(MOV(result_dst, src_reg(0)));
1427
1428 inst = emit(MOV(result_dst, src_reg(1)));
1429 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1430 break;
1431
1432 case ir_binop_logic_xor:
1433 emit(XOR(result_dst, op[0], op[1]));
1434 break;
1435
1436 case ir_binop_logic_or:
1437 emit(OR(result_dst, op[0], op[1]));
1438 break;
1439
1440 case ir_binop_logic_and:
1441 emit(AND(result_dst, op[0], op[1]));
1442 break;
1443
1444 case ir_binop_dot:
1445 assert(ir->operands[0]->type->is_vector());
1446 assert(ir->operands[0]->type == ir->operands[1]->type);
1447 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1448 break;
1449
1450 case ir_unop_sqrt:
1451 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1452 break;
1453 case ir_unop_rsq:
1454 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1455 break;
1456
1457 case ir_unop_bitcast_i2f:
1458 case ir_unop_bitcast_u2f:
1459 this->result = op[0];
1460 this->result.type = BRW_REGISTER_TYPE_F;
1461 break;
1462
1463 case ir_unop_bitcast_f2i:
1464 this->result = op[0];
1465 this->result.type = BRW_REGISTER_TYPE_D;
1466 break;
1467
1468 case ir_unop_bitcast_f2u:
1469 this->result = op[0];
1470 this->result.type = BRW_REGISTER_TYPE_UD;
1471 break;
1472
1473 case ir_unop_i2f:
1474 case ir_unop_i2u:
1475 case ir_unop_u2i:
1476 case ir_unop_u2f:
1477 case ir_unop_b2f:
1478 case ir_unop_b2i:
1479 case ir_unop_f2i:
1480 case ir_unop_f2u:
1481 emit(MOV(result_dst, op[0]));
1482 break;
1483 case ir_unop_f2b:
1484 case ir_unop_i2b: {
1485 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486 emit(AND(result_dst, result_src, src_reg(1)));
1487 break;
1488 }
1489
1490 case ir_unop_trunc:
1491 emit(RNDZ(result_dst, op[0]));
1492 break;
1493 case ir_unop_ceil:
1494 op[0].negate = !op[0].negate;
1495 inst = emit(RNDD(result_dst, op[0]));
1496 this->result.negate = true;
1497 break;
1498 case ir_unop_floor:
1499 inst = emit(RNDD(result_dst, op[0]));
1500 break;
1501 case ir_unop_fract:
1502 inst = emit(FRC(result_dst, op[0]));
1503 break;
1504 case ir_unop_round_even:
1505 emit(RNDE(result_dst, op[0]));
1506 break;
1507
1508 case ir_binop_min:
1509 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1510 break;
1511 case ir_binop_max:
1512 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1513 break;
1514
1515 case ir_binop_pow:
1516 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1517 break;
1518
1519 case ir_unop_bit_not:
1520 inst = emit(NOT(result_dst, op[0]));
1521 break;
1522 case ir_binop_bit_and:
1523 inst = emit(AND(result_dst, op[0], op[1]));
1524 break;
1525 case ir_binop_bit_xor:
1526 inst = emit(XOR(result_dst, op[0], op[1]));
1527 break;
1528 case ir_binop_bit_or:
1529 inst = emit(OR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_lshift:
1533 inst = emit(SHL(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_rshift:
1537 if (ir->type->base_type == GLSL_TYPE_INT)
1538 inst = emit(ASR(result_dst, op[0], op[1]));
1539 else
1540 inst = emit(SHR(result_dst, op[0], op[1]));
1541 break;
1542
1543 case ir_binop_ubo_load: {
1544 ir_constant *uniform_block = ir->operands[0]->as_constant();
1545 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1546 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1547 src_reg offset = op[1];
1548
1549 /* Now, load the vector from that offset. */
1550 assert(ir->type->is_vector() || ir->type->is_scalar());
1551
1552 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1553 packed_consts.type = result.type;
1554 src_reg surf_index =
1555 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1556 if (const_offset_ir) {
1557 offset = src_reg(const_offset / 16);
1558 } else {
1559 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1560 }
1561
1562 vec4_instruction *pull =
1563 emit(new(mem_ctx) vec4_instruction(this,
1564 VS_OPCODE_PULL_CONSTANT_LOAD,
1565 dst_reg(packed_consts),
1566 surf_index,
1567 offset));
1568 pull->base_mrf = 14;
1569 pull->mlen = 1;
1570
1571 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1572 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1573 const_offset % 16 / 4,
1574 const_offset % 16 / 4,
1575 const_offset % 16 / 4);
1576
1577 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1578 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1579 emit(CMP(result_dst, packed_consts, src_reg(0u),
1580 BRW_CONDITIONAL_NZ));
1581 emit(AND(result_dst, result, src_reg(0x1)));
1582 } else {
1583 emit(MOV(result_dst, packed_consts));
1584 }
1585 break;
1586 }
1587
1588 case ir_triop_lrp:
1589 assert(!"not reached: should be handled by lrp_to_arith");
1590 break;
1591
1592 case ir_quadop_vector:
1593 assert(!"not reached: should be handled by lower_quadop_vector");
1594 break;
1595
1596 case ir_unop_pack_half_2x16:
1597 emit_pack_half_2x16(result_dst, op[0]);
1598 break;
1599 case ir_unop_unpack_half_2x16:
1600 emit_unpack_half_2x16(result_dst, op[0]);
1601 break;
1602 case ir_unop_pack_snorm_2x16:
1603 case ir_unop_pack_snorm_4x8:
1604 case ir_unop_pack_unorm_2x16:
1605 case ir_unop_pack_unorm_4x8:
1606 case ir_unop_unpack_snorm_2x16:
1607 case ir_unop_unpack_snorm_4x8:
1608 case ir_unop_unpack_unorm_2x16:
1609 case ir_unop_unpack_unorm_4x8:
1610 assert(!"not reached: should be handled by lower_packing_builtins");
1611 break;
1612 case ir_unop_unpack_half_2x16_split_x:
1613 case ir_unop_unpack_half_2x16_split_y:
1614 case ir_binop_pack_half_2x16_split:
1615 assert(!"not reached: should not occur in vertex shader");
1616 break;
1617 }
1618 }
1619
1620
1621 void
1622 vec4_visitor::visit(ir_swizzle *ir)
1623 {
1624 src_reg src;
1625 int i = 0;
1626 int swizzle[4];
1627
1628 /* Note that this is only swizzles in expressions, not those on the left
1629 * hand side of an assignment, which do write masking. See ir_assignment
1630 * for that.
1631 */
1632
1633 ir->val->accept(this);
1634 src = this->result;
1635 assert(src.file != BAD_FILE);
1636
1637 for (i = 0; i < ir->type->vector_elements; i++) {
1638 switch (i) {
1639 case 0:
1640 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1641 break;
1642 case 1:
1643 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1644 break;
1645 case 2:
1646 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1647 break;
1648 case 3:
1649 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1650 break;
1651 }
1652 }
1653 for (; i < 4; i++) {
1654 /* Replicate the last channel out. */
1655 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1656 }
1657
1658 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1659
1660 this->result = src;
1661 }
1662
1663 void
1664 vec4_visitor::visit(ir_dereference_variable *ir)
1665 {
1666 const struct glsl_type *type = ir->type;
1667 dst_reg *reg = variable_storage(ir->var);
1668
1669 if (!reg) {
1670 fail("Failed to find variable storage for %s\n", ir->var->name);
1671 this->result = src_reg(brw_null_reg());
1672 return;
1673 }
1674
1675 this->result = src_reg(*reg);
1676
1677 /* System values get their swizzle from the dst_reg writemask */
1678 if (ir->var->mode == ir_var_system_value)
1679 return;
1680
1681 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1682 this->result.swizzle = swizzle_for_size(type->vector_elements);
1683 }
1684
1685 void
1686 vec4_visitor::visit(ir_dereference_array *ir)
1687 {
1688 ir_constant *constant_index;
1689 src_reg src;
1690 int element_size = type_size(ir->type);
1691
1692 constant_index = ir->array_index->constant_expression_value();
1693
1694 ir->array->accept(this);
1695 src = this->result;
1696
1697 if (constant_index) {
1698 src.reg_offset += constant_index->value.i[0] * element_size;
1699 } else {
1700 /* Variable index array dereference. It eats the "vec4" of the
1701 * base of the array and an index that offsets the Mesa register
1702 * index.
1703 */
1704 ir->array_index->accept(this);
1705
1706 src_reg index_reg;
1707
1708 if (element_size == 1) {
1709 index_reg = this->result;
1710 } else {
1711 index_reg = src_reg(this, glsl_type::int_type);
1712
1713 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1714 }
1715
1716 if (src.reladdr) {
1717 src_reg temp = src_reg(this, glsl_type::int_type);
1718
1719 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1720
1721 index_reg = temp;
1722 }
1723
1724 src.reladdr = ralloc(mem_ctx, src_reg);
1725 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1726 }
1727
1728 /* If the type is smaller than a vec4, replicate the last channel out. */
1729 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1730 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1731 else
1732 src.swizzle = BRW_SWIZZLE_NOOP;
1733 src.type = brw_type_for_base_type(ir->type);
1734
1735 this->result = src;
1736 }
1737
1738 void
1739 vec4_visitor::visit(ir_dereference_record *ir)
1740 {
1741 unsigned int i;
1742 const glsl_type *struct_type = ir->record->type;
1743 int offset = 0;
1744
1745 ir->record->accept(this);
1746
1747 for (i = 0; i < struct_type->length; i++) {
1748 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1749 break;
1750 offset += type_size(struct_type->fields.structure[i].type);
1751 }
1752
1753 /* If the type is smaller than a vec4, replicate the last channel out. */
1754 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1755 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1756 else
1757 this->result.swizzle = BRW_SWIZZLE_NOOP;
1758 this->result.type = brw_type_for_base_type(ir->type);
1759
1760 this->result.reg_offset += offset;
1761 }
1762
1763 /**
1764 * We want to be careful in assignment setup to hit the actual storage
1765 * instead of potentially using a temporary like we might with the
1766 * ir_dereference handler.
1767 */
1768 static dst_reg
1769 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1770 {
1771 /* The LHS must be a dereference. If the LHS is a variable indexed array
1772    * access of a vector, it must be separated into a series of conditional moves
1773 * before reaching this point (see ir_vec_index_to_cond_assign).
1774 */
1775 assert(ir->as_dereference());
1776 ir_dereference_array *deref_array = ir->as_dereference_array();
1777 if (deref_array) {
1778 assert(!deref_array->array->type->is_vector());
1779 }
1780
1781 /* Use the rvalue deref handler for the most part. We'll ignore
1782 * swizzles in it and write swizzles using writemask, though.
1783 */
1784 ir->accept(v);
1785 return dst_reg(v->result);
1786 }
1787
1788 void
1789 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1790 const struct glsl_type *type, uint32_t predicate)
1791 {
1792 if (type->base_type == GLSL_TYPE_STRUCT) {
1793 for (unsigned int i = 0; i < type->length; i++) {
1794 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1795 }
1796 return;
1797 }
1798
1799 if (type->is_array()) {
1800 for (unsigned int i = 0; i < type->length; i++) {
1801 emit_block_move(dst, src, type->fields.array, predicate);
1802 }
1803 return;
1804 }
1805
1806 if (type->is_matrix()) {
1807 const struct glsl_type *vec_type;
1808
1809 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1810 type->vector_elements, 1);
1811
1812 for (int i = 0; i < type->matrix_columns; i++) {
1813 emit_block_move(dst, src, vec_type, predicate);
1814 }
1815 return;
1816 }
1817
1818 assert(type->is_scalar() || type->is_vector());
1819
1820 dst->type = brw_type_for_base_type(type);
1821 src->type = dst->type;
1822
1823 dst->writemask = (1 << type->vector_elements) - 1;
1824
1825 src->swizzle = swizzle_for_size(type->vector_elements);
1826
1827 vec4_instruction *inst = emit(MOV(*dst, *src));
1828 inst->predicate = predicate;
1829
1830 dst->reg_offset++;
1831 src->reg_offset++;
1832 }
1833
1834
1835 /* If the RHS processing resulted in an instruction generating a
1836 * temporary value, and it would be easy to rewrite the instruction to
1837 * generate its result right into the LHS instead, do so. This ends
1838 * up reliably removing instructions where it can be tricky to do so
1839 * later without real UD chain information.
1840 */
1841 bool
1842 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1843 dst_reg dst,
1844 src_reg src,
1845 vec4_instruction *pre_rhs_inst,
1846 vec4_instruction *last_rhs_inst)
1847 {
1848 /* This could be supported, but it would take more smarts. */
1849 if (ir->condition)
1850 return false;
1851
1852 if (pre_rhs_inst == last_rhs_inst)
1853 return false; /* No instructions generated to work with. */
1854
1855 /* Make sure the last instruction generated our source reg. */
1856 if (src.file != GRF ||
1857 src.file != last_rhs_inst->dst.file ||
1858 src.reg != last_rhs_inst->dst.reg ||
1859 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1860 src.reladdr ||
1861 src.abs ||
1862 src.negate ||
1863 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1864 return false;
1865
1866    /* Check that the last instruction fully initialized the channels
1867 * we want to use, in the order we want to use them. We could
1868 * potentially reswizzle the operands of many instructions so that
1869 * we could handle out of order channels, but don't yet.
1870 */
1871
1872 for (unsigned i = 0; i < 4; i++) {
1873 if (dst.writemask & (1 << i)) {
1874 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1875 return false;
1876
1877 if (BRW_GET_SWZ(src.swizzle, i) != i)
1878 return false;
1879 }
1880 }
1881
1882 /* Success! Rewrite the instruction. */
1883 last_rhs_inst->dst.file = dst.file;
1884 last_rhs_inst->dst.reg = dst.reg;
1885 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1886 last_rhs_inst->dst.reladdr = dst.reladdr;
1887 last_rhs_inst->dst.writemask &= dst.writemask;
1888
1889 return true;
1890 }
1891
1892 void
1893 vec4_visitor::visit(ir_assignment *ir)
1894 {
1895 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1896 uint32_t predicate = BRW_PREDICATE_NONE;
1897
1898 if (!ir->lhs->type->is_scalar() &&
1899 !ir->lhs->type->is_vector()) {
1900 ir->rhs->accept(this);
1901 src_reg src = this->result;
1902
1903 if (ir->condition) {
1904 emit_bool_to_cond_code(ir->condition, &predicate);
1905 }
1906
1907 /* emit_block_move doesn't account for swizzles in the source register.
1908 * This should be ok, since the source register is a structure or an
1909 * array, and those can't be swizzled. But double-check to be sure.
1910 */
1911 assert(src.swizzle ==
1912 (ir->rhs->type->is_matrix()
1913 ? swizzle_for_size(ir->rhs->type->vector_elements)
1914 : BRW_SWIZZLE_NOOP));
1915
1916 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1917 return;
1918 }
1919
1920 /* Now we're down to just a scalar/vector with writemasks. */
1921 int i;
1922
1923 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1924 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1925
1926 ir->rhs->accept(this);
1927
1928 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1929
1930 src_reg src = this->result;
1931
1932 int swizzles[4];
1933 int first_enabled_chan = 0;
1934 int src_chan = 0;
1935
1936 assert(ir->lhs->type->is_vector() ||
1937 ir->lhs->type->is_scalar());
1938 dst.writemask = ir->write_mask;
1939
1940 for (int i = 0; i < 4; i++) {
1941 if (dst.writemask & (1 << i)) {
1942 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1943 break;
1944 }
1945 }
1946
1947 /* Swizzle a small RHS vector into the channels being written.
1948 *
1949 * GLSL IR treats write_mask as dictating how many channels are
1950 * present on the RHS, while in our instructions we need to make
1951 * those channels appear in the slots of the vec4 they're written to.
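*
* For example, for "v.xz = u.xy" (write_mask XZ) an RHS swizzle of roughly
* .xyyy is remapped to .xxyx: RHS channel 0 lands in slot x, channel 1 in
* slot z, and the unwritten slots just replicate a defined channel.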
1952 */
1953 for (int i = 0; i < 4; i++) {
1954 if (dst.writemask & (1 << i))
1955 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1956 else
1957 swizzles[i] = first_enabled_chan;
1958 }
1959 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1960 swizzles[2], swizzles[3]);
1961
1962 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1963 return;
1964 }
1965
1966 if (ir->condition) {
1967 emit_bool_to_cond_code(ir->condition, &predicate);
1968 }
1969
1970 for (i = 0; i < type_size(ir->lhs->type); i++) {
1971 vec4_instruction *inst = emit(MOV(dst, src));
1972 inst->predicate = predicate;
1973
1974 dst.reg_offset++;
1975 src.reg_offset++;
1976 }
1977 }
1978
1979 void
1980 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1981 {
1982 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1983 foreach_list(node, &ir->components) {
1984 ir_constant *field_value = (ir_constant *)node;
1985
1986 emit_constant_values(dst, field_value);
1987 }
1988 return;
1989 }
1990
1991 if (ir->type->is_array()) {
1992 for (unsigned int i = 0; i < ir->type->length; i++) {
1993 emit_constant_values(dst, ir->array_elements[i]);
1994 }
1995 return;
1996 }
1997
1998 if (ir->type->is_matrix()) {
1999 for (int i = 0; i < ir->type->matrix_columns; i++) {
2000 float *vec = &ir->value.f[i * ir->type->vector_elements];
2001
2002 for (int j = 0; j < ir->type->vector_elements; j++) {
2003 dst->writemask = 1 << j;
2004 dst->type = BRW_REGISTER_TYPE_F;
2005
2006 emit(MOV(*dst, src_reg(vec[j])));
2007 }
2008 dst->reg_offset++;
2009 }
2010 return;
2011 }
2012
2013 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2014
2015 for (int i = 0; i < ir->type->vector_elements; i++) {
2016 if (!(remaining_writemask & (1 << i)))
2017 continue;
2018
2019 dst->writemask = 1 << i;
2020 dst->type = brw_type_for_base_type(ir->type);
2021
2022 /* Find other components that match the one we're about to
2023 * write. Emits fewer instructions for things like vec4(0.5,
2024 * 1.5, 1.5, 1.5).
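* (That example ends up as one MOV with a .x writemask for 0.5 and one
* MOV with a .yzw writemask for 1.5.)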
2025 */
2026 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2027 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2028 if (ir->value.b[i] == ir->value.b[j])
2029 dst->writemask |= (1 << j);
2030 } else {
2031 /* u, i, and f storage all line up, so no need for a
2032 * switch case for comparing each type.
2033 */
2034 if (ir->value.u[i] == ir->value.u[j])
2035 dst->writemask |= (1 << j);
2036 }
2037 }
2038
2039 switch (ir->type->base_type) {
2040 case GLSL_TYPE_FLOAT:
2041 emit(MOV(*dst, src_reg(ir->value.f[i])));
2042 break;
2043 case GLSL_TYPE_INT:
2044 emit(MOV(*dst, src_reg(ir->value.i[i])));
2045 break;
2046 case GLSL_TYPE_UINT:
2047 emit(MOV(*dst, src_reg(ir->value.u[i])));
2048 break;
2049 case GLSL_TYPE_BOOL:
2050 emit(MOV(*dst, src_reg(ir->value.b[i])));
2051 break;
2052 default:
2053 assert(!"Non-float/uint/int/bool constant");
2054 break;
2055 }
2056
2057 remaining_writemask &= ~dst->writemask;
2058 }
2059 dst->reg_offset++;
2060 }
2061
2062 void
2063 vec4_visitor::visit(ir_constant *ir)
2064 {
2065 dst_reg dst = dst_reg(this, ir->type);
2066 this->result = src_reg(dst);
2067
2068 emit_constant_values(&dst, ir);
2069 }
2070
2071 void
2072 vec4_visitor::visit(ir_call *ir)
2073 {
2074 assert(!"not reached");
2075 }
2076
2077 void
2078 vec4_visitor::visit(ir_texture *ir)
2079 {
2080 int sampler =
2081 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2082
2083 /* Should be lowered by do_lower_texture_projection */
2084 assert(!ir->projector);
2085
2086 /* Generate code to compute all the subexpression trees. This has to be
2087 * done before loading any values into MRFs for the sampler message since
2088 * generating these values may involve SEND messages that need the MRFs.
2089 */
2090 src_reg coordinate;
2091 if (ir->coordinate) {
2092 ir->coordinate->accept(this);
2093 coordinate = this->result;
2094 }
2095
2096 src_reg shadow_comparitor;
2097 if (ir->shadow_comparitor) {
2098 ir->shadow_comparitor->accept(this);
2099 shadow_comparitor = this->result;
2100 }
2101
2102 const glsl_type *lod_type, *sample_index_type;
2103 src_reg lod, dPdx, dPdy, sample_index;
2104 switch (ir->op) {
2105 case ir_tex:
2106 lod = src_reg(0.0f);
2107 lod_type = glsl_type::float_type;
2108 break;
2109 case ir_txf:
2110 case ir_txl:
2111 case ir_txs:
2112 ir->lod_info.lod->accept(this);
2113 lod = this->result;
2114 lod_type = ir->lod_info.lod->type;
2115 break;
2116 case ir_txf_ms:
2117 ir->lod_info.sample_index->accept(this);
2118 sample_index = this->result;
2119 sample_index_type = ir->lod_info.sample_index->type;
2120 break;
2121 case ir_txd:
2122 ir->lod_info.grad.dPdx->accept(this);
2123 dPdx = this->result;
2124
2125 ir->lod_info.grad.dPdy->accept(this);
2126 dPdy = this->result;
2127
2128 lod_type = ir->lod_info.grad.dPdx->type;
2129 break;
2130 case ir_txb:
2131 case ir_lod:
2132 break;
2133 }
2134
2135 vec4_instruction *inst = NULL;
2136 switch (ir->op) {
2137 case ir_tex:
2138 case ir_txl:
2139 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2140 break;
2141 case ir_txd:
2142 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2143 break;
2144 case ir_txf:
2145 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2146 break;
2147 case ir_txf_ms:
2148 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2149 break;
2150 case ir_txs:
2151 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2152 break;
2153 case ir_txb:
2154 assert(!"TXB is not valid for vertex shaders.");
2155 break;
2156 case ir_lod:
2157 assert(!"LOD is not valid for vertex shaders.");
2158 break;
2159 }
2160
2161 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
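/* For txf, the offset is instead folded into the texel coordinate below,
* so it never goes through the message header.
*/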
2162
2163 /* Texel offsets go in the message header; Gen4 also requires headers. */
2164 inst->header_present = use_texture_offset || intel->gen < 5;
2165 inst->base_mrf = 2;
2166 inst->mlen = inst->header_present + 1; /* always at least one */
2167 inst->sampler = sampler;
2168 inst->dst = dst_reg(this, ir->type);
2169 inst->dst.writemask = WRITEMASK_XYZW;
2170 inst->shadow_compare = ir->shadow_comparitor != NULL;
2171
2172 if (use_texture_offset)
2173 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2174
2175 /* MRF for the first parameter */
2176 int param_base = inst->base_mrf + inst->header_present;
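/* param_base is the first MRF after the optional header: the coordinate
* (or the size LOD for txs) goes there, and the shadow comparator, LOD,
* sample index, or gradients follow in the next MRFs as set up below.
*/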
2177
2178 if (ir->op == ir_txs) {
2179 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2180 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2181 } else {
2182 int i, coord_mask = 0, zero_mask = 0;
2183 /* Load the coordinate */
2184 /* FINISHME: gl_clamp_mask and saturate */
2185 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2186 coord_mask |= (1 << i);
2187 for (; i < 4; i++)
2188 zero_mask |= (1 << i);
2189
2190 if (ir->offset && ir->op == ir_txf) {
2191 /* It appears that the ld instruction used for txf does its
2192 * address bounds check before adding in the offset. To work
2193 * around this, just add the integer offset to the integer
2194 * texel coordinate, and don't put the offset in the header.
2195 */
2196 ir_constant *offset = ir->offset->as_constant();
2197 assert(offset);
2198
2199 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2200 src_reg src = coordinate;
2201 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2202 BRW_GET_SWZ(src.swizzle, j),
2203 BRW_GET_SWZ(src.swizzle, j),
2204 BRW_GET_SWZ(src.swizzle, j));
2205 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2206 src, offset->value.i[j]));
2207 }
2208 } else {
2209 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2210 coordinate));
2211 }
2212 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2213 src_reg(0)));
2214 /* Load the shadow comparitor */
2215 if (ir->shadow_comparitor) {
2216 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2217 WRITEMASK_X),
2218 shadow_comparitor));
2219 inst->mlen++;
2220 }
2221
2222 /* Load the LOD info */
2223 if (ir->op == ir_tex || ir->op == ir_txl) {
2224 int mrf, writemask;
2225 if (intel->gen >= 5) {
2226 mrf = param_base + 1;
2227 if (ir->shadow_comparitor) {
2228 writemask = WRITEMASK_Y;
2229 /* mlen already incremented */
2230 } else {
2231 writemask = WRITEMASK_X;
2232 inst->mlen++;
2233 }
2234 } else /* intel->gen == 4 */ {
2235 mrf = param_base;
2236 writemask = WRITEMASK_Z;
2237 }
2238 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2239 } else if (ir->op == ir_txf) {
2240 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2241 } else if (ir->op == ir_txf_ms) {
2242 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2243 sample_index));
2244 inst->mlen++;
2245
2246 /* On Gen7, there is an additional MCS parameter here after SI,
2247 * but we don't bother to emit it since it's always zero. If
2248 * we start supporting texturing from CMS surfaces, this will have
2249 * to change.
2250 */
2251 } else if (ir->op == ir_txd) {
2252 const glsl_type *type = lod_type;
2253
2254 if (intel->gen >= 5) {
2255 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2256 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2257 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2258 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2259 inst->mlen++;
2260
2261 if (ir->type->vector_elements == 3) {
2262 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2263 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2264 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2265 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2266 inst->mlen++;
2267 }
2268 } else /* intel->gen == 4 */ {
2269 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2270 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2271 inst->mlen += 2;
2272 }
2273 }
2274 }
2275
2276 emit(inst);
2277
2278 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2279 * spec requires layers.
2280 */
2281 if (ir->op == ir_txs) {
2282 glsl_type const *type = ir->sampler->type;
2283 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2284 type->sampler_array) {
2285 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2286 with_writemask(inst->dst, WRITEMASK_Z),
2287 src_reg(inst->dst), src_reg(6));
2288 }
2289 }
2290
2291 swizzle_result(ir, src_reg(inst->dst), sampler);
2292 }
2293
2294 void
2295 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2296 {
2297 int s = c->key.tex.swizzles[sampler];
2298
2299 this->result = src_reg(this, ir->type);
2300 dst_reg swizzled_result(this->result);
2301
2302 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2303 || s == SWIZZLE_NOOP) {
2304 emit(MOV(swizzled_result, orig_val));
2305 return;
2306 }
2307
2308 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2309 int swizzle[4];
2310
2311 for (int i = 0; i < 4; i++) {
2312 switch (GET_SWZ(s, i)) {
2313 case SWIZZLE_ZERO:
2314 zero_mask |= (1 << i);
2315 break;
2316 case SWIZZLE_ONE:
2317 one_mask |= (1 << i);
2318 break;
2319 default:
2320 copy_mask |= (1 << i);
2321 swizzle[i] = GET_SWZ(s, i);
2322 break;
2323 }
2324 }
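/* E.g. a key swizzle of (R, G, B, ONE) copies xyz from the texture result
* below and writes 1.0f into w.
*/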
2325
2326 if (copy_mask) {
2327 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2328 swizzled_result.writemask = copy_mask;
2329 emit(MOV(swizzled_result, orig_val));
2330 }
2331
2332 if (zero_mask) {
2333 swizzled_result.writemask = zero_mask;
2334 emit(MOV(swizzled_result, src_reg(0.0f)));
2335 }
2336
2337 if (one_mask) {
2338 swizzled_result.writemask = one_mask;
2339 emit(MOV(swizzled_result, src_reg(1.0f)));
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::visit(ir_return *ir)
2345 {
2346 assert(!"not reached");
2347 }
2348
2349 void
2350 vec4_visitor::visit(ir_discard *ir)
2351 {
2352 assert(!"not reached");
2353 }
2354
2355 void
2356 vec4_visitor::visit(ir_if *ir)
2357 {
2358 /* Don't point the annotation at the if statement, because then it plus
2359 * the then and else blocks get printed.
2360 */
2361 this->base_ir = ir->condition;
2362
2363 if (intel->gen == 6) {
2364 emit_if_gen6(ir);
2365 } else {
2366 uint32_t predicate;
2367 emit_bool_to_cond_code(ir->condition, &predicate);
2368 emit(IF(predicate));
2369 }
2370
2371 visit_instructions(&ir->then_instructions);
2372
2373 if (!ir->else_instructions.is_empty()) {
2374 this->base_ir = ir->condition;
2375 emit(BRW_OPCODE_ELSE);
2376
2377 visit_instructions(&ir->else_instructions);
2378 }
2379
2380 this->base_ir = ir->condition;
2381 emit(BRW_OPCODE_ENDIF);
2382 }
2383
2384 void
2385 vec4_visitor::emit_ndc_computation()
2386 {
2387 /* Get the position */
2388 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2389
2390 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2391 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2392 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2393
2394 current_annotation = "NDC";
2395 dst_reg ndc_w = ndc;
2396 ndc_w.writemask = WRITEMASK_W;
2397 src_reg pos_w = pos;
2398 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2399 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2400
2401 dst_reg ndc_xyz = ndc;
2402 ndc_xyz.writemask = WRITEMASK_XYZ;
2403
2404 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2405 }
2406
2407 void
2408 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2409 {
2410 if (intel->gen < 6 &&
2411 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2412 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2413 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2414 dst_reg header1_w = header1;
2415 header1_w.writemask = WRITEMASK_W;
2416 GLuint i;
2417
2418 emit(MOV(header1, 0u));
2419
2420 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2421 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2422
2423 current_annotation = "Point size";
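/* The 1 << 11 scale and the 0x7ff << 8 mask below place the point width
* into an 11-bit fixed-point field starting at bit 8 of the header dword
* (i.e. a u8.3 value); this is inferred from the constants used here.
*/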
2424 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2425 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2426 }
2427
2428 current_annotation = "Clipping flags";
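/* For each user clip plane, set bit i of the header word when the position
* is on the negative side of plane i: the DP4 sets the flag via
* BRW_CONDITIONAL_L, and the OR of (1 << i) is predicated on it.
*/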
2429 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2430 vec4_instruction *inst;
2431
2432 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2433 src_reg(this->userplane[i])));
2434 inst->conditional_mod = BRW_CONDITIONAL_L;
2435
2436 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2437 inst->predicate = BRW_PREDICATE_NORMAL;
2438 }
2439
2440 /* i965 clipping workaround:
2441 * 1) Test for negative RHW
2442 * 2) If set,
2443 * set ndc = (0,0,0,0)
2444 * set ucp[6] = 1
2445 *
2446 * Later, clipping will detect ucp[6] and ensure the primitive is
2447 * clipped against all fixed planes.
2448 */
2449 if (brw->has_negative_rhw_bug) {
2450 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2451 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2452 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2453 vec4_instruction *inst;
2454 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2455 inst->predicate = BRW_PREDICATE_NORMAL;
2456 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2457 inst->predicate = BRW_PREDICATE_NORMAL;
2458 }
2459
2460 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2461 } else if (intel->gen < 6) {
2462 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2463 } else {
2464 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2465 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2466 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2467 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2468 }
2469 }
2470 }
2471
2472 void
2473 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2474 {
2475 if (intel->gen < 6) {
2476 /* Clip distance slots are set aside in gen5, but they are not used. It
2477 * is not clear whether we actually need to set aside space for them,
2478 * but the performance cost is negligible.
2479 */
2480 return;
2481 }
2482
2483 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2484 *
2485 * "If a linked set of shaders forming the vertex stage contains no
2486 * static write to gl_ClipVertex or gl_ClipDistance, but the
2487 * application has requested clipping against user clip planes through
2488 * the API, then the coordinate written to gl_Position is used for
2489 * comparison against the user clip planes."
2490 *
2491 * This function is only called if the shader didn't write to
2492 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2493 * if the user wrote to it; otherwise we use gl_Position.
2494 */
2495 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2496 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2497 clip_vertex = VARYING_SLOT_POS;
2498 }
2499
2500 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2501 ++i) {
2502 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2503 src_reg(output_reg[clip_vertex]),
2504 src_reg(this->userplane[i + offset])));
2505 }
2506 }
2507
2508 void
2509 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2510 {
2511 assert (varying < VARYING_SLOT_MAX);
2512 reg.type = output_reg[varying].type;
2513 current_annotation = output_reg_annotation[varying];
2514 /* Copy the register, saturating if necessary */
2515 vec4_instruction *inst = emit(MOV(reg,
2516 src_reg(output_reg[varying])));
2517 if ((varying == VARYING_SLOT_COL0 ||
2518 varying == VARYING_SLOT_COL1 ||
2519 varying == VARYING_SLOT_BFC0 ||
2520 varying == VARYING_SLOT_BFC1) &&
2521 c->key.clamp_vertex_color) {
2522 inst->saturate = true;
2523 }
2524 }
2525
2526 void
2527 vec4_visitor::emit_urb_slot(int mrf, int varying)
2528 {
2529 struct brw_reg hw_reg = brw_message_reg(mrf);
2530 dst_reg reg = dst_reg(MRF, mrf);
2531 reg.type = BRW_REGISTER_TYPE_F;
2532
2533 switch (varying) {
2534 case VARYING_SLOT_PSIZ:
2535 /* PSIZ is always in slot 0, and is coupled with other flags. */
2536 current_annotation = "indices, point width, clip flags";
2537 emit_psiz_and_flags(hw_reg);
2538 break;
2539 case BRW_VARYING_SLOT_NDC:
2540 current_annotation = "NDC";
2541 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2542 break;
2543 case BRW_VARYING_SLOT_POS_DUPLICATE:
2544 case VARYING_SLOT_POS:
2545 current_annotation = "gl_Position";
2546 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2547 break;
2548 case VARYING_SLOT_CLIP_DIST0:
2549 case VARYING_SLOT_CLIP_DIST1:
2550 if (this->c->key.uses_clip_distance) {
2551 emit_generic_urb_slot(reg, varying);
2552 } else {
2553 current_annotation = "user clip distances";
2554 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2555 }
2556 break;
2557 case VARYING_SLOT_EDGE:
2558 /* This is present when doing unfilled polygons. We're supposed to copy
2559 * the edge flag from the user-provided vertex array
2560 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2561 * of that attribute (starts as 1.0f). This is then used in clipping to
2562 * determine which edges should be drawn as wireframe.
2563 */
2564 current_annotation = "edge flag";
2565 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2566 glsl_type::float_type, WRITEMASK_XYZW))));
2567 break;
2568 case BRW_VARYING_SLOT_PAD:
2569 /* No need to write to this slot */
2570 break;
2571 default:
2572 emit_generic_urb_slot(reg, varying);
2573 break;
2574 }
2575 }
2576
2577 static int
2578 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2579 {
2580 struct intel_context *intel = &brw->intel;
2581
2582 if (intel->gen >= 6) {
2583 /* URB data written (does not include the message header reg) must
2584 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2585 * section 5.4.3.2.2: URB_INTERLEAVED.
2586 *
2587 * URB entries are allocated on a multiple of 1024 bits, so an
2588 * extra 128 bits written here to make the end align to 256 is
2589 * no problem.
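*
* Note that mlen here still includes the one header register, so rounding
* mlen up to an odd value makes the data portion (mlen - 1) an even number
* of registers, i.e. a multiple of 256 bits.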
2590 */
2591 if ((mlen % 2) != 1)
2592 mlen++;
2593 }
2594
2595 return mlen;
2596 }
2597
2598 /**
2599 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2600 * complete the VS thread.
2601 *
2602 * The VUE layout is documented in Volume 2a.
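*
* The payload built here is one message-header MRF followed by one MRF per
* VUE slot. Because the writes are interleaved, each MRF covers half of a
* URB row; if the slots don't all fit below max_usable_mrf, a second URB
* write finishes the entry at the matching row offset.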
2603 */
2604 void
2605 vec4_visitor::emit_urb_writes()
2606 {
2607 /* MRF 0 is reserved for the debugger, so start with message header
2608 * in MRF 1.
2609 */
2610 int base_mrf = 1;
2611 int mrf = base_mrf;
2612 /* In the process of generating our URB write message contents, we
2613 * may need to unspill a register or load from an array. Those
2614 * reads would use MRFs 14-15.
2615 */
2616 int max_usable_mrf = 13;
2617
2618 /* The following assertion verifies that max_usable_mrf causes an
2619 * even-numbered amount of URB write data, which will meet gen6's
2620 * requirements for length alignment.
2621 */
2622 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2623
2624 /* First mrf is the g0-based message header containing URB handles and such,
2625 * which is implied in VS_OPCODE_URB_WRITE.
2626 */
2627 mrf++;
2628
2629 if (intel->gen < 6) {
2630 emit_ndc_computation();
2631 }
2632
2633 /* Set up the VUE data for the first URB write */
2634 int slot;
2635 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2636 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2637
2638 /* If this was max_usable_mrf, we can't fit anything more into this URB
2639 * WRITE.
2640 */
2641 if (mrf > max_usable_mrf) {
2642 slot++;
2643 break;
2644 }
2645 }
2646
2647 bool eot = slot >= prog_data->vue_map.num_slots;
2648 if (eot) {
2649 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2650 emit_shader_time_end();
2651 }
2652 current_annotation = "URB write";
2653 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2654 inst->base_mrf = base_mrf;
2655 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2656 inst->eot = eot;
2657
2658 /* Optional second URB write */
2659 if (!inst->eot) {
2660 mrf = base_mrf + 1;
2661
2662 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2663 assert(mrf < max_usable_mrf);
2664
2665 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2666 }
2667
2668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2669 emit_shader_time_end();
2670
2671 current_annotation = "URB write";
2672 inst = emit(VS_OPCODE_URB_WRITE);
2673 inst->base_mrf = base_mrf;
2674 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2675 inst->eot = true;
2676 /* URB destination offset. In the previous write, we got MRFs
2677 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2678 * URB row increments, and each of our MRFs is half of one of
2679 * those, since we're doing interleaved writes.
2680 */
2681 inst->offset = (max_usable_mrf - base_mrf) / 2;
2682 }
2683 }
2684
2685 src_reg
2686 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2687 src_reg *reladdr, int reg_offset)
2688 {
2689 /* Because we store the values to scratch interleaved like our
2690 * vertex data, we need to scale the vec4 index by 2.
2691 */
2692 int message_header_scale = 2;
2693
2694 /* Pre-gen6, the message header uses byte offsets instead of vec4
2695 * (16-byte) offset units.
2696 */
2697 if (intel->gen < 6)
2698 message_header_scale *= 16;
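/* E.g. with no reladdr, reg_offset 3 yields src_reg(6) (vec4 units) on
* gen6+ or src_reg(96) (bytes) on pre-gen6; with a reladdr, the same scale
* is applied after the reladdr + reg_offset add below.
*/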
2699
2700 if (reladdr) {
2701 src_reg index = src_reg(this, glsl_type::int_type);
2702
2703 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2704 emit_before(inst, MUL(dst_reg(index),
2705 index, src_reg(message_header_scale)));
2706
2707 return index;
2708 } else {
2709 return src_reg(reg_offset * message_header_scale);
2710 }
2711 }
2712
2713 src_reg
2714 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2715 src_reg *reladdr, int reg_offset)
2716 {
2717 if (reladdr) {
2718 src_reg index = src_reg(this, glsl_type::int_type);
2719
2720 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2721
2722 /* Pre-gen6, the message header uses byte offsets instead of vec4
2723 * (16-byte) offset units.
2724 */
2725 if (intel->gen < 6) {
2726 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2727 }
2728
2729 return index;
2730 } else {
2731 int message_header_scale = intel->gen < 6 ? 16 : 1;
2732 return src_reg(reg_offset * message_header_scale);
2733 }
2734 }
2735
2736 /**
2737 * Emits an instruction before @inst to load the value named by @orig_src
2738 * from scratch space at @base_offset to @temp.
2739 *
2740 * @base_offset is measured in 32-byte units (the size of a register).
2741 */
2742 void
2743 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2744 dst_reg temp, src_reg orig_src,
2745 int base_offset)
2746 {
2747 int reg_offset = base_offset + orig_src.reg_offset;
2748 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2749
2750 emit_before(inst, SCRATCH_READ(temp, index));
2751 }
2752
2753 /**
2754 * Emits an instruction after @inst to store the value to be written
2755 * to @orig_dst to scratch space at @base_offset, from @temp.
2756 *
2757 * @base_offset is measured in 32-byte units (the size of a register).
2758 */
2759 void
2760 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2761 {
2762 int reg_offset = base_offset + inst->dst.reg_offset;
2763 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2764
2765 /* Create a temporary register to store *inst's result in.
2766 *
2767 * We have to be careful in MOVing from our temporary result register in
2768 * the scratch write. If we swizzle from channels of the temporary that
2769 * weren't initialized, it will confuse live interval analysis, which will
2770 * make spilling fail to make progress.
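*
* For example, a .yw destination writemask gives temp a .yyyw read swizzle
* below, so the scratch write only reads channels inst actually wrote.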
2771 */
2772 src_reg temp = src_reg(this, glsl_type::vec4_type);
2773 temp.type = inst->dst.type;
2774 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2775 int swizzles[4];
2776 for (int i = 0; i < 4; i++)
2777 if (inst->dst.writemask & (1 << i))
2778 swizzles[i] = i;
2779 else
2780 swizzles[i] = first_writemask_chan;
2781 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2782 swizzles[2], swizzles[3]);
2783
2784 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2785 inst->dst.writemask));
2786 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2787 write->predicate = inst->predicate;
2788 write->ir = inst->ir;
2789 write->annotation = inst->annotation;
2790 inst->insert_after(write);
2791
2792 inst->dst.file = temp.file;
2793 inst->dst.reg = temp.reg;
2794 inst->dst.reg_offset = temp.reg_offset;
2795 inst->dst.reladdr = NULL;
2796 }
2797
2798 /**
2799 * We can't generally support array access in GRF space, because a
2800 * single instruction's destination can only span 2 contiguous
2801 * registers. So, we send all GRF arrays that get variable index
2802 * access to scratch space.
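*
* For example, a "vec4 arr[8]" temporary indexed by a non-constant value:
* each arr[i] read becomes a SCRATCH_READ into a fresh temporary, and each
* arr[i] write gets a SCRATCH_WRITE appended after it.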
2803 */
2804 void
2805 vec4_visitor::move_grf_array_access_to_scratch()
2806 {
2807 int scratch_loc[this->virtual_grf_count];
2808
2809 for (int i = 0; i < this->virtual_grf_count; i++) {
2810 scratch_loc[i] = -1;
2811 }
2812
2813 /* First, calculate the set of virtual GRFs that need to be punted
2814 * to scratch due to having any array access on them, and where in
2815 * scratch.
2816 */
2817 foreach_list(node, &this->instructions) {
2818 vec4_instruction *inst = (vec4_instruction *)node;
2819
2820 if (inst->dst.file == GRF && inst->dst.reladdr &&
2821 scratch_loc[inst->dst.reg] == -1) {
2822 scratch_loc[inst->dst.reg] = c->base.last_scratch;
2823 c->base.last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2824 }
2825
2826 for (int i = 0 ; i < 3; i++) {
2827 src_reg *src = &inst->src[i];
2828
2829 if (src->file == GRF && src->reladdr &&
2830 scratch_loc[src->reg] == -1) {
2831 scratch_loc[src->reg] = c->base.last_scratch;
2832 c->base.last_scratch += this->virtual_grf_sizes[src->reg];
2833 }
2834 }
2835 }
2836
2837 /* Now, for anything that will be accessed through scratch, rewrite
2838 * it to load/store. Note that this is a _safe list walk, because
2839 * we may generate a new scratch_write instruction after the one
2840 * we're processing.
2841 */
2842 foreach_list_safe(node, &this->instructions) {
2843 vec4_instruction *inst = (vec4_instruction *)node;
2844
2845 /* Set up the annotation tracking for new generated instructions. */
2846 base_ir = inst->ir;
2847 current_annotation = inst->annotation;
2848
2849 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2850 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2851 }
2852
2853 for (int i = 0 ; i < 3; i++) {
2854 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2855 continue;
2856
2857 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2858
2859 emit_scratch_read(inst, temp, inst->src[i],
2860 scratch_loc[inst->src[i].reg]);
2861
2862 inst->src[i].file = temp.file;
2863 inst->src[i].reg = temp.reg;
2864 inst->src[i].reg_offset = temp.reg_offset;
2865 inst->src[i].reladdr = NULL;
2866 }
2867 }
2868 }
2869
2870 /**
2871 * Emits an instruction before @inst to load the value named by @orig_src
2872 * from the pull constant buffer (surface) at @base_offset to @temp.
2873 */
2874 void
2875 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2876 dst_reg temp, src_reg orig_src,
2877 int base_offset)
2878 {
2879 int reg_offset = base_offset + orig_src.reg_offset;
2880 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2881 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2882 vec4_instruction *load;
2883
2884 if (intel->gen >= 7) {
2885 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2886 grf_offset.type = offset.type;
2887 emit_before(inst, MOV(grf_offset, offset));
2888
2889 load = new(mem_ctx) vec4_instruction(this,
2890 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2891 temp, index, src_reg(grf_offset));
2892 } else {
2893 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2894 temp, index, offset);
2895 load->base_mrf = 14;
2896 load->mlen = 1;
2897 }
2898 emit_before(inst, load);
2899 }
2900
2901 /**
2902 * Implements array access of uniforms by inserting a
2903 * PULL_CONSTANT_LOAD instruction.
2904 *
2905 * Unlike temporary GRF array access (where we don't support it due to
2906 * the difficulty of doing relative addressing on instruction
2907 * destinations), we could potentially do array access of uniforms
2908 * that were loaded in GRF space as push constants. In real-world
2909 * usage we've seen, though, the arrays being used are always larger
2910 * than we could load as push constants, so just always move all
2911 * uniform array access out to a pull constant buffer.
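*
* For example, "uniform vec4 colors[16]" indexed by a non-constant value:
* the whole array is copied into pull_param, and each such access becomes
* a pull constant load from the constant buffer surface.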
2912 */
2913 void
2914 vec4_visitor::move_uniform_array_access_to_pull_constants()
2915 {
2916 int pull_constant_loc[this->uniforms];
2917
2918 for (int i = 0; i < this->uniforms; i++) {
2919 pull_constant_loc[i] = -1;
2920 }
2921
2922 /* Walk through and find array access of uniforms. Put a copy of that
2923 * uniform in the pull constant buffer.
2924 *
2925 * Note that we don't move constant-indexed accesses to arrays. No
2926 * testing has been done of the performance impact of this choice.
2927 */
2928 foreach_list_safe(node, &this->instructions) {
2929 vec4_instruction *inst = (vec4_instruction *)node;
2930
2931 for (int i = 0 ; i < 3; i++) {
2932 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2933 continue;
2934
2935 int uniform = inst->src[i].reg;
2936
2937 /* If this array isn't already present in the pull constant buffer,
2938 * add it.
2939 */
2940 if (pull_constant_loc[uniform] == -1) {
2941 const float **values = &prog_data->param[uniform * 4];
2942
2943 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2944
2945 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2946 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2947 }
2948 }
2949
2950 /* Set up the annotation tracking for new generated instructions. */
2951 base_ir = inst->ir;
2952 current_annotation = inst->annotation;
2953
2954 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2955
2956 emit_pull_constant_load(inst, temp, inst->src[i],
2957 pull_constant_loc[uniform]);
2958
2959 inst->src[i].file = temp.file;
2960 inst->src[i].reg = temp.reg;
2961 inst->src[i].reg_offset = temp.reg_offset;
2962 inst->src[i].reladdr = NULL;
2963 }
2964 }
2965
2966 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2967 * no need to track them as larger-than-vec4 objects. This will be
2968 * relied on in cutting out unused uniform vectors from push
2969 * constants.
2970 */
2971 split_uniform_registers();
2972 }
2973
2974 void
2975 vec4_visitor::resolve_ud_negate(src_reg *reg)
2976 {
2977 if (reg->type != BRW_REGISTER_TYPE_UD ||
2978 !reg->negate)
2979 return;
2980
2981 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2982 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2983 *reg = temp;
2984 }
2985
2986 vec4_visitor::vec4_visitor(struct brw_context *brw,
2987 struct brw_vs_compile *c,
2988 struct brw_vs_prog_data *prog_data,
2989 struct gl_shader_program *shader_prog,
2990 struct brw_shader *shader,
2991 void *mem_ctx)
2992 {
2993 this->c = c;
2994 this->brw = brw;
2995 this->intel = &brw->intel;
2996 this->ctx = &intel->ctx;
2997 this->shader_prog = shader_prog;
2998 this->shader = shader;
2999
3000 this->mem_ctx = mem_ctx;
3001 this->failed = false;
3002
3003 this->base_ir = NULL;
3004 this->current_annotation = NULL;
3005 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3006
3008 this->prog = &c->vp->program.Base;
3009 this->prog_data = prog_data;
3010
3011 this->variable_ht = hash_table_ctor(0,
3012 hash_table_pointer_hash,
3013 hash_table_pointer_compare);
3014
3015 this->virtual_grf_def = NULL;
3016 this->virtual_grf_use = NULL;
3017 this->virtual_grf_sizes = NULL;
3018 this->virtual_grf_count = 0;
3019 this->virtual_grf_reg_map = NULL;
3020 this->virtual_grf_reg_count = 0;
3021 this->virtual_grf_array_size = 0;
3022 this->live_intervals_valid = false;
3023
3024 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3025
3026 this->uniforms = 0;
3027 }
3028
3029 vec4_visitor::~vec4_visitor()
3030 {
3031 hash_table_dtor(this->variable_ht);
3032 }
3033
3034
3035 void
3036 vec4_visitor::fail(const char *format, ...)
3037 {
3038 va_list va;
3039 char *msg;
3040
3041 if (failed)
3042 return;
3043
3044 failed = true;
3045
3046 va_start(va, format);
3047 msg = ralloc_vasprintf(mem_ctx, format, va);
3048 va_end(va);
3049 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3050
3051 this->fail_msg = msg;
3052
3053 if (INTEL_DEBUG & DEBUG_VS) {
3054 fprintf(stderr, "%s", msg);
3055 }
3056 }
3057
3058 } /* namespace brw */