src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_vs.h"
  26 #include "glsl/ir_uniform.h"
  27 extern "C" {
  28 #include "main/context.h"
  29 #include "main/macros.h"
  30 #include "program/prog_parameter.h"
  31 #include "program/sampler.h"
  32 }
  33
  34 namespace brw {
  35
  36 vec4_instruction::vec4_instruction(vec4_visitor *v,
  37                                    enum opcode opcode, dst_reg dst,
  38                                    src_reg src0, src_reg src1, src_reg src2)
  39 {
  40    this->opcode = opcode;
  41    this->dst = dst;
  42    this->src[0] = src0;
  43    this->src[1] = src1;
  44    this->src[2] = src2;
  45    this->ir = v->base_ir;
  46    this->annotation = v->current_annotation;
  47 }
  48
  49 vec4_instruction *
  50 vec4_visitor::emit(vec4_instruction *inst)
  51 {
  52    this->instructions.push_tail(inst);
  53
  54    return inst;
  55 }
  56
  57 vec4_instruction *
  58 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  59 {
  60    new_inst->ir = inst->ir;
  61    new_inst->annotation = inst->annotation;
  62
  63    inst->insert_before(new_inst);
  64
  65    return inst;
  66 }
  67
  68 vec4_instruction *
  69 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  70                    src_reg src0, src_reg src1, src_reg src2)
  71 {
  72    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  73                                              src0, src1, src2));
  74 }
  75
  76
  77 vec4_instruction *
  78 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  79 {
  80    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  81 }
  82
  83 vec4_instruction *
  84 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  85 {
  86    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  87 }
  88
  89 vec4_instruction *
  90 vec4_visitor::emit(enum opcode opcode)
  91 {
  92    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  93 }
  94
  95 #define ALU1(op)                                                        \
  96    vec4_instruction *                                                   \
  97    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  98    {                                                                    \
  99       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 100                                            src0);                       \
 101    }
 102
 103 #define ALU2(op)                                                        \
 104    vec4_instruction *                                                   \
 105    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 106    {                                                                    \
 107       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 108                                            src0, src1);                 \
 109    }
 110
 111 #define ALU3(op)                                                        \
 112    vec4_instruction *                                                   \
 113    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 114    {                                                                    \
 115       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 116                                            src0, src1, src2);           \
 117    }
 118
 119 ALU1(NOT)
 120 ALU1(MOV)
 121 ALU1(FRC)
 122 ALU1(RNDD)
 123 ALU1(RNDE)
 124 ALU1(RNDZ)
 125 ALU1(F32TO16)
 126 ALU1(F16TO32)
 127 ALU2(ADD)
 128 ALU2(MUL)
 129 ALU2(MACH)
 130 ALU2(AND)
 131 ALU2(OR)
 132 ALU2(XOR)
 133 ALU2(DP3)
 134 ALU2(DP4)
 135 ALU2(DPH)
 136 ALU2(SHL)
 137 ALU2(SHR)
 138 ALU2(ASR)
 139 ALU3(LRP)
 140 ALU1(BFREV)
 141 ALU3(BFE)
 142 ALU2(BFI1)
 143 ALU3(BFI2)
 144 ALU1(FBH)
 145 ALU1(FBL)
 146 ALU1(CBIT)
 147 ALU3(MAD)
 148
 149 /** Gen4 predicated IF. */
 150 vec4_instruction *
 151 vec4_visitor::IF(uint32_t predicate)
 152 {
 153    vec4_instruction *inst;
 154
 155    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 156    inst->predicate = predicate;
 157
 158    return inst;
 159 }
 160
 161 /** Gen6+ IF with embedded comparison. */
 162 vec4_instruction *
 163 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 164 {
 165    assert(brw->gen >= 6);
 166
 167    vec4_instruction *inst;
 168
 169    resolve_ud_negate(&src0);
 170    resolve_ud_negate(&src1);
 171
 172    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 173                                         src0, src1);
 174    inst->conditional_mod = condition;
 175
 176    return inst;
 177 }
 178
 179 /**
 180  * CMP: Sets the low bit of the destination channels with the result
 181  * of the comparison, while the upper bits are undefined, and updates
 182  * the flag register with the packed 16 bits of the result.
 183  */
 184 vec4_instruction *
 185 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 186 {
 187    vec4_instruction *inst;
 188
 189    /* original gen4 does type conversion to the destination type
 190     * before before comparison, producing garbage results for floating
 191     * point comparisons.
 192     */
 193    if (brw->gen == 4) {
 194       dst.type = src0.type;
 195       if (dst.file == HW_REG)
 196          dst.fixed_hw_reg.type = dst.type;
 197    }
 198
 199    resolve_ud_negate(&src0);
 200    resolve_ud_negate(&src1);
 201
 202    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 203    inst->conditional_mod = condition;
 204
 205    return inst;
 206 }
 207
 208 vec4_instruction *
 209 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 210 {
 211    vec4_instruction *inst;
 212
 213    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 214                                         dst, index);
 215    inst->base_mrf = 14;
 216    inst->mlen = 2;
 217
 218    return inst;
 219 }
 220
 221 vec4_instruction *
 222 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 223 {
 224    vec4_instruction *inst;
 225
 226    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 227                                         dst, src, index);
 228    inst->base_mrf = 13;
 229    inst->mlen = 3;
 230
 231    return inst;
 232 }
 233
 234 void
 235 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 236 {
 237    static enum opcode dot_opcodes[] = {
 238       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 239    };
 240
 241    emit(dot_opcodes[elements - 2], dst, src0, src1);
 242 }
 243
 244 src_reg
 245 vec4_visitor::fix_3src_operand(src_reg src)
 246 {
 247    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 248     * able to use vertical stride of zero to replicate the vec4 uniform, like
 249     *
 250     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 251     *
 252     * But you can't, since vertical stride is always four in three-source
 253     * instructions. Instead, insert a MOV instruction to do the replication so
 254     * that the three-source instruction can consume it.
 255     */
 256
 257    /* The MOV is only needed if the source is a uniform or immediate. */
 258    if (src.file != UNIFORM && src.file != IMM)
 259       return src;
 260
 261    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 262    expanded.type = src.type;
 263    emit(MOV(expanded, src));
 264    return src_reg(expanded);
 265 }
 266
 267 src_reg
 268 vec4_visitor::fix_math_operand(src_reg src)
 269 {
 270    /* The gen6 math instruction ignores the source modifiers --
 271     * swizzle, abs, negate, and at least some parts of the register
 272     * region description.
 273     *
 274     * Rather than trying to enumerate all these cases, *always* expand the
 275     * operand to a temp GRF for gen6.
 276     *
 277     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 278     * can't use.
 279     */
 280
 281    if (brw->gen == 7 && src.file != IMM)
 282       return src;
 283
 284    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 285    expanded.type = src.type;
 286    emit(MOV(expanded, src));
 287    return src_reg(expanded);
 288 }
 289
 290 void
 291 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 292 {
 293    src = fix_math_operand(src);
 294
 295    if (dst.writemask != WRITEMASK_XYZW) {
 296       /* The gen6 math instruction must be align1, so we can't do
 297        * writemasks.
 298        */
 299       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 300
 301       emit(opcode, temp_dst, src);
 302
 303       emit(MOV(dst, src_reg(temp_dst)));
 304    } else {
 305       emit(opcode, dst, src);
 306    }
 307 }
 308
 309 void
 310 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 311 {
 312    vec4_instruction *inst = emit(opcode, dst, src);
 313    inst->base_mrf = 1;
 314    inst->mlen = 1;
 315 }
 316
 317 void
 318 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 319 {
 320    switch (opcode) {
 321    case SHADER_OPCODE_RCP:
 322    case SHADER_OPCODE_RSQ:
 323    case SHADER_OPCODE_SQRT:
 324    case SHADER_OPCODE_EXP2:
 325    case SHADER_OPCODE_LOG2:
 326    case SHADER_OPCODE_SIN:
 327    case SHADER_OPCODE_COS:
 328       break;
 329    default:
 330       assert(!"not reached: bad math opcode");
 331       return;
 332    }
 333
 334    if (brw->gen >= 6) {
 335       return emit_math1_gen6(opcode, dst, src);
 336    } else {
 337       return emit_math1_gen4(opcode, dst, src);
 338    }
 339 }
 340
 341 void
 342 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 343                               dst_reg dst, src_reg src0, src_reg src1)
 344 {
 345    src0 = fix_math_operand(src0);
 346    src1 = fix_math_operand(src1);
 347
 348    if (dst.writemask != WRITEMASK_XYZW) {
 349       /* The gen6 math instruction must be align1, so we can't do
 350        * writemasks.
 351        */
 352       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 353       temp_dst.type = dst.type;
 354
 355       emit(opcode, temp_dst, src0, src1);
 356
 357       emit(MOV(dst, src_reg(temp_dst)));
 358    } else {
 359       emit(opcode, dst, src0, src1);
 360    }
 361 }
 362
 363 void
 364 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 365                               dst_reg dst, src_reg src0, src_reg src1)
 366 {
 367    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 368    inst->base_mrf = 1;
 369    inst->mlen = 2;
 370 }
 371
 372 void
 373 vec4_visitor::emit_math(enum opcode opcode,
 374                         dst_reg dst, src_reg src0, src_reg src1)
 375 {
 376    switch (opcode) {
 377    case SHADER_OPCODE_POW:
 378    case SHADER_OPCODE_INT_QUOTIENT:
 379    case SHADER_OPCODE_INT_REMAINDER:
 380       break;
 381    default:
 382       assert(!"not reached: unsupported binary math opcode");
 383       return;
 384    }
 385
 386    if (brw->gen >= 6) {
 387       return emit_math2_gen6(opcode, dst, src0, src1);
 388    } else {
 389       return emit_math2_gen4(opcode, dst, src0, src1);
 390    }
 391 }
 392
 393 void
 394 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 395 {
 396    if (brw->gen < 7)
 397       assert(!"ir_unop_pack_half_2x16 should be lowered");
 398
 399    assert(dst.type == BRW_REGISTER_TYPE_UD);
 400    assert(src0.type == BRW_REGISTER_TYPE_F);
 401
 402    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 403     *
 404     *   Because this instruction does not have a 16-bit floating-point type,
 405     *   the destination data type must be Word (W).
 406     *
 407     *   The destination must be DWord-aligned and specify a horizontal stride
 408     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 409     *   each destination channel and the upper word is not modified.
 410     *
 411     * The above restriction implies that the f32to16 instruction must use
 412     * align1 mode, because only in align1 mode is it possible to specify
 413     * horizontal stride.  We choose here to defy the hardware docs and emit
 414     * align16 instructions.
 415     *
 416     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 417     * instructions. I was partially successful in that the code passed all
 418     * tests.  However, the code was dubiously correct and fragile, and the
 419     * tests were not harsh enough to probe that frailty. Not trusting the
 420     * code, I chose instead to remain in align16 mode in defiance of the hw
 421     * docs).
 422     *
 423     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 424     * simulator, emitting a f32to16 in align16 mode with UD as destination
 425     * data type is safe. The behavior differs from that specified in the PRM
 426     * in that the upper word of each destination channel is cleared to 0.
 427     */
 428
 429    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 430    src_reg tmp_src(tmp_dst);
 431
 432 #if 0
 433    /* Verify the undocumented behavior on which the following instructions
 434     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 435     * then the result of the bit-or instruction below will be incorrect.
 436     *
 437     * You should inspect the disasm output in order to verify that the MOV is
 438     * not optimized away.
 439     */
 440    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 441 #endif
 442
 443    /* Give tmp the form below, where "." means untouched.
 444     *
 445     *     w z          y          x w z          y          x
 446     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 447     *
 448     * That the upper word of each write-channel be 0 is required for the
 449     * following bit-shift and bit-or instructions to work. Note that this
 450     * relies on the undocumented hardware behavior mentioned above.
 451     */
 452    tmp_dst.writemask = WRITEMASK_XY;
 453    emit(F32TO16(tmp_dst, src0));
 454
 455    /* Give the write-channels of dst the form:
 456     *   0xhhhh0000
 457     */
 458    tmp_src.swizzle = SWIZZLE_Y;
 459    emit(SHL(dst, tmp_src, src_reg(16u)));
 460
 461    /* Finally, give the write-channels of dst the form of packHalf2x16's
 462     * output:
 463     *   0xhhhhllll
 464     */
 465    tmp_src.swizzle = SWIZZLE_X;
 466    emit(OR(dst, src_reg(dst), tmp_src));
 467 }
 468
 469 void
 470 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 471 {
 472    if (brw->gen < 7)
 473       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 474
 475    assert(dst.type == BRW_REGISTER_TYPE_F);
 476    assert(src0.type == BRW_REGISTER_TYPE_UD);
 477
 478    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 479     *
 480     *   Because this instruction does not have a 16-bit floating-point type,
 481     *   the source data type must be Word (W). The destination type must be
 482     *   F (Float).
 483     *
 484     * To use W as the source data type, we must adjust horizontal strides,
 485     * which is only possible in align1 mode. All my [chadv] attempts at
 486     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 487     * Piglit tests, so I gave up.
 488     *
 489     * I've verified that, on gen7 hardware and the simulator, it is safe to
 490     * emit f16to32 in align16 mode with UD as source data type.
 491     */
 492
 493    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 494    src_reg tmp_src(tmp_dst);
 495
 496    tmp_dst.writemask = WRITEMASK_X;
 497    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 498
 499    tmp_dst.writemask = WRITEMASK_Y;
 500    emit(SHR(tmp_dst, src0, src_reg(16u)));
 501
 502    dst.writemask = WRITEMASK_XY;
 503    emit(F16TO32(dst, tmp_src));
 504 }
 505
 506 void
 507 vec4_visitor::visit_instructions(const exec_list *list)
 508 {
 509    foreach_list(node, list) {
 510       ir_instruction *ir = (ir_instruction *)node;
 511
 512       base_ir = ir;
 513       ir->accept(this);
 514    }
 515 }
 516
 517
 518 static int
 519 type_size(const struct glsl_type *type)
 520 {
 521    unsigned int i;
 522    int size;
 523
 524    switch (type->base_type) {
 525    case GLSL_TYPE_UINT:
 526    case GLSL_TYPE_INT:
 527    case GLSL_TYPE_FLOAT:
 528    case GLSL_TYPE_BOOL:
 529       if (type->is_matrix()) {
 530          return type->matrix_columns;
 531       } else {
 532          /* Regardless of size of vector, it gets a vec4. This is bad
 533           * packing for things like floats, but otherwise arrays become a
 534           * mess.  Hopefully a later pass over the code can pack scalars
 535           * down if appropriate.
 536           */
 537          return 1;
 538       }
 539    case GLSL_TYPE_ARRAY:
 540       assert(type->length > 0);
 541       return type_size(type->fields.array) * type->length;
 542    case GLSL_TYPE_STRUCT:
 543       size = 0;
 544       for (i = 0; i < type->length; i++) {
 545          size += type_size(type->fields.structure[i].type);
 546       }
 547       return size;
 548    case GLSL_TYPE_SAMPLER:
 549       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 550        * at link time.
 551        */
 552       return 1;
 553    case GLSL_TYPE_VOID:
 554    case GLSL_TYPE_ERROR:
 555    case GLSL_TYPE_INTERFACE:
 556       assert(0);
 557       break;
 558    }
 559
 560    return 0;
 561 }
 562
 563 int
 564 vec4_visitor::virtual_grf_alloc(int size)
 565 {
 566    if (virtual_grf_array_size <= virtual_grf_count) {
 567       if (virtual_grf_array_size == 0)
 568          virtual_grf_array_size = 16;
 569       else
 570          virtual_grf_array_size *= 2;
 571       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 572                                    virtual_grf_array_size);
 573       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 574                                      virtual_grf_array_size);
 575    }
 576    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 577    virtual_grf_reg_count += size;
 578    virtual_grf_sizes[virtual_grf_count] = size;
 579    return virtual_grf_count++;
 580 }
 581
 582 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 583 {
 584    init();
 585
 586    this->file = GRF;
 587    this->reg = v->virtual_grf_alloc(type_size(type));
 588
 589    if (type->is_array() || type->is_record()) {
 590       this->swizzle = BRW_SWIZZLE_NOOP;
 591    } else {
 592       this->swizzle = swizzle_for_size(type->vector_elements);
 593    }
 594
 595    this->type = brw_type_for_base_type(type);
 596 }
 597
 598 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 599 {
 600    init();
 601
 602    this->file = GRF;
 603    this->reg = v->virtual_grf_alloc(type_size(type));
 604
 605    if (type->is_array() || type->is_record()) {
 606       this->writemask = WRITEMASK_XYZW;
 607    } else {
 608       this->writemask = (1 << type->vector_elements) - 1;
 609    }
 610
 611    this->type = brw_type_for_base_type(type);
 612 }
 613
 614 /* Our support for uniforms is piggy-backed on the struct
 615  * gl_fragment_program, because that's where the values actually
 616  * get stored, rather than in some global gl_shader_program uniform
 617  * store.
 618  */
 619 void
 620 vec4_visitor::setup_uniform_values(ir_variable *ir)
 621 {
 622    int namelen = strlen(ir->name);
 623
 624    /* The data for our (non-builtin) uniforms is stored in a series of
 625     * gl_uniform_driver_storage structs for each subcomponent that
 626     * glGetUniformLocation() could name.  We know it's been set up in the same
 627     * order we'd walk the type, so walk the list of storage and find anything
 628     * with our name, or the prefix of a component that starts with our name.
 629     */
 630    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 631       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 632
 633       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 634           (storage->name[namelen] != 0 &&
 635            storage->name[namelen] != '.' &&
 636            storage->name[namelen] != '[')) {
 637          continue;
 638       }
 639
 640       gl_constant_value *components = storage->storage;
 641       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 642                                storage->type->matrix_columns);
 643
 644       for (unsigned s = 0; s < vector_count; s++) {
 645          uniform_vector_size[uniforms] = storage->type->vector_elements;
 646
 647          int i;
 648          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 649             prog_data->param[uniforms * 4 + i] = &components->f;
 650             components++;
 651          }
 652          for (; i < 4; i++) {
 653             static float zero = 0;
 654             prog_data->param[uniforms * 4 + i] = &zero;
 655          }
 656
 657          uniforms++;
 658       }
 659    }
 660 }
 661
 662 void
 663 vec4_visitor::setup_uniform_clipplane_values()
 664 {
 665    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 666
 667    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 668       this->uniform_vector_size[this->uniforms] = 4;
 669       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 670       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 671       for (int j = 0; j < 4; ++j) {
 672          prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 673       }
 674       ++this->uniforms;
 675    }
 676 }
 677
 678 /* Our support for builtin uniforms is even scarier than non-builtin.
 679  * It sits on top of the PROG_STATE_VAR parameters that are
 680  * automatically updated from GL context state.
 681  */
 682 void
 683 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 684 {
 685    const ir_state_slot *const slots = ir->state_slots;
 686    assert(ir->state_slots != NULL);
 687
 688    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 689       /* This state reference has already been setup by ir_to_mesa,
 690        * but we'll get the same index back here.  We can reference
 691        * ParameterValues directly, since unlike brw_fs.cpp, we never
 692        * add new state references during compile.
 693        */
 694       int index = _mesa_add_state_reference(this->prog->Parameters,
 695                                             (gl_state_index *)slots[i].tokens);
 696       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 697
 698       this->uniform_vector_size[this->uniforms] = 0;
 699       /* Add each of the unique swizzled channels of the element.
 700        * This will end up matching the size of the glsl_type of this field.
 701        */
 702       int last_swiz = -1;
 703       for (unsigned int j = 0; j < 4; j++) {
 704          int swiz = GET_SWZ(slots[i].swizzle, j);
 705          last_swiz = swiz;
 706
 707          prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 708          if (swiz <= last_swiz)
 709             this->uniform_vector_size[this->uniforms]++;
 710       }
 711       this->uniforms++;
 712    }
 713 }
 714
 715 dst_reg *
 716 vec4_visitor::variable_storage(ir_variable *var)
 717 {
 718    return (dst_reg *)hash_table_find(this->variable_ht, var);
 719 }
 720
 721 void
 722 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 723 {
 724    ir_expression *expr = ir->as_expression();
 725
 726    *predicate = BRW_PREDICATE_NORMAL;
 727
 728    if (expr) {
 729       src_reg op[2];
 730       vec4_instruction *inst;
 731
 732       assert(expr->get_num_operands() <= 2);
 733       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 734          expr->operands[i]->accept(this);
 735          op[i] = this->result;
 736
 737          resolve_ud_negate(&op[i]);
 738       }
 739
 740       switch (expr->operation) {
 741       case ir_unop_logic_not:
 742          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 743          inst->conditional_mod = BRW_CONDITIONAL_Z;
 744          break;
 745
 746       case ir_binop_logic_xor:
 747          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 748          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 749          break;
 750
 751       case ir_binop_logic_or:
 752          inst = emit(OR(dst_null_d(), op[0], op[1]));
 753          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 754          break;
 755
 756       case ir_binop_logic_and:
 757          inst = emit(AND(dst_null_d(), op[0], op[1]));
 758          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 759          break;
 760
 761       case ir_unop_f2b:
 762          if (brw->gen >= 6) {
 763             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 764          } else {
 765             inst = emit(MOV(dst_null_f(), op[0]));
 766             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 767          }
 768          break;
 769
 770       case ir_unop_i2b:
 771          if (brw->gen >= 6) {
 772             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 773          } else {
 774             inst = emit(MOV(dst_null_d(), op[0]));
 775             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 776          }
 777          break;
 778
 779       case ir_binop_all_equal:
 780          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 781          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 782          break;
 783
 784       case ir_binop_any_nequal:
 785          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 786          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 787          break;
 788
 789       case ir_unop_any:
 790          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 791          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 792          break;
 793
 794       case ir_binop_greater:
 795       case ir_binop_gequal:
 796       case ir_binop_less:
 797       case ir_binop_lequal:
 798       case ir_binop_equal:
 799       case ir_binop_nequal:
 800          emit(CMP(dst_null_d(), op[0], op[1],
 801                   brw_conditional_for_comparison(expr->operation)));
 802          break;
 803
 804       default:
 805          assert(!"not reached");
 806          break;
 807       }
 808       return;
 809    }
 810
 811    ir->accept(this);
 812
 813    resolve_ud_negate(&this->result);
 814
 815    if (brw->gen >= 6) {
 816       vec4_instruction *inst = emit(AND(dst_null_d(),
 817                                         this->result, src_reg(1)));
 818       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 819    } else {
 820       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 821       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 822    }
 823 }
 824
 825 /**
 826  * Emit a gen6 IF statement with the comparison folded into the IF
 827  * instruction.
 828  */
 829 void
 830 vec4_visitor::emit_if_gen6(ir_if *ir)
 831 {
 832    ir_expression *expr = ir->condition->as_expression();
 833
 834    if (expr) {
 835       src_reg op[2];
 836       dst_reg temp;
 837
 838       assert(expr->get_num_operands() <= 2);
 839       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 840          expr->operands[i]->accept(this);
 841          op[i] = this->result;
 842       }
 843
 844       switch (expr->operation) {
 845       case ir_unop_logic_not:
 846          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 847          return;
 848
 849       case ir_binop_logic_xor:
 850          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 851          return;
 852
 853       case ir_binop_logic_or:
 854          temp = dst_reg(this, glsl_type::bool_type);
 855          emit(OR(temp, op[0], op[1]));
 856          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 857          return;
 858
 859       case ir_binop_logic_and:
 860          temp = dst_reg(this, glsl_type::bool_type);
 861          emit(AND(temp, op[0], op[1]));
 862          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 863          return;
 864
 865       case ir_unop_f2b:
 866          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 867          return;
 868
 869       case ir_unop_i2b:
 870          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 871          return;
 872
 873       case ir_binop_greater:
 874       case ir_binop_gequal:
 875       case ir_binop_less:
 876       case ir_binop_lequal:
 877       case ir_binop_equal:
 878       case ir_binop_nequal:
 879          emit(IF(op[0], op[1],
 880                  brw_conditional_for_comparison(expr->operation)));
 881          return;
 882
 883       case ir_binop_all_equal:
 884          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 885          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 886          return;
 887
 888       case ir_binop_any_nequal:
 889          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 890          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 891          return;
 892
 893       case ir_unop_any:
 894          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 895          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 896          return;
 897
 898       default:
 899          assert(!"not reached");
 900          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 901          return;
 902       }
 903       return;
 904    }
 905
 906    ir->condition->accept(this);
 907
 908    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 909 }
 910
 911 static dst_reg
 912 with_writemask(dst_reg const & r, int mask)
 913 {
 914    dst_reg result = r;
 915    result.writemask = mask;
 916    return result;
 917 }
 918
 919 void
 920 vec4_vs_visitor::emit_prolog()
 921 {
 922    dst_reg sign_recovery_shift;
 923    dst_reg normalize_factor;
 924    dst_reg es3_normalize_factor;
 925
 926    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
 927       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
 928          uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
 929          dst_reg reg(ATTR, i);
 930          dst_reg reg_d = reg;
 931          reg_d.type = BRW_REGISTER_TYPE_D;
 932          dst_reg reg_ud = reg;
 933          reg_ud.type = BRW_REGISTER_TYPE_UD;
 934
 935          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 936           * come in as floating point conversions of the integer values.
 937           */
 938          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
 939             dst_reg dst = reg;
 940             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 941             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
 942             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 943          }
 944
 945          /* Do sign recovery for 2101010 formats if required. */
 946          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 947             if (sign_recovery_shift.file == BAD_FILE) {
 948                /* shift constant: <22,22,22,30> */
 949                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
 950                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
 951                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
 952             }
 953
 954             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
 955             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
 956          }
 957
 958          /* Apply BGRA swizzle if required. */
 959          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
 960             src_reg temp = src_reg(reg);
 961             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
 962             emit(MOV(reg, temp));
 963          }
 964
 965          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
 966             /* ES 3.0 has different rules for converting signed normalized
 967              * fixed-point numbers than desktop GL.
 968              */
 969             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
 970                /* According to equation 2.2 of the ES 3.0 specification,
 971                 * signed normalization conversion is done by:
 972                 *
 973                 * f = c / (2^(b-1)-1)
 974                 */
 975                if (es3_normalize_factor.file == BAD_FILE) {
 976                   /* mul constant: 1 / (2^(b-1) - 1) */
 977                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
 978                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
 979                            src_reg(1.0f / ((1<<9) - 1))));
 980                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
 981                            src_reg(1.0f / ((1<<1) - 1))));
 982                }
 983
 984                dst_reg dst = reg;
 985                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 986                emit(MOV(dst, src_reg(reg_d)));
 987                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
 988                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
 989             } else {
 990                /* The following equations are from the OpenGL 3.2 specification:
 991                 *
 992                 * 2.1 unsigned normalization
 993                 * f = c/(2^n-1)
 994                 *
 995                 * 2.2 signed normalization
 996                 * f = (2c+1)/(2^n-1)
 997                 *
 998                 * Both of these share a common divisor, which is represented by
 999                 * "normalize_factor" in the code below.
1000                 */
1001                if (normalize_factor.file == BAD_FILE) {
1002                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
1003                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
1004                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1005                            src_reg(1.0f / ((1<<10) - 1))));
1006                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1007                            src_reg(1.0f / ((1<<2) - 1))));
1008                }
1009
1010                dst_reg dst = reg;
1011                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1012                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1013
1014                /* For signed normalization, we want the numerator to be 2c+1. */
1015                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1016                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1017                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1018                }
1019
1020                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1021             }
1022          }
1023
1024          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1025             dst_reg dst = reg;
1026             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1027             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1028          }
1029       }
1030    }
1031 }
1032
1033
1034 dst_reg *
1035 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1036 {
1037    /* VertexID is stored by the VF as the last vertex element, but
1038     * we don't represent it with a flag in inputs_read, so we call
1039     * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1040     */
1041    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1042    vs_prog_data->uses_vertexid = true;
1043
1044    switch (ir->location) {
1045    case SYSTEM_VALUE_VERTEX_ID:
1046       reg->writemask = WRITEMASK_X;
1047       break;
1048    case SYSTEM_VALUE_INSTANCE_ID:
1049       reg->writemask = WRITEMASK_Y;
1050       break;
1051    default:
1052       assert(!"not reached");
1053       break;
1054    }
1055
1056    return reg;
1057 }
1058
1059
1060 void
1061 vec4_visitor::visit(ir_variable *ir)
1062 {
1063    dst_reg *reg = NULL;
1064
1065    if (variable_storage(ir))
1066       return;
1067
1068    switch (ir->mode) {
1069    case ir_var_shader_in:
1070       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1071       break;
1072
1073    case ir_var_shader_out:
1074       reg = new(mem_ctx) dst_reg(this, ir->type);
1075
1076       for (int i = 0; i < type_size(ir->type); i++) {
1077          output_reg[ir->location + i] = *reg;
1078          output_reg[ir->location + i].reg_offset = i;
1079          output_reg[ir->location + i].type =
1080             brw_type_for_base_type(ir->type->get_scalar_type());
1081          output_reg_annotation[ir->location + i] = ir->name;
1082       }
1083       break;
1084
1085    case ir_var_auto:
1086    case ir_var_temporary:
1087       reg = new(mem_ctx) dst_reg(this, ir->type);
1088       break;
1089
1090    case ir_var_uniform:
1091       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1092
1093       /* Thanks to the lower_ubo_reference pass, we will see only
1094        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1095        * variables, so no need for them to be in variable_ht.
1096        */
1097       if (ir->is_in_uniform_block())
1098          return;
1099
1100       /* Track how big the whole uniform variable is, in case we need to put a
1101        * copy of its data into pull constants for array access.
1102        */
1103       this->uniform_size[this->uniforms] = type_size(ir->type);
1104
1105       if (!strncmp(ir->name, "gl_", 3)) {
1106          setup_builtin_uniform_values(ir);
1107       } else {
1108          setup_uniform_values(ir);
1109       }
1110       break;
1111
1112    case ir_var_system_value:
1113       reg = make_reg_for_system_value(ir);
1114       break;
1115
1116    default:
1117       assert(!"not reached");
1118    }
1119
1120    reg->type = brw_type_for_base_type(ir->type);
1121    hash_table_insert(this->variable_ht, reg, ir);
1122 }
1123
1124 void
1125 vec4_visitor::visit(ir_loop *ir)
1126 {
1127    dst_reg counter;
1128
1129    /* We don't want debugging output to print the whole body of the
1130     * loop as the annotation.
1131     */
1132    this->base_ir = NULL;
1133
1134    if (ir->counter != NULL) {
1135       this->base_ir = ir->counter;
1136       ir->counter->accept(this);
1137       counter = *(variable_storage(ir->counter));
1138
1139       if (ir->from != NULL) {
1140          this->base_ir = ir->from;
1141          ir->from->accept(this);
1142
1143          emit(MOV(counter, this->result));
1144       }
1145    }
1146
1147    emit(BRW_OPCODE_DO);
1148
1149    if (ir->to) {
1150       this->base_ir = ir->to;
1151       ir->to->accept(this);
1152
1153       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1154                brw_conditional_for_comparison(ir->cmp)));
1155
1156       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1157       inst->predicate = BRW_PREDICATE_NORMAL;
1158    }
1159
1160    visit_instructions(&ir->body_instructions);
1161
1162
1163    if (ir->increment) {
1164       this->base_ir = ir->increment;
1165       ir->increment->accept(this);
1166       emit(ADD(counter, src_reg(counter), this->result));
1167    }
1168
1169    emit(BRW_OPCODE_WHILE);
1170 }
1171
1172 void
1173 vec4_visitor::visit(ir_loop_jump *ir)
1174 {
1175    switch (ir->mode) {
1176    case ir_loop_jump::jump_break:
1177       emit(BRW_OPCODE_BREAK);
1178       break;
1179    case ir_loop_jump::jump_continue:
1180       emit(BRW_OPCODE_CONTINUE);
1181       break;
1182    }
1183 }
1184
1185
1186 void
1187 vec4_visitor::visit(ir_function_signature *ir)
1188 {
1189    assert(0);
1190    (void)ir;
1191 }
1192
1193 void
1194 vec4_visitor::visit(ir_function *ir)
1195 {
1196    /* Ignore function bodies other than main() -- we shouldn't see calls to
1197     * them since they should all be inlined.
1198     */
1199    if (strcmp(ir->name, "main") == 0) {
1200       const ir_function_signature *sig;
1201       exec_list empty;
1202
1203       sig = ir->matching_signature(&empty);
1204
1205       assert(sig);
1206
1207       visit_instructions(&sig->body);
1208    }
1209 }
1210
1211 bool
1212 vec4_visitor::try_emit_sat(ir_expression *ir)
1213 {
1214    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1215    if (!sat_src)
1216       return false;
1217
1218    sat_src->accept(this);
1219    src_reg src = this->result;
1220
1221    this->result = src_reg(this, ir->type);
1222    vec4_instruction *inst;
1223    inst = emit(MOV(dst_reg(this->result), src));
1224    inst->saturate = true;
1225
1226    return true;
1227 }
1228
1229 bool
1230 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1231 {
1232    /* 3-src instructions were introduced in gen6. */
1233    if (brw->gen < 6)
1234       return false;
1235
1236    /* MAD can only handle floating-point data. */
1237    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1238       return false;
1239
1240    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1241    ir_expression *mul = ir->operands[mul_arg]->as_expression();
1242
1243    if (!mul || mul->operation != ir_binop_mul)
1244       return false;
1245
1246    nonmul->accept(this);
1247    src_reg src0 = fix_3src_operand(this->result);
1248
1249    mul->operands[0]->accept(this);
1250    src_reg src1 = fix_3src_operand(this->result);
1251
1252    mul->operands[1]->accept(this);
1253    src_reg src2 = fix_3src_operand(this->result);
1254
1255    this->result = src_reg(this, ir->type);
1256    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1257
1258    return true;
1259 }
1260
1261 void
1262 vec4_visitor::emit_bool_comparison(unsigned int op,
1263                                  dst_reg dst, src_reg src0, src_reg src1)
1264 {
1265    /* original gen4 does destination conversion before comparison. */
1266    if (brw->gen < 5)
1267       dst.type = src0.type;
1268
1269    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1270
1271    dst.type = BRW_REGISTER_TYPE_D;
1272    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1273 }
1274
1275 void
1276 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1277                           src_reg src0, src_reg src1)
1278 {
1279    vec4_instruction *inst;
1280
1281    if (brw->gen >= 6) {
1282       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1283       inst->conditional_mod = conditionalmod;
1284    } else {
1285       emit(CMP(dst, src0, src1, conditionalmod));
1286
1287       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1288       inst->predicate = BRW_PREDICATE_NORMAL;
1289    }
1290 }
1291
1292 static bool
1293 is_16bit_constant(ir_rvalue *rvalue)
1294 {
1295    ir_constant *constant = rvalue->as_constant();
1296    if (!constant)
1297       return false;
1298
1299    if (constant->type != glsl_type::int_type &&
1300        constant->type != glsl_type::uint_type)
1301       return false;
1302
1303    return constant->value.u[0] < (1 << 16);
1304 }
1305
1306 void
1307 vec4_visitor::visit(ir_expression *ir)
1308 {
1309    unsigned int operand;
1310    src_reg op[Elements(ir->operands)];
1311    src_reg result_src;
1312    dst_reg result_dst;
1313    vec4_instruction *inst;
1314
1315    if (try_emit_sat(ir))
1316       return;
1317
1318    if (ir->operation == ir_binop_add) {
1319       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1320          return;
1321    }
1322
1323    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1324       this->result.file = BAD_FILE;
1325       ir->operands[operand]->accept(this);
1326       if (this->result.file == BAD_FILE) {
1327          printf("Failed to get tree for expression operand:\n");
1328          ir->operands[operand]->print();
1329          exit(1);
1330       }
1331       op[operand] = this->result;
1332
1333       /* Matrix expression operands should have been broken down to vector
1334        * operations already.
1335        */
1336       assert(!ir->operands[operand]->type->is_matrix());
1337    }
1338
1339    int vector_elements = ir->operands[0]->type->vector_elements;
1340    if (ir->operands[1]) {
1341       vector_elements = MAX2(vector_elements,
1342                              ir->operands[1]->type->vector_elements);
1343    }
1344
1345    this->result.file = BAD_FILE;
1346
1347    /* Storage for our result.  Ideally for an assignment we'd be using
1348     * the actual storage for the result here, instead.
1349     */
1350    result_src = src_reg(this, ir->type);
1351    /* convenience for the emit functions below. */
1352    result_dst = dst_reg(result_src);
1353    /* If nothing special happens, this is the result. */
1354    this->result = result_src;
1355    /* Limit writes to the channels that will be used by result_src later.
1356     * This does limit this temp's use as a temporary for multi-instruction
1357     * sequences.
1358     */
1359    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1360
1361    switch (ir->operation) {
1362    case ir_unop_logic_not:
1363       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1364        * ones complement of the whole register, not just bit 0.
1365        */
1366       emit(XOR(result_dst, op[0], src_reg(1)));
1367       break;
1368    case ir_unop_neg:
1369       op[0].negate = !op[0].negate;
1370       emit(MOV(result_dst, op[0]));
1371       break;
1372    case ir_unop_abs:
1373       op[0].abs = true;
1374       op[0].negate = false;
1375       emit(MOV(result_dst, op[0]));
1376       break;
1377
1378    case ir_unop_sign:
1379       emit(MOV(result_dst, src_reg(0.0f)));
1380
1381       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1382       inst = emit(MOV(result_dst, src_reg(1.0f)));
1383       inst->predicate = BRW_PREDICATE_NORMAL;
1384
1385       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1386       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1387       inst->predicate = BRW_PREDICATE_NORMAL;
1388
1389       break;
1390
1391    case ir_unop_rcp:
1392       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1393       break;
1394
1395    case ir_unop_exp2:
1396       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1397       break;
1398    case ir_unop_log2:
1399       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1400       break;
1401    case ir_unop_exp:
1402    case ir_unop_log:
1403       assert(!"not reached: should be handled by ir_explog_to_explog2");
1404       break;
1405    case ir_unop_sin:
1406    case ir_unop_sin_reduced:
1407       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1408       break;
1409    case ir_unop_cos:
1410    case ir_unop_cos_reduced:
1411       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1412       break;
1413
1414    case ir_unop_dFdx:
1415    case ir_unop_dFdy:
1416       assert(!"derivatives not valid in vertex shader");
1417       break;
1418
1419    case ir_unop_bitfield_reverse:
1420       emit(BFREV(result_dst, op[0]));
1421       break;
1422    case ir_unop_bit_count:
1423       emit(CBIT(result_dst, op[0]));
1424       break;
1425    case ir_unop_find_msb: {
1426       src_reg temp = src_reg(this, glsl_type::uint_type);
1427
1428       inst = emit(FBH(dst_reg(temp), op[0]));
1429       inst->dst.writemask = WRITEMASK_XYZW;
1430
1431       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1432        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1433        * subtract the result from 31 to convert the MSB count into an LSB count.
1434        */
1435
1436       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1437       temp.swizzle = BRW_SWIZZLE_NOOP;
1438       emit(MOV(result_dst, temp));
1439
1440       src_reg src_tmp = src_reg(result_dst);
1441       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1442
1443       src_tmp.negate = true;
1444       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1445       inst->predicate = BRW_PREDICATE_NORMAL;
1446       break;
1447    }
1448    case ir_unop_find_lsb:
1449       emit(FBL(result_dst, op[0]));
1450       break;
1451
1452    case ir_unop_noise:
1453       assert(!"not reached: should be handled by lower_noise");
1454       break;
1455
1456    case ir_binop_add:
1457       emit(ADD(result_dst, op[0], op[1]));
1458       break;
1459    case ir_binop_sub:
1460       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1461       break;
1462
1463    case ir_binop_mul:
1464       if (ir->type->is_integer()) {
1465          /* For integer multiplication, the MUL uses the low 16 bits of one of
1466           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1467           * accumulates in the contribution of the upper 16 bits of that
1468           * operand.  If we can determine that one of the args is in the low
1469           * 16 bits, though, we can just emit a single MUL.
1470           */
1471          if (is_16bit_constant(ir->operands[0])) {
1472             if (brw->gen < 7)
1473                emit(MUL(result_dst, op[0], op[1]));
1474             else
1475                emit(MUL(result_dst, op[1], op[0]));
1476          } else if (is_16bit_constant(ir->operands[1])) {
1477             if (brw->gen < 7)
1478                emit(MUL(result_dst, op[1], op[0]));
1479             else
1480                emit(MUL(result_dst, op[0], op[1]));
1481          } else {
1482             struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1483
1484             emit(MUL(acc, op[0], op[1]));
1485             emit(MACH(dst_null_d(), op[0], op[1]));
1486             emit(MOV(result_dst, src_reg(acc)));
1487          }
1488       } else {
1489          emit(MUL(result_dst, op[0], op[1]));
1490       }
1491       break;
1492    case ir_binop_div:
1493       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1494       assert(ir->type->is_integer());
1495       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1496       break;
1497    case ir_binop_mod:
1498       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1499       assert(ir->type->is_integer());
1500       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1501       break;
1502
1503    case ir_binop_less:
1504    case ir_binop_greater:
1505    case ir_binop_lequal:
1506    case ir_binop_gequal:
1507    case ir_binop_equal:
1508    case ir_binop_nequal: {
1509       emit(CMP(result_dst, op[0], op[1],
1510                brw_conditional_for_comparison(ir->operation)));
1511       emit(AND(result_dst, result_src, src_reg(0x1)));
1512       break;
1513    }
1514
1515    case ir_binop_all_equal:
1516       /* "==" operator producing a scalar boolean. */
1517       if (ir->operands[0]->type->is_vector() ||
1518           ir->operands[1]->type->is_vector()) {
1519          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1520          emit(MOV(result_dst, src_reg(0)));
1521          inst = emit(MOV(result_dst, src_reg(1)));
1522          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1523       } else {
1524          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1525          emit(AND(result_dst, result_src, src_reg(0x1)));
1526       }
1527       break;
1528    case ir_binop_any_nequal:
1529       /* "!=" operator producing a scalar boolean. */
1530       if (ir->operands[0]->type->is_vector() ||
1531           ir->operands[1]->type->is_vector()) {
1532          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1533
1534          emit(MOV(result_dst, src_reg(0)));
1535          inst = emit(MOV(result_dst, src_reg(1)));
1536          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1537       } else {
1538          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1539          emit(AND(result_dst, result_src, src_reg(0x1)));
1540       }
1541       break;
1542
1543    case ir_unop_any:
1544       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1545       emit(MOV(result_dst, src_reg(0)));
1546
1547       inst = emit(MOV(result_dst, src_reg(1)));
1548       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1549       break;
1550
1551    case ir_binop_logic_xor:
1552       emit(XOR(result_dst, op[0], op[1]));
1553       break;
1554
1555    case ir_binop_logic_or:
1556       emit(OR(result_dst, op[0], op[1]));
1557       break;
1558
1559    case ir_binop_logic_and:
1560       emit(AND(result_dst, op[0], op[1]));
1561       break;
1562
1563    case ir_binop_dot:
1564       assert(ir->operands[0]->type->is_vector());
1565       assert(ir->operands[0]->type == ir->operands[1]->type);
1566       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1567       break;
1568
1569    case ir_unop_sqrt:
1570       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1571       break;
1572    case ir_unop_rsq:
1573       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1574       break;
1575
1576    case ir_unop_bitcast_i2f:
1577    case ir_unop_bitcast_u2f:
1578       this->result = op[0];
1579       this->result.type = BRW_REGISTER_TYPE_F;
1580       break;
1581
1582    case ir_unop_bitcast_f2i:
1583       this->result = op[0];
1584       this->result.type = BRW_REGISTER_TYPE_D;
1585       break;
1586
1587    case ir_unop_bitcast_f2u:
1588       this->result = op[0];
1589       this->result.type = BRW_REGISTER_TYPE_UD;
1590       break;
1591
1592    case ir_unop_i2f:
1593    case ir_unop_i2u:
1594    case ir_unop_u2i:
1595    case ir_unop_u2f:
1596    case ir_unop_b2f:
1597    case ir_unop_b2i:
1598    case ir_unop_f2i:
1599    case ir_unop_f2u:
1600       emit(MOV(result_dst, op[0]));
1601       break;
1602    case ir_unop_f2b:
1603    case ir_unop_i2b: {
1604       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1605       emit(AND(result_dst, result_src, src_reg(1)));
1606       break;
1607    }
1608
1609    case ir_unop_trunc:
1610       emit(RNDZ(result_dst, op[0]));
1611       break;
1612    case ir_unop_ceil:
1613       op[0].negate = !op[0].negate;
1614       inst = emit(RNDD(result_dst, op[0]));
1615       this->result.negate = true;
1616       break;
1617    case ir_unop_floor:
1618       inst = emit(RNDD(result_dst, op[0]));
1619       break;
1620    case ir_unop_fract:
1621       inst = emit(FRC(result_dst, op[0]));
1622       break;
1623    case ir_unop_round_even:
1624       emit(RNDE(result_dst, op[0]));
1625       break;
1626
1627    case ir_binop_min:
1628       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1629       break;
1630    case ir_binop_max:
1631       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1632       break;
1633
1634    case ir_binop_pow:
1635       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1636       break;
1637
1638    case ir_unop_bit_not:
1639       inst = emit(NOT(result_dst, op[0]));
1640       break;
1641    case ir_binop_bit_and:
1642       inst = emit(AND(result_dst, op[0], op[1]));
1643       break;
1644    case ir_binop_bit_xor:
1645       inst = emit(XOR(result_dst, op[0], op[1]));
1646       break;
1647    case ir_binop_bit_or:
1648       inst = emit(OR(result_dst, op[0], op[1]));
1649       break;
1650
1651    case ir_binop_lshift:
1652       inst = emit(SHL(result_dst, op[0], op[1]));
1653       break;
1654
1655    case ir_binop_rshift:
1656       if (ir->type->base_type == GLSL_TYPE_INT)
1657          inst = emit(ASR(result_dst, op[0], op[1]));
1658       else
1659          inst = emit(SHR(result_dst, op[0], op[1]));
1660       break;
1661
1662    case ir_binop_bfm:
1663       emit(BFI1(result_dst, op[0], op[1]));
1664       break;
1665
1666    case ir_binop_ubo_load: {
1667       ir_constant *uniform_block = ir->operands[0]->as_constant();
1668       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1669       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1670       src_reg offset = op[1];
1671
1672       /* Now, load the vector from that offset. */
1673       assert(ir->type->is_vector() || ir->type->is_scalar());
1674
1675       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1676       packed_consts.type = result.type;
1677       src_reg surf_index =
1678          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1679       if (const_offset_ir) {
1680          offset = src_reg(const_offset / 16);
1681       } else {
1682          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1683       }
1684
1685       vec4_instruction *pull =
1686          emit(new(mem_ctx) vec4_instruction(this,
1687                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1688                                             dst_reg(packed_consts),
1689                                             surf_index,
1690                                             offset));
1691       pull->base_mrf = 14;
1692       pull->mlen = 1;
1693
1694       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1695       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1696                                             const_offset % 16 / 4,
1697                                             const_offset % 16 / 4,
1698                                             const_offset % 16 / 4);
1699
1700       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1701       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1702          emit(CMP(result_dst, packed_consts, src_reg(0u),
1703                   BRW_CONDITIONAL_NZ));
1704          emit(AND(result_dst, result, src_reg(0x1)));
1705       } else {
1706          emit(MOV(result_dst, packed_consts));
1707       }
1708       break;
1709    }
1710
1711    case ir_binop_vector_extract:
1712       assert(!"should have been lowered by vec_index_to_cond_assign");
1713       break;
1714
1715    case ir_triop_fma:
1716       op[0] = fix_3src_operand(op[0]);
1717       op[1] = fix_3src_operand(op[1]);
1718       op[2] = fix_3src_operand(op[2]);
1719       /* Note that the instruction's argument order is reversed from GLSL
1720        * and the IR.
1721        */
1722       emit(MAD(result_dst, op[2], op[1], op[0]));
1723       break;
1724
1725    case ir_triop_lrp:
1726       op[0] = fix_3src_operand(op[0]);
1727       op[1] = fix_3src_operand(op[1]);
1728       op[2] = fix_3src_operand(op[2]);
1729       /* Note that the instruction's argument order is reversed from GLSL
1730        * and the IR.
1731        */
1732       emit(LRP(result_dst, op[2], op[1], op[0]));
1733       break;
1734
1735    case ir_triop_bfi:
1736       op[0] = fix_3src_operand(op[0]);
1737       op[1] = fix_3src_operand(op[1]);
1738       op[2] = fix_3src_operand(op[2]);
1739       emit(BFI2(result_dst, op[0], op[1], op[2]));
1740       break;
1741
1742    case ir_triop_bitfield_extract:
1743       op[0] = fix_3src_operand(op[0]);
1744       op[1] = fix_3src_operand(op[1]);
1745       op[2] = fix_3src_operand(op[2]);
1746       /* Note that the instruction's argument order is reversed from GLSL
1747        * and the IR.
1748        */
1749       emit(BFE(result_dst, op[2], op[1], op[0]));
1750       break;
1751
1752    case ir_triop_vector_insert:
1753       assert(!"should have been lowered by lower_vector_insert");
1754       break;
1755
1756    case ir_quadop_bitfield_insert:
1757       assert(!"not reached: should be handled by "
1758               "bitfield_insert_to_bfm_bfi\n");
1759       break;
1760
1761    case ir_quadop_vector:
1762       assert(!"not reached: should be handled by lower_quadop_vector");
1763       break;
1764
1765    case ir_unop_pack_half_2x16:
1766       emit_pack_half_2x16(result_dst, op[0]);
1767       break;
1768    case ir_unop_unpack_half_2x16:
1769       emit_unpack_half_2x16(result_dst, op[0]);
1770       break;
1771    case ir_unop_pack_snorm_2x16:
1772    case ir_unop_pack_snorm_4x8:
1773    case ir_unop_pack_unorm_2x16:
1774    case ir_unop_pack_unorm_4x8:
1775    case ir_unop_unpack_snorm_2x16:
1776    case ir_unop_unpack_snorm_4x8:
1777    case ir_unop_unpack_unorm_2x16:
1778    case ir_unop_unpack_unorm_4x8:
1779       assert(!"not reached: should be handled by lower_packing_builtins");
1780       break;
1781    case ir_unop_unpack_half_2x16_split_x:
1782    case ir_unop_unpack_half_2x16_split_y:
1783    case ir_binop_pack_half_2x16_split:
1784       assert(!"not reached: should not occur in vertex shader");
1785       break;
1786    }
1787 }
1788
1789
1790 void
1791 vec4_visitor::visit(ir_swizzle *ir)
1792 {
1793    src_reg src;
1794    int i = 0;
1795    int swizzle[4];
1796
1797    /* Note that this is only swizzles in expressions, not those on the left
1798     * hand side of an assignment, which do write masking.  See ir_assignment
1799     * for that.
1800     */
1801
1802    ir->val->accept(this);
1803    src = this->result;
1804    assert(src.file != BAD_FILE);
1805
1806    for (i = 0; i < ir->type->vector_elements; i++) {
1807       switch (i) {
1808       case 0:
1809          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1810          break;
1811       case 1:
1812          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1813          break;
1814       case 2:
1815          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1816          break;
1817       case 3:
1818          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1819             break;
1820       }
1821    }
1822    for (; i < 4; i++) {
1823       /* Replicate the last channel out. */
1824       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1825    }
1826
1827    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1828
1829    this->result = src;
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_dereference_variable *ir)
1834 {
1835    const struct glsl_type *type = ir->type;
1836    dst_reg *reg = variable_storage(ir->var);
1837
1838    if (!reg) {
1839       fail("Failed to find variable storage for %s\n", ir->var->name);
1840       this->result = src_reg(brw_null_reg());
1841       return;
1842    }
1843
1844    this->result = src_reg(*reg);
1845
1846    /* System values get their swizzle from the dst_reg writemask */
1847    if (ir->var->mode == ir_var_system_value)
1848       return;
1849
1850    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1851       this->result.swizzle = swizzle_for_size(type->vector_elements);
1852 }
1853
1854
1855 int
1856 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1857 {
1858    /* Under normal circumstances array elements are stored consecutively, so
1859     * the stride is equal to the size of the array element.
1860     */
1861    return type_size(ir->type);
1862 }
1863
1864
1865 void
1866 vec4_visitor::visit(ir_dereference_array *ir)
1867 {
1868    ir_constant *constant_index;
1869    src_reg src;
1870    int array_stride = compute_array_stride(ir);
1871
1872    constant_index = ir->array_index->constant_expression_value();
1873
1874    ir->array->accept(this);
1875    src = this->result;
1876
1877    if (constant_index) {
1878       src.reg_offset += constant_index->value.i[0] * array_stride;
1879    } else {
1880       /* Variable index array dereference.  It eats the "vec4" of the
1881        * base of the array and an index that offsets the Mesa register
1882        * index.
1883        */
1884       ir->array_index->accept(this);
1885
1886       src_reg index_reg;
1887
1888       if (array_stride == 1) {
1889          index_reg = this->result;
1890       } else {
1891          index_reg = src_reg(this, glsl_type::int_type);
1892
1893          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1894       }
1895
1896       if (src.reladdr) {
1897          src_reg temp = src_reg(this, glsl_type::int_type);
1898
1899          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1900
1901          index_reg = temp;
1902       }
1903
1904       src.reladdr = ralloc(mem_ctx, src_reg);
1905       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1906    }
1907
1908    /* If the type is smaller than a vec4, replicate the last channel out. */
1909    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1910       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1911    else
1912       src.swizzle = BRW_SWIZZLE_NOOP;
1913    src.type = brw_type_for_base_type(ir->type);
1914
1915    this->result = src;
1916 }
1917
1918 void
1919 vec4_visitor::visit(ir_dereference_record *ir)
1920 {
1921    unsigned int i;
1922    const glsl_type *struct_type = ir->record->type;
1923    int offset = 0;
1924
1925    ir->record->accept(this);
1926
1927    for (i = 0; i < struct_type->length; i++) {
1928       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1929          break;
1930       offset += type_size(struct_type->fields.structure[i].type);
1931    }
1932
1933    /* If the type is smaller than a vec4, replicate the last channel out. */
1934    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1935       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1936    else
1937       this->result.swizzle = BRW_SWIZZLE_NOOP;
1938    this->result.type = brw_type_for_base_type(ir->type);
1939
1940    this->result.reg_offset += offset;
1941 }
1942
1943 /**
1944  * We want to be careful in assignment setup to hit the actual storage
1945  * instead of potentially using a temporary like we might with the
1946  * ir_dereference handler.
1947  */
1948 static dst_reg
1949 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1950 {
1951    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1952     * access of a vector, it must be separated into a series conditional moves
1953     * before reaching this point (see ir_vec_index_to_cond_assign).
1954     */
1955    assert(ir->as_dereference());
1956    ir_dereference_array *deref_array = ir->as_dereference_array();
1957    if (deref_array) {
1958       assert(!deref_array->array->type->is_vector());
1959    }
1960
1961    /* Use the rvalue deref handler for the most part.  We'll ignore
1962     * swizzles in it and write swizzles using writemask, though.
1963     */
1964    ir->accept(v);
1965    return dst_reg(v->result);
1966 }
1967
1968 void
1969 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1970                               const struct glsl_type *type, uint32_t predicate)
1971 {
1972    if (type->base_type == GLSL_TYPE_STRUCT) {
1973       for (unsigned int i = 0; i < type->length; i++) {
1974          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1975       }
1976       return;
1977    }
1978
1979    if (type->is_array()) {
1980       for (unsigned int i = 0; i < type->length; i++) {
1981          emit_block_move(dst, src, type->fields.array, predicate);
1982       }
1983       return;
1984    }
1985
1986    if (type->is_matrix()) {
1987       const struct glsl_type *vec_type;
1988
1989       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1990                                          type->vector_elements, 1);
1991
1992       for (int i = 0; i < type->matrix_columns; i++) {
1993          emit_block_move(dst, src, vec_type, predicate);
1994       }
1995       return;
1996    }
1997
1998    assert(type->is_scalar() || type->is_vector());
1999
2000    dst->type = brw_type_for_base_type(type);
2001    src->type = dst->type;
2002
2003    dst->writemask = (1 << type->vector_elements) - 1;
2004
2005    src->swizzle = swizzle_for_size(type->vector_elements);
2006
2007    vec4_instruction *inst = emit(MOV(*dst, *src));
2008    inst->predicate = predicate;
2009
2010    dst->reg_offset++;
2011    src->reg_offset++;
2012 }
2013
2014
2015 /* If the RHS processing resulted in an instruction generating a
2016  * temporary value, and it would be easy to rewrite the instruction to
2017  * generate its result right into the LHS instead, do so.  This ends
2018  * up reliably removing instructions where it can be tricky to do so
2019  * later without real UD chain information.
2020  */
2021 bool
2022 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2023                                      dst_reg dst,
2024                                      src_reg src,
2025                                      vec4_instruction *pre_rhs_inst,
2026                                      vec4_instruction *last_rhs_inst)
2027 {
2028    /* This could be supported, but it would take more smarts. */
2029    if (ir->condition)
2030       return false;
2031
2032    if (pre_rhs_inst == last_rhs_inst)
2033       return false; /* No instructions generated to work with. */
2034
2035    /* Make sure the last instruction generated our source reg. */
2036    if (src.file != GRF ||
2037        src.file != last_rhs_inst->dst.file ||
2038        src.reg != last_rhs_inst->dst.reg ||
2039        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2040        src.reladdr ||
2041        src.abs ||
2042        src.negate ||
2043        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2044       return false;
2045
2046    /* Check that that last instruction fully initialized the channels
2047     * we want to use, in the order we want to use them.  We could
2048     * potentially reswizzle the operands of many instructions so that
2049     * we could handle out of order channels, but don't yet.
2050     */
2051
2052    for (unsigned i = 0; i < 4; i++) {
2053       if (dst.writemask & (1 << i)) {
2054          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2055             return false;
2056
2057          if (BRW_GET_SWZ(src.swizzle, i) != i)
2058             return false;
2059       }
2060    }
2061
2062    /* Success!  Rewrite the instruction. */
2063    last_rhs_inst->dst.file = dst.file;
2064    last_rhs_inst->dst.reg = dst.reg;
2065    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2066    last_rhs_inst->dst.reladdr = dst.reladdr;
2067    last_rhs_inst->dst.writemask &= dst.writemask;
2068
2069    return true;
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_assignment *ir)
2074 {
2075    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2076    uint32_t predicate = BRW_PREDICATE_NONE;
2077
2078    if (!ir->lhs->type->is_scalar() &&
2079        !ir->lhs->type->is_vector()) {
2080       ir->rhs->accept(this);
2081       src_reg src = this->result;
2082
2083       if (ir->condition) {
2084          emit_bool_to_cond_code(ir->condition, &predicate);
2085       }
2086
2087       /* emit_block_move doesn't account for swizzles in the source register.
2088        * This should be ok, since the source register is a structure or an
2089        * array, and those can't be swizzled.  But double-check to be sure.
2090        */
2091       assert(src.swizzle ==
2092              (ir->rhs->type->is_matrix()
2093               ? swizzle_for_size(ir->rhs->type->vector_elements)
2094               : BRW_SWIZZLE_NOOP));
2095
2096       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2097       return;
2098    }
2099
2100    /* Now we're down to just a scalar/vector with writemasks. */
2101    int i;
2102
2103    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2104    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2105
2106    ir->rhs->accept(this);
2107
2108    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2109
2110    src_reg src = this->result;
2111
2112    int swizzles[4];
2113    int first_enabled_chan = 0;
2114    int src_chan = 0;
2115
2116    assert(ir->lhs->type->is_vector() ||
2117           ir->lhs->type->is_scalar());
2118    dst.writemask = ir->write_mask;
2119
2120    for (int i = 0; i < 4; i++) {
2121       if (dst.writemask & (1 << i)) {
2122          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2123          break;
2124       }
2125    }
2126
2127    /* Swizzle a small RHS vector into the channels being written.
2128     *
2129     * glsl ir treats write_mask as dictating how many channels are
2130     * present on the RHS while in our instructions we need to make
2131     * those channels appear in the slots of the vec4 they're written to.
2132     */
2133    for (int i = 0; i < 4; i++) {
2134       if (dst.writemask & (1 << i))
2135          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2136       else
2137          swizzles[i] = first_enabled_chan;
2138    }
2139    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2140                               swizzles[2], swizzles[3]);
2141
2142    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2143       return;
2144    }
2145
2146    if (ir->condition) {
2147       emit_bool_to_cond_code(ir->condition, &predicate);
2148    }
2149
2150    for (i = 0; i < type_size(ir->lhs->type); i++) {
2151       vec4_instruction *inst = emit(MOV(dst, src));
2152       inst->predicate = predicate;
2153
2154       dst.reg_offset++;
2155       src.reg_offset++;
2156    }
2157 }
2158
2159 void
2160 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2161 {
2162    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2163       foreach_list(node, &ir->components) {
2164          ir_constant *field_value = (ir_constant *)node;
2165
2166          emit_constant_values(dst, field_value);
2167       }
2168       return;
2169    }
2170
2171    if (ir->type->is_array()) {
2172       for (unsigned int i = 0; i < ir->type->length; i++) {
2173          emit_constant_values(dst, ir->array_elements[i]);
2174       }
2175       return;
2176    }
2177
2178    if (ir->type->is_matrix()) {
2179       for (int i = 0; i < ir->type->matrix_columns; i++) {
2180          float *vec = &ir->value.f[i * ir->type->vector_elements];
2181
2182          for (int j = 0; j < ir->type->vector_elements; j++) {
2183             dst->writemask = 1 << j;
2184             dst->type = BRW_REGISTER_TYPE_F;
2185
2186             emit(MOV(*dst, src_reg(vec[j])));
2187          }
2188          dst->reg_offset++;
2189       }
2190       return;
2191    }
2192
2193    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2194
2195    for (int i = 0; i < ir->type->vector_elements; i++) {
2196       if (!(remaining_writemask & (1 << i)))
2197          continue;
2198
2199       dst->writemask = 1 << i;
2200       dst->type = brw_type_for_base_type(ir->type);
2201
2202       /* Find other components that match the one we're about to
2203        * write.  Emits fewer instructions for things like vec4(0.5,
2204        * 1.5, 1.5, 1.5).
2205        */
2206       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2207          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2208             if (ir->value.b[i] == ir->value.b[j])
2209                dst->writemask |= (1 << j);
2210          } else {
2211             /* u, i, and f storage all line up, so no need for a
2212              * switch case for comparing each type.
2213              */
2214             if (ir->value.u[i] == ir->value.u[j])
2215                dst->writemask |= (1 << j);
2216          }
2217       }
2218
2219       switch (ir->type->base_type) {
2220       case GLSL_TYPE_FLOAT:
2221          emit(MOV(*dst, src_reg(ir->value.f[i])));
2222          break;
2223       case GLSL_TYPE_INT:
2224          emit(MOV(*dst, src_reg(ir->value.i[i])));
2225          break;
2226       case GLSL_TYPE_UINT:
2227          emit(MOV(*dst, src_reg(ir->value.u[i])));
2228          break;
2229       case GLSL_TYPE_BOOL:
2230          emit(MOV(*dst, src_reg(ir->value.b[i])));
2231          break;
2232       default:
2233          assert(!"Non-float/uint/int/bool constant");
2234          break;
2235       }
2236
2237       remaining_writemask &= ~dst->writemask;
2238    }
2239    dst->reg_offset++;
2240 }
2241
2242 void
2243 vec4_visitor::visit(ir_constant *ir)
2244 {
2245    dst_reg dst = dst_reg(this, ir->type);
2246    this->result = src_reg(dst);
2247
2248    emit_constant_values(&dst, ir);
2249 }
2250
2251 void
2252 vec4_visitor::visit(ir_call *ir)
2253 {
2254    assert(!"not reached");
2255 }
2256
2257 void
2258 vec4_visitor::visit(ir_texture *ir)
2259 {
2260    int sampler =
2261       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2262
2263    /* Should be lowered by do_lower_texture_projection */
2264    assert(!ir->projector);
2265
2266    /* Generate code to compute all the subexpression trees.  This has to be
2267     * done before loading any values into MRFs for the sampler message since
2268     * generating these values may involve SEND messages that need the MRFs.
2269     */
2270    src_reg coordinate;
2271    if (ir->coordinate) {
2272       ir->coordinate->accept(this);
2273       coordinate = this->result;
2274    }
2275
2276    src_reg shadow_comparitor;
2277    if (ir->shadow_comparitor) {
2278       ir->shadow_comparitor->accept(this);
2279       shadow_comparitor = this->result;
2280    }
2281
2282    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2283    src_reg lod, dPdx, dPdy, sample_index;
2284    switch (ir->op) {
2285    case ir_tex:
2286       lod = src_reg(0.0f);
2287       lod_type = glsl_type::float_type;
2288       break;
2289    case ir_txf:
2290    case ir_txl:
2291    case ir_txs:
2292       ir->lod_info.lod->accept(this);
2293       lod = this->result;
2294       lod_type = ir->lod_info.lod->type;
2295       break;
2296    case ir_txf_ms:
2297       ir->lod_info.sample_index->accept(this);
2298       sample_index = this->result;
2299       sample_index_type = ir->lod_info.sample_index->type;
2300       break;
2301    case ir_txd:
2302       ir->lod_info.grad.dPdx->accept(this);
2303       dPdx = this->result;
2304
2305       ir->lod_info.grad.dPdy->accept(this);
2306       dPdy = this->result;
2307
2308       lod_type = ir->lod_info.grad.dPdx->type;
2309       break;
2310    case ir_txb:
2311    case ir_lod:
2312       break;
2313    }
2314
2315    vec4_instruction *inst = NULL;
2316    switch (ir->op) {
2317    case ir_tex:
2318    case ir_txl:
2319       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2320       break;
2321    case ir_txd:
2322       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2323       break;
2324    case ir_txf:
2325       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2326       break;
2327    case ir_txf_ms:
2328       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2329       break;
2330    case ir_txs:
2331       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2332       break;
2333    case ir_txb:
2334       assert(!"TXB is not valid for vertex shaders.");
2335       break;
2336    case ir_lod:
2337       assert(!"LOD is not valid for vertex shaders.");
2338       break;
2339    }
2340
2341    bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2342
2343    /* Texel offsets go in the message header; Gen4 also requires headers. */
2344    inst->header_present = use_texture_offset || brw->gen < 5;
2345    inst->base_mrf = 2;
2346    inst->mlen = inst->header_present + 1; /* always at least one */
2347    inst->sampler = sampler;
2348    inst->dst = dst_reg(this, ir->type);
2349    inst->dst.writemask = WRITEMASK_XYZW;
2350    inst->shadow_compare = ir->shadow_comparitor != NULL;
2351
2352    if (use_texture_offset)
2353       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2354
2355    /* MRF for the first parameter */
2356    int param_base = inst->base_mrf + inst->header_present;
2357
2358    if (ir->op == ir_txs) {
2359       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2360       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2361    } else {
2362       int i, coord_mask = 0, zero_mask = 0;
2363       /* Load the coordinate */
2364       /* FINISHME: gl_clamp_mask and saturate */
2365       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2366          coord_mask |= (1 << i);
2367       for (; i < 4; i++)
2368          zero_mask |= (1 << i);
2369
2370       if (ir->offset && ir->op == ir_txf) {
2371          /* It appears that the ld instruction used for txf does its
2372           * address bounds check before adding in the offset.  To work
2373           * around this, just add the integer offset to the integer
2374           * texel coordinate, and don't put the offset in the header.
2375           */
2376          ir_constant *offset = ir->offset->as_constant();
2377          assert(offset);
2378
2379          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2380             src_reg src = coordinate;
2381             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2382                                        BRW_GET_SWZ(src.swizzle, j),
2383                                        BRW_GET_SWZ(src.swizzle, j),
2384                                        BRW_GET_SWZ(src.swizzle, j));
2385             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2386                      src, offset->value.i[j]));
2387          }
2388       } else {
2389          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2390                   coordinate));
2391       }
2392       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2393                src_reg(0)));
2394       /* Load the shadow comparitor */
2395       if (ir->shadow_comparitor && ir->op != ir_txd) {
2396          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2397                           WRITEMASK_X),
2398                   shadow_comparitor));
2399          inst->mlen++;
2400       }
2401
2402       /* Load the LOD info */
2403       if (ir->op == ir_tex || ir->op == ir_txl) {
2404          int mrf, writemask;
2405          if (brw->gen >= 5) {
2406             mrf = param_base + 1;
2407             if (ir->shadow_comparitor) {
2408                writemask = WRITEMASK_Y;
2409                /* mlen already incremented */
2410             } else {
2411                writemask = WRITEMASK_X;
2412                inst->mlen++;
2413             }
2414          } else /* brw->gen == 4 */ {
2415             mrf = param_base;
2416             writemask = WRITEMASK_W;
2417          }
2418          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2419       } else if (ir->op == ir_txf) {
2420          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2421       } else if (ir->op == ir_txf_ms) {
2422          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2423                   sample_index));
2424          inst->mlen++;
2425
2426          /* on Gen7, there is an additional MCS parameter here after SI,
2427           * but we don't bother to emit it since it's always zero. If
2428           * we start supporting texturing from CMS surfaces, this will have
2429           * to change
2430           */
2431       } else if (ir->op == ir_txd) {
2432          const glsl_type *type = lod_type;
2433
2434          if (brw->gen >= 5) {
2435             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2436             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2437             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2438             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2439             inst->mlen++;
2440
2441             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2442                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2443                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2444                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2445                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2446                inst->mlen++;
2447
2448                if (ir->shadow_comparitor) {
2449                   emit(MOV(dst_reg(MRF, param_base + 2,
2450                                    ir->shadow_comparitor->type, WRITEMASK_Z),
2451                            shadow_comparitor));
2452                }
2453             }
2454          } else /* brw->gen == 4 */ {
2455             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2456             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2457             inst->mlen += 2;
2458          }
2459       }
2460    }
2461
2462    emit(inst);
2463
2464    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2465     * spec requires layers.
2466     */
2467    if (ir->op == ir_txs) {
2468       glsl_type const *type = ir->sampler->type;
2469       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2470           type->sampler_array) {
2471          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2472                    with_writemask(inst->dst, WRITEMASK_Z),
2473                    src_reg(inst->dst), src_reg(6));
2474       }
2475    }
2476
2477    swizzle_result(ir, src_reg(inst->dst), sampler);
2478 }
2479
2480 void
2481 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2482 {
2483    int s = key->tex.swizzles[sampler];
2484
2485    this->result = src_reg(this, ir->type);
2486    dst_reg swizzled_result(this->result);
2487
2488    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2489                         || s == SWIZZLE_NOOP) {
2490       emit(MOV(swizzled_result, orig_val));
2491       return;
2492    }
2493
2494    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2495    int swizzle[4] = {0};
2496
2497    for (int i = 0; i < 4; i++) {
2498       switch (GET_SWZ(s, i)) {
2499       case SWIZZLE_ZERO:
2500          zero_mask |= (1 << i);
2501          break;
2502       case SWIZZLE_ONE:
2503          one_mask |= (1 << i);
2504          break;
2505       default:
2506          copy_mask |= (1 << i);
2507          swizzle[i] = GET_SWZ(s, i);
2508          break;
2509       }
2510    }
2511
2512    if (copy_mask) {
2513       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2514       swizzled_result.writemask = copy_mask;
2515       emit(MOV(swizzled_result, orig_val));
2516    }
2517
2518    if (zero_mask) {
2519       swizzled_result.writemask = zero_mask;
2520       emit(MOV(swizzled_result, src_reg(0.0f)));
2521    }
2522
2523    if (one_mask) {
2524       swizzled_result.writemask = one_mask;
2525       emit(MOV(swizzled_result, src_reg(1.0f)));
2526    }
2527 }
2528
2529 void
2530 vec4_visitor::visit(ir_return *ir)
2531 {
2532    assert(!"not reached");
2533 }
2534
2535 void
2536 vec4_visitor::visit(ir_discard *ir)
2537 {
2538    assert(!"not reached");
2539 }
2540
2541 void
2542 vec4_visitor::visit(ir_if *ir)
2543 {
2544    /* Don't point the annotation at the if statement, because then it plus
2545     * the then and else blocks get printed.
2546     */
2547    this->base_ir = ir->condition;
2548
2549    if (brw->gen == 6) {
2550       emit_if_gen6(ir);
2551    } else {
2552       uint32_t predicate;
2553       emit_bool_to_cond_code(ir->condition, &predicate);
2554       emit(IF(predicate));
2555    }
2556
2557    visit_instructions(&ir->then_instructions);
2558
2559    if (!ir->else_instructions.is_empty()) {
2560       this->base_ir = ir->condition;
2561       emit(BRW_OPCODE_ELSE);
2562
2563       visit_instructions(&ir->else_instructions);
2564    }
2565
2566    this->base_ir = ir->condition;
2567    emit(BRW_OPCODE_ENDIF);
2568 }
2569
2570 void
2571 vec4_visitor::visit(ir_emit_vertex *)
2572 {
2573    assert(!"not reached");
2574 }
2575
2576 void
2577 vec4_visitor::visit(ir_end_primitive *)
2578 {
2579    assert(!"not reached");
2580 }
2581
2582 void
2583 vec4_visitor::emit_ndc_computation()
2584 {
2585    /* Get the position */
2586    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2587
2588    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2589    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2590    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2591
2592    current_annotation = "NDC";
2593    dst_reg ndc_w = ndc;
2594    ndc_w.writemask = WRITEMASK_W;
2595    src_reg pos_w = pos;
2596    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2597    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2598
2599    dst_reg ndc_xyz = ndc;
2600    ndc_xyz.writemask = WRITEMASK_XYZ;
2601
2602    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2603 }
2604
2605 void
2606 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2607 {
2608    if (brw->gen < 6 &&
2609        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2610         key->userclip_active || brw->has_negative_rhw_bug)) {
2611       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2612       dst_reg header1_w = header1;
2613       header1_w.writemask = WRITEMASK_W;
2614
2615       emit(MOV(header1, 0u));
2616
2617       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2618          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2619
2620          current_annotation = "Point size";
2621          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2622          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2623       }
2624
2625       if (key->userclip_active) {
2626          current_annotation = "Clipping flags";
2627          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2628          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2629
2630          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2631          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2632          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2633
2634          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2635          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2636          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2637          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2638       }
2639
2640       /* i965 clipping workaround:
2641        * 1) Test for -ve rhw
2642        * 2) If set,
2643        *      set ndc = (0,0,0,0)
2644        *      set ucp[6] = 1
2645        *
2646        * Later, clipping will detect ucp[6] and ensure the primitive is
2647        * clipped against all fixed planes.
2648        */
2649       if (brw->has_negative_rhw_bug) {
2650          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2651          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2652          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2653          vec4_instruction *inst;
2654          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2655          inst->predicate = BRW_PREDICATE_NORMAL;
2656          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2657          inst->predicate = BRW_PREDICATE_NORMAL;
2658       }
2659
2660       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2661    } else if (brw->gen < 6) {
2662       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2663    } else {
2664       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2665       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2666          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2667                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2668       }
2669       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2670          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2671                   src_reg(output_reg[VARYING_SLOT_LAYER])));
2672       }
2673    }
2674 }
2675
2676 void
2677 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2678 {
2679    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2680     *
2681     *     "If a linked set of shaders forming the vertex stage contains no
2682     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2683     *     application has requested clipping against user clip planes through
2684     *     the API, then the coordinate written to gl_Position is used for
2685     *     comparison against the user clip planes."
2686     *
2687     * This function is only called if the shader didn't write to
2688     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2689     * if the user wrote to it; otherwise we use gl_Position.
2690     */
2691    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2692    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2693       clip_vertex = VARYING_SLOT_POS;
2694    }
2695
2696    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2697         ++i) {
2698       reg.writemask = 1 << i;
2699       emit(DP4(reg,
2700                src_reg(output_reg[clip_vertex]),
2701                src_reg(this->userplane[i + offset])));
2702    }
2703 }
2704
2705 void
2706 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2707 {
2708    assert (varying < VARYING_SLOT_MAX);
2709    reg.type = output_reg[varying].type;
2710    current_annotation = output_reg_annotation[varying];
2711    /* Copy the register, saturating if necessary */
2712    vec4_instruction *inst = emit(MOV(reg,
2713                                      src_reg(output_reg[varying])));
2714    if ((varying == VARYING_SLOT_COL0 ||
2715         varying == VARYING_SLOT_COL1 ||
2716         varying == VARYING_SLOT_BFC0 ||
2717         varying == VARYING_SLOT_BFC1) &&
2718        key->clamp_vertex_color) {
2719       inst->saturate = true;
2720    }
2721 }
2722
2723 void
2724 vec4_visitor::emit_urb_slot(int mrf, int varying)
2725 {
2726    struct brw_reg hw_reg = brw_message_reg(mrf);
2727    dst_reg reg = dst_reg(MRF, mrf);
2728    reg.type = BRW_REGISTER_TYPE_F;
2729
2730    switch (varying) {
2731    case VARYING_SLOT_PSIZ:
2732       /* PSIZ is always in slot 0, and is coupled with other flags. */
2733       current_annotation = "indices, point width, clip flags";
2734       emit_psiz_and_flags(hw_reg);
2735       break;
2736    case BRW_VARYING_SLOT_NDC:
2737       current_annotation = "NDC";
2738       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2739       break;
2740    case VARYING_SLOT_POS:
2741       current_annotation = "gl_Position";
2742       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2743       break;
2744    case VARYING_SLOT_EDGE:
2745       /* This is present when doing unfilled polygons.  We're supposed to copy
2746        * the edge flag from the user-provided vertex array
2747        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2748        * of that attribute (starts as 1.0f).  This is then used in clipping to
2749        * determine which edges should be drawn as wireframe.
2750        */
2751       current_annotation = "edge flag";
2752       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2753                                     glsl_type::float_type, WRITEMASK_XYZW))));
2754       break;
2755    case BRW_VARYING_SLOT_PAD:
2756       /* No need to write to this slot */
2757       break;
2758    default:
2759       emit_generic_urb_slot(reg, varying);
2760       break;
2761    }
2762 }
2763
2764 static int
2765 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2766 {
2767    if (brw->gen >= 6) {
2768       /* URB data written (does not include the message header reg) must
2769        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2770        * section 5.4.3.2.2: URB_INTERLEAVED.
2771        *
2772        * URB entries are allocated on a multiple of 1024 bits, so an
2773        * extra 128 bits written here to make the end align to 256 is
2774        * no problem.
2775        */
2776       if ((mlen % 2) != 1)
2777          mlen++;
2778    }
2779
2780    return mlen;
2781 }
2782
2783 void
2784 vec4_vs_visitor::emit_urb_write_header(int mrf)
2785 {
2786    /* No need to do anything for VS; an implied write to this MRF will be
2787     * performed by VS_OPCODE_URB_WRITE.
2788     */
2789    (void) mrf;
2790 }
2791
2792 vec4_instruction *
2793 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2794 {
2795    /* For VS, the URB writes end the thread. */
2796    if (complete) {
2797       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2798          emit_shader_time_end();
2799    }
2800
2801    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2802    inst->urb_write_flags = complete ?
2803       BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
2804
2805    return inst;
2806 }
2807
2808 /**
2809  * Generates the VUE payload plus the necessary URB write instructions to
2810  * output it.
2811  *
2812  * The VUE layout is documented in Volume 2a.
2813  */
2814 void
2815 vec4_visitor::emit_vertex()
2816 {
2817    /* MRF 0 is reserved for the debugger, so start with message header
2818     * in MRF 1.
2819     */
2820    int base_mrf = 1;
2821    int mrf = base_mrf;
2822    /* In the process of generating our URB write message contents, we
2823     * may need to unspill a register or load from an array.  Those
2824     * reads would use MRFs 14-15.
2825     */
2826    int max_usable_mrf = 13;
2827
2828    /* The following assertion verifies that max_usable_mrf causes an
2829     * even-numbered amount of URB write data, which will meet gen6's
2830     * requirements for length alignment.
2831     */
2832    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2833
2834    /* First mrf is the g0-based message header containing URB handles and
2835     * such.
2836     */
2837    emit_urb_write_header(mrf++);
2838
2839    if (brw->gen < 6) {
2840       emit_ndc_computation();
2841    }
2842
2843    /* Lower legacy ff and ClipVertex clipping to clip distances */
2844    if (key->userclip_active && !key->uses_clip_distance) {
2845       current_annotation = "user clip distances";
2846
2847       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2848       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2849
2850       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2851       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2852    }
2853
2854    /* Set up the VUE data for the first URB write */
2855    int slot;
2856    for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2857       emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2858
2859       /* If this was max_usable_mrf, we can't fit anything more into this URB
2860        * WRITE.
2861        */
2862       if (mrf > max_usable_mrf) {
2863          slot++;
2864          break;
2865       }
2866    }
2867
2868    bool complete = slot >= prog_data->vue_map.num_slots;
2869    current_annotation = "URB write";
2870    vec4_instruction *inst = emit_urb_write_opcode(complete);
2871    inst->base_mrf = base_mrf;
2872    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2873
2874    /* Optional second URB write */
2875    if (!complete) {
2876       mrf = base_mrf + 1;
2877
2878       for (; slot < prog_data->vue_map.num_slots; ++slot) {
2879          assert(mrf < max_usable_mrf);
2880
2881          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2882       }
2883
2884       current_annotation = "URB write";
2885       inst = emit_urb_write_opcode(true /* complete */);
2886       inst->base_mrf = base_mrf;
2887       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2888       /* URB destination offset.  In the previous write, we got MRFs
2889        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2890        * URB row increments, and each of our MRFs is half of one of
2891        * those, since we're doing interleaved writes.
2892        */
2893       inst->offset = (max_usable_mrf - base_mrf) / 2;
2894    }
2895 }
2896
2897 void
2898 vec4_vs_visitor::emit_thread_end()
2899 {
2900    /* For VS, we always end the thread by emitting a single vertex.
2901     * emit_urb_write_opcode() will take care of setting the eot flag on the
2902     * SEND instruction.
2903     */
2904    emit_vertex();
2905 }
2906
2907 src_reg
2908 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2909                                  src_reg *reladdr, int reg_offset)
2910 {
2911    /* Because we store the values to scratch interleaved like our
2912     * vertex data, we need to scale the vec4 index by 2.
2913     */
2914    int message_header_scale = 2;
2915
2916    /* Pre-gen6, the message header uses byte offsets instead of vec4
2917     * (16-byte) offset units.
2918     */
2919    if (brw->gen < 6)
2920       message_header_scale *= 16;
2921
2922    if (reladdr) {
2923       src_reg index = src_reg(this, glsl_type::int_type);
2924
2925       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2926       emit_before(inst, MUL(dst_reg(index),
2927                             index, src_reg(message_header_scale)));
2928
2929       return index;
2930    } else {
2931       return src_reg(reg_offset * message_header_scale);
2932    }
2933 }
2934
2935 src_reg
2936 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2937                                        src_reg *reladdr, int reg_offset)
2938 {
2939    if (reladdr) {
2940       src_reg index = src_reg(this, glsl_type::int_type);
2941
2942       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2943
2944       /* Pre-gen6, the message header uses byte offsets instead of vec4
2945        * (16-byte) offset units.
2946        */
2947       if (brw->gen < 6) {
2948          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2949       }
2950
2951       return index;
2952    } else {
2953       int message_header_scale = brw->gen < 6 ? 16 : 1;
2954       return src_reg(reg_offset * message_header_scale);
2955    }
2956 }
2957
2958 /**
2959  * Emits an instruction before @inst to load the value named by @orig_src
2960  * from scratch space at @base_offset to @temp.
2961  *
2962  * @base_offset is measured in 32-byte units (the size of a register).
2963  */
2964 void
2965 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2966                                 dst_reg temp, src_reg orig_src,
2967                                 int base_offset)
2968 {
2969    int reg_offset = base_offset + orig_src.reg_offset;
2970    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2971
2972    emit_before(inst, SCRATCH_READ(temp, index));
2973 }
2974
2975 /**
2976  * Emits an instruction after @inst to store the value to be written
2977  * to @orig_dst to scratch space at @base_offset, from @temp.
2978  *
2979  * @base_offset is measured in 32-byte units (the size of a register).
2980  */
2981 void
2982 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2983 {
2984    int reg_offset = base_offset + inst->dst.reg_offset;
2985    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2986
2987    /* Create a temporary register to store *inst's result in.
2988     *
2989     * We have to be careful in MOVing from our temporary result register in
2990     * the scratch write.  If we swizzle from channels of the temporary that
2991     * weren't initialized, it will confuse live interval analysis, which will
2992     * make spilling fail to make progress.
2993     */
2994    src_reg temp = src_reg(this, glsl_type::vec4_type);
2995    temp.type = inst->dst.type;
2996    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2997    int swizzles[4];
2998    for (int i = 0; i < 4; i++)
2999       if (inst->dst.writemask & (1 << i))
3000          swizzles[i] = i;
3001       else
3002          swizzles[i] = first_writemask_chan;
3003    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3004                                swizzles[2], swizzles[3]);
3005
3006    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3007                                        inst->dst.writemask));
3008    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3009    write->predicate = inst->predicate;
3010    write->ir = inst->ir;
3011    write->annotation = inst->annotation;
3012    inst->insert_after(write);
3013
3014    inst->dst.file = temp.file;
3015    inst->dst.reg = temp.reg;
3016    inst->dst.reg_offset = temp.reg_offset;
3017    inst->dst.reladdr = NULL;
3018 }
3019
3020 /**
3021  * We can't generally support array access in GRF space, because a
3022  * single instruction's destination can only span 2 contiguous
3023  * registers.  So, we send all GRF arrays that get variable index
3024  * access to scratch space.
3025  */
3026 void
3027 vec4_visitor::move_grf_array_access_to_scratch()
3028 {
3029    int scratch_loc[this->virtual_grf_count];
3030
3031    for (int i = 0; i < this->virtual_grf_count; i++) {
3032       scratch_loc[i] = -1;
3033    }
3034
3035    /* First, calculate the set of virtual GRFs that need to be punted
3036     * to scratch due to having any array access on them, and where in
3037     * scratch.
3038     */
3039    foreach_list(node, &this->instructions) {
3040       vec4_instruction *inst = (vec4_instruction *)node;
3041
3042       if (inst->dst.file == GRF && inst->dst.reladdr &&
3043           scratch_loc[inst->dst.reg] == -1) {
3044          scratch_loc[inst->dst.reg] = c->last_scratch;
3045          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3046       }
3047
3048       for (int i = 0 ; i < 3; i++) {
3049          src_reg *src = &inst->src[i];
3050
3051          if (src->file == GRF && src->reladdr &&
3052              scratch_loc[src->reg] == -1) {
3053             scratch_loc[src->reg] = c->last_scratch;
3054             c->last_scratch += this->virtual_grf_sizes[src->reg];
3055          }
3056       }
3057    }
3058
3059    /* Now, for anything that will be accessed through scratch, rewrite
3060     * it to load/store.  Note that this is a _safe list walk, because
3061     * we may generate a new scratch_write instruction after the one
3062     * we're processing.
3063     */
3064    foreach_list_safe(node, &this->instructions) {
3065       vec4_instruction *inst = (vec4_instruction *)node;
3066
3067       /* Set up the annotation tracking for new generated instructions. */
3068       base_ir = inst->ir;
3069       current_annotation = inst->annotation;
3070
3071       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3072          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3073       }
3074
3075       for (int i = 0 ; i < 3; i++) {
3076          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3077             continue;
3078
3079          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3080
3081          emit_scratch_read(inst, temp, inst->src[i],
3082                            scratch_loc[inst->src[i].reg]);
3083
3084          inst->src[i].file = temp.file;
3085          inst->src[i].reg = temp.reg;
3086          inst->src[i].reg_offset = temp.reg_offset;
3087          inst->src[i].reladdr = NULL;
3088       }
3089    }
3090 }
3091
3092 /**
3093  * Emits an instruction before @inst to load the value named by @orig_src
3094  * from the pull constant buffer (surface) at @base_offset to @temp.
3095  */
3096 void
3097 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3098                                       dst_reg temp, src_reg orig_src,
3099                                       int base_offset)
3100 {
3101    int reg_offset = base_offset + orig_src.reg_offset;
3102    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
3103    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3104    vec4_instruction *load;
3105
3106    if (brw->gen >= 7) {
3107       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3108       grf_offset.type = offset.type;
3109       emit_before(inst, MOV(grf_offset, offset));
3110
3111       load = new(mem_ctx) vec4_instruction(this,
3112                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3113                                            temp, index, src_reg(grf_offset));
3114    } else {
3115       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3116                                            temp, index, offset);
3117       load->base_mrf = 14;
3118       load->mlen = 1;
3119    }
3120    emit_before(inst, load);
3121 }
3122
3123 /**
3124  * Implements array access of uniforms by inserting a
3125  * PULL_CONSTANT_LOAD instruction.
3126  *
3127  * Unlike temporary GRF array access (where we don't support it due to
3128  * the difficulty of doing relative addressing on instruction
3129  * destinations), we could potentially do array access of uniforms
3130  * that were loaded in GRF space as push constants.  In real-world
3131  * usage we've seen, though, the arrays being used are always larger
3132  * than we could load as push constants, so just always move all
3133  * uniform array access out to a pull constant buffer.
3134  */
3135 void
3136 vec4_visitor::move_uniform_array_access_to_pull_constants()
3137 {
3138    int pull_constant_loc[this->uniforms];
3139
3140    for (int i = 0; i < this->uniforms; i++) {
3141       pull_constant_loc[i] = -1;
3142    }
3143
3144    /* Walk through and find array access of uniforms.  Put a copy of that
3145     * uniform in the pull constant buffer.
3146     *
3147     * Note that we don't move constant-indexed accesses to arrays.  No
3148     * testing has been done of the performance impact of this choice.
3149     */
3150    foreach_list_safe(node, &this->instructions) {
3151       vec4_instruction *inst = (vec4_instruction *)node;
3152
3153       for (int i = 0 ; i < 3; i++) {
3154          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3155             continue;
3156
3157          int uniform = inst->src[i].reg;
3158
3159          /* If this array isn't already present in the pull constant buffer,
3160           * add it.
3161           */
3162          if (pull_constant_loc[uniform] == -1) {
3163             const float **values = &prog_data->param[uniform * 4];
3164
3165             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3166
3167             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3168                prog_data->pull_param[prog_data->nr_pull_params++]
3169                   = values[j];
3170             }
3171          }
3172
3173          /* Set up the annotation tracking for new generated instructions. */
3174          base_ir = inst->ir;
3175          current_annotation = inst->annotation;
3176
3177          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3178
3179          emit_pull_constant_load(inst, temp, inst->src[i],
3180                                  pull_constant_loc[uniform]);
3181
3182          inst->src[i].file = temp.file;
3183          inst->src[i].reg = temp.reg;
3184          inst->src[i].reg_offset = temp.reg_offset;
3185          inst->src[i].reladdr = NULL;
3186       }
3187    }
3188
3189    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3190     * no need to track them as larger-than-vec4 objects.  This will be
3191     * relied on in cutting out unused uniform vectors from push
3192     * constants.
3193     */
3194    split_uniform_registers();
3195 }
3196
3197 void
3198 vec4_visitor::resolve_ud_negate(src_reg *reg)
3199 {
3200    if (reg->type != BRW_REGISTER_TYPE_UD ||
3201        !reg->negate)
3202       return;
3203
3204    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3205    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3206    *reg = temp;
3207 }
3208
3209 vec4_visitor::vec4_visitor(struct brw_context *brw,
3210                            struct brw_vec4_compile *c,
3211                            struct gl_program *prog,
3212                            const struct brw_vec4_prog_key *key,
3213                            struct brw_vec4_prog_data *prog_data,
3214                            struct gl_shader_program *shader_prog,
3215                            struct brw_shader *shader,
3216                            void *mem_ctx,
3217                            bool debug_flag)
3218    : debug_flag(debug_flag)
3219 {
3220    this->brw = brw;
3221    this->ctx = &brw->ctx;
3222    this->shader_prog = shader_prog;
3223    this->shader = shader;
3224
3225    this->mem_ctx = mem_ctx;
3226    this->failed = false;
3227
3228    this->base_ir = NULL;
3229    this->current_annotation = NULL;
3230    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3231
3232    this->c = c;
3233    this->prog = prog;
3234    this->key = key;
3235    this->prog_data = prog_data;
3236
3237    this->variable_ht = hash_table_ctor(0,
3238                                        hash_table_pointer_hash,
3239                                        hash_table_pointer_compare);
3240
3241    this->virtual_grf_start = NULL;
3242    this->virtual_grf_end = NULL;
3243    this->virtual_grf_sizes = NULL;
3244    this->virtual_grf_count = 0;
3245    this->virtual_grf_reg_map = NULL;
3246    this->virtual_grf_reg_count = 0;
3247    this->virtual_grf_array_size = 0;
3248    this->live_intervals_valid = false;
3249
3250    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3251
3252    this->uniforms = 0;
3253 }
3254
3255 vec4_visitor::~vec4_visitor()
3256 {
3257    hash_table_dtor(this->variable_ht);
3258 }
3259
3260
3261 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3262                                  struct brw_vs_compile *vs_compile,
3263                                  struct brw_vs_prog_data *vs_prog_data,
3264                                  struct gl_shader_program *prog,
3265                                  struct brw_shader *shader,
3266                                  void *mem_ctx)
3267    : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3268                   &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3269                   mem_ctx, INTEL_DEBUG & DEBUG_VS),
3270      vs_compile(vs_compile),
3271      vs_prog_data(vs_prog_data)
3272 {
3273 }
3274
3275
3276 void
3277 vec4_visitor::fail(const char *format, ...)
3278 {
3279    va_list va;
3280    char *msg;
3281
3282    if (failed)
3283       return;
3284
3285    failed = true;
3286
3287    va_start(va, format);
3288    msg = ralloc_vasprintf(mem_ctx, format, va);
3289    va_end(va);
3290    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3291
3292    this->fail_msg = msg;
3293
3294    if (debug_flag) {
3295       fprintf(stderr, "%s",  msg);
3296    }
3297 }
3298
3299 } /* namespace brw */