src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_vs.h"
  26 #include "glsl/ir_uniform.h"
  27 extern "C" {
  28 #include "main/context.h"
  29 #include "main/macros.h"
  30 #include "program/prog_parameter.h"
  31 #include "program/sampler.h"
  32 }
  33
  34 namespace brw {
  35
  36 vec4_instruction::vec4_instruction(vec4_visitor *v,
  37                                    enum opcode opcode, dst_reg dst,
  38                                    src_reg src0, src_reg src1, src_reg src2)
  39 {
  40    this->opcode = opcode;
  41    this->dst = dst;
  42    this->src[0] = src0;
  43    this->src[1] = src1;
  44    this->src[2] = src2;
  45    this->ir = v->base_ir;
  46    this->annotation = v->current_annotation;
  47 }
  48
  49 vec4_instruction *
  50 vec4_visitor::emit(vec4_instruction *inst)
  51 {
  52    this->instructions.push_tail(inst);
  53
  54    return inst;
  55 }
  56
  57 vec4_instruction *
  58 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  59 {
  60    new_inst->ir = inst->ir;
  61    new_inst->annotation = inst->annotation;
  62
  63    inst->insert_before(new_inst);
  64
  65    return inst;
  66 }
  67
  68 vec4_instruction *
  69 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  70                    src_reg src0, src_reg src1, src_reg src2)
  71 {
  72    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  73                                              src0, src1, src2));
  74 }
  75
  76
  77 vec4_instruction *
  78 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  79 {
  80    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  81 }
  82
  83 vec4_instruction *
  84 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  85 {
  86    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  87 }
  88
  89 vec4_instruction *
  90 vec4_visitor::emit(enum opcode opcode)
  91 {
  92    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  93 }
  94
  95 #define ALU1(op)                                                        \
  96    vec4_instruction *                                                   \
  97    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  98    {                                                                    \
  99       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 100                                            src0);                       \
 101    }
 102
 103 #define ALU2(op)                                                        \
 104    vec4_instruction *                                                   \
 105    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 106    {                                                                    \
 107       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 108                                            src0, src1);                 \
 109    }
 110
 111 #define ALU3(op)                                                        \
 112    vec4_instruction *                                                   \
 113    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
 114    {                                                                    \
 115       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 116                                            src0, src1, src2);           \
 117    }
 118
 119 ALU1(NOT)
 120 ALU1(MOV)
 121 ALU1(FRC)
 122 ALU1(RNDD)
 123 ALU1(RNDE)
 124 ALU1(RNDZ)
 125 ALU1(F32TO16)
 126 ALU1(F16TO32)
 127 ALU2(ADD)
 128 ALU2(MUL)
 129 ALU2(MACH)
 130 ALU2(AND)
 131 ALU2(OR)
 132 ALU2(XOR)
 133 ALU2(DP3)
 134 ALU2(DP4)
 135 ALU2(DPH)
 136 ALU2(SHL)
 137 ALU2(SHR)
 138 ALU2(ASR)
 139 ALU3(LRP)
 140 ALU1(BFREV)
 141 ALU3(BFE)
 142 ALU2(BFI1)
 143 ALU3(BFI2)
 144 ALU1(FBH)
 145 ALU1(FBL)
 146 ALU1(CBIT)
 147
 148 /** Gen4 predicated IF. */
 149 vec4_instruction *
 150 vec4_visitor::IF(uint32_t predicate)
 151 {
 152    vec4_instruction *inst;
 153
 154    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 155    inst->predicate = predicate;
 156
 157    return inst;
 158 }
 159
 160 /** Gen6+ IF with embedded comparison. */
 161 vec4_instruction *
 162 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 163 {
 164    assert(brw->gen >= 6);
 165
 166    vec4_instruction *inst;
 167
 168    resolve_ud_negate(&src0);
 169    resolve_ud_negate(&src1);
 170
 171    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 172                                         src0, src1);
 173    inst->conditional_mod = condition;
 174
 175    return inst;
 176 }
 177
 178 /**
 179  * CMP: Sets the low bit of the destination channels with the result
 180  * of the comparison, while the upper bits are undefined, and updates
 181  * the flag register with the packed 16 bits of the result.
 182  */
 183 vec4_instruction *
 184 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 185 {
 186    vec4_instruction *inst;
 187
 188    /* original gen4 does type conversion to the destination type
 189     * before before comparison, producing garbage results for floating
 190     * point comparisons.
 191     */
 192    if (brw->gen == 4) {
 193       dst.type = src0.type;
 194       if (dst.file == HW_REG)
 195          dst.fixed_hw_reg.type = dst.type;
 196    }
 197
 198    resolve_ud_negate(&src0);
 199    resolve_ud_negate(&src1);
 200
 201    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 202    inst->conditional_mod = condition;
 203
 204    return inst;
 205 }
 206
 207 vec4_instruction *
 208 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 209 {
 210    vec4_instruction *inst;
 211
 212    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 213                                         dst, index);
 214    inst->base_mrf = 14;
 215    inst->mlen = 2;
 216
 217    return inst;
 218 }
 219
 220 vec4_instruction *
 221 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 222 {
 223    vec4_instruction *inst;
 224
 225    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 226                                         dst, src, index);
 227    inst->base_mrf = 13;
 228    inst->mlen = 3;
 229
 230    return inst;
 231 }
 232
 233 void
 234 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 235 {
 236    static enum opcode dot_opcodes[] = {
 237       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 238    };
 239
 240    emit(dot_opcodes[elements - 2], dst, src0, src1);
 241 }
 242
 243 src_reg
 244 vec4_visitor::fix_3src_operand(src_reg src)
 245 {
 246    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
 247     * able to use vertical stride of zero to replicate the vec4 uniform, like
 248     *
 249     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
 250     *
 251     * But you can't, since vertical stride is always four in three-source
 252     * instructions. Instead, insert a MOV instruction to do the replication so
 253     * that the three-source instruction can consume it.
 254     */
 255
 256    /* The MOV is only needed if the source is a uniform or immediate. */
 257    if (src.file != UNIFORM && src.file != IMM)
 258       return src;
 259
 260    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 261    expanded.type = src.type;
 262    emit(MOV(expanded, src));
 263    return src_reg(expanded);
 264 }
 265
 266 src_reg
 267 vec4_visitor::fix_math_operand(src_reg src)
 268 {
 269    /* The gen6 math instruction ignores the source modifiers --
 270     * swizzle, abs, negate, and at least some parts of the register
 271     * region description.
 272     *
 273     * Rather than trying to enumerate all these cases, *always* expand the
 274     * operand to a temp GRF for gen6.
 275     *
 276     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 277     * can't use.
 278     */
 279
 280    if (brw->gen == 7 && src.file != IMM)
 281       return src;
 282
 283    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 284    expanded.type = src.type;
 285    emit(MOV(expanded, src));
 286    return src_reg(expanded);
 287 }
 288
 289 void
 290 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 291 {
 292    src = fix_math_operand(src);
 293
 294    if (dst.writemask != WRITEMASK_XYZW) {
 295       /* The gen6 math instruction must be align1, so we can't do
 296        * writemasks.
 297        */
 298       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 299
 300       emit(opcode, temp_dst, src);
 301
 302       emit(MOV(dst, src_reg(temp_dst)));
 303    } else {
 304       emit(opcode, dst, src);
 305    }
 306 }
 307
 308 void
 309 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 310 {
 311    vec4_instruction *inst = emit(opcode, dst, src);
 312    inst->base_mrf = 1;
 313    inst->mlen = 1;
 314 }
 315
 316 void
 317 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 318 {
 319    switch (opcode) {
 320    case SHADER_OPCODE_RCP:
 321    case SHADER_OPCODE_RSQ:
 322    case SHADER_OPCODE_SQRT:
 323    case SHADER_OPCODE_EXP2:
 324    case SHADER_OPCODE_LOG2:
 325    case SHADER_OPCODE_SIN:
 326    case SHADER_OPCODE_COS:
 327       break;
 328    default:
 329       assert(!"not reached: bad math opcode");
 330       return;
 331    }
 332
 333    if (brw->gen >= 6) {
 334       return emit_math1_gen6(opcode, dst, src);
 335    } else {
 336       return emit_math1_gen4(opcode, dst, src);
 337    }
 338 }
 339
 340 void
 341 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 342                               dst_reg dst, src_reg src0, src_reg src1)
 343 {
 344    src0 = fix_math_operand(src0);
 345    src1 = fix_math_operand(src1);
 346
 347    if (dst.writemask != WRITEMASK_XYZW) {
 348       /* The gen6 math instruction must be align1, so we can't do
 349        * writemasks.
 350        */
 351       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 352       temp_dst.type = dst.type;
 353
 354       emit(opcode, temp_dst, src0, src1);
 355
 356       emit(MOV(dst, src_reg(temp_dst)));
 357    } else {
 358       emit(opcode, dst, src0, src1);
 359    }
 360 }
 361
 362 void
 363 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 364                               dst_reg dst, src_reg src0, src_reg src1)
 365 {
 366    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 367    inst->base_mrf = 1;
 368    inst->mlen = 2;
 369 }
 370
 371 void
 372 vec4_visitor::emit_math(enum opcode opcode,
 373                         dst_reg dst, src_reg src0, src_reg src1)
 374 {
 375    switch (opcode) {
 376    case SHADER_OPCODE_POW:
 377    case SHADER_OPCODE_INT_QUOTIENT:
 378    case SHADER_OPCODE_INT_REMAINDER:
 379       break;
 380    default:
 381       assert(!"not reached: unsupported binary math opcode");
 382       return;
 383    }
 384
 385    if (brw->gen >= 6) {
 386       return emit_math2_gen6(opcode, dst, src0, src1);
 387    } else {
 388       return emit_math2_gen4(opcode, dst, src0, src1);
 389    }
 390 }
 391
 392 void
 393 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 394 {
 395    if (brw->gen < 7)
 396       assert(!"ir_unop_pack_half_2x16 should be lowered");
 397
 398    assert(dst.type == BRW_REGISTER_TYPE_UD);
 399    assert(src0.type == BRW_REGISTER_TYPE_F);
 400
 401    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
 402     *
 403     *   Because this instruction does not have a 16-bit floating-point type,
 404     *   the destination data type must be Word (W).
 405     *
 406     *   The destination must be DWord-aligned and specify a horizontal stride
 407     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
 408     *   each destination channel and the upper word is not modified.
 409     *
 410     * The above restriction implies that the f32to16 instruction must use
 411     * align1 mode, because only in align1 mode is it possible to specify
 412     * horizontal stride.  We choose here to defy the hardware docs and emit
 413     * align16 instructions.
 414     *
 415     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
 416     * instructions. I was partially successful in that the code passed all
 417     * tests.  However, the code was dubiously correct and fragile, and the
 418     * tests were not harsh enough to probe that frailty. Not trusting the
 419     * code, I chose instead to remain in align16 mode in defiance of the hw
 420     * docs).
 421     *
 422     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
 423     * simulator, emitting a f32to16 in align16 mode with UD as destination
 424     * data type is safe. The behavior differs from that specified in the PRM
 425     * in that the upper word of each destination channel is cleared to 0.
 426     */
 427
 428    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 429    src_reg tmp_src(tmp_dst);
 430
 431 #if 0
 432    /* Verify the undocumented behavior on which the following instructions
 433     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
 434     * then the result of the bit-or instruction below will be incorrect.
 435     *
 436     * You should inspect the disasm output in order to verify that the MOV is
 437     * not optimized away.
 438     */
 439    emit(MOV(tmp_dst, src_reg(0x12345678u)));
 440 #endif
 441
 442    /* Give tmp the form below, where "." means untouched.
 443     *
 444     *     w z          y          x w z          y          x
 445     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
 446     *
 447     * That the upper word of each write-channel be 0 is required for the
 448     * following bit-shift and bit-or instructions to work. Note that this
 449     * relies on the undocumented hardware behavior mentioned above.
 450     */
 451    tmp_dst.writemask = WRITEMASK_XY;
 452    emit(F32TO16(tmp_dst, src0));
 453
 454    /* Give the write-channels of dst the form:
 455     *   0xhhhh0000
 456     */
 457    tmp_src.swizzle = SWIZZLE_Y;
 458    emit(SHL(dst, tmp_src, src_reg(16u)));
 459
 460    /* Finally, give the write-channels of dst the form of packHalf2x16's
 461     * output:
 462     *   0xhhhhllll
 463     */
 464    tmp_src.swizzle = SWIZZLE_X;
 465    emit(OR(dst, src_reg(dst), tmp_src));
 466 }
 467
 468 void
 469 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 470 {
 471    if (brw->gen < 7)
 472       assert(!"ir_unop_unpack_half_2x16 should be lowered");
 473
 474    assert(dst.type == BRW_REGISTER_TYPE_F);
 475    assert(src0.type == BRW_REGISTER_TYPE_UD);
 476
 477    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
 478     *
 479     *   Because this instruction does not have a 16-bit floating-point type,
 480     *   the source data type must be Word (W). The destination type must be
 481     *   F (Float).
 482     *
 483     * To use W as the source data type, we must adjust horizontal strides,
 484     * which is only possible in align1 mode. All my [chadv] attempts at
 485     * emitting align1 instructions for unpackHalf2x16 failed to pass the
 486     * Piglit tests, so I gave up.
 487     *
 488     * I've verified that, on gen7 hardware and the simulator, it is safe to
 489     * emit f16to32 in align16 mode with UD as source data type.
 490     */
 491
 492    dst_reg tmp_dst(this, glsl_type::uvec2_type);
 493    src_reg tmp_src(tmp_dst);
 494
 495    tmp_dst.writemask = WRITEMASK_X;
 496    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
 497
 498    tmp_dst.writemask = WRITEMASK_Y;
 499    emit(SHR(tmp_dst, src0, src_reg(16u)));
 500
 501    dst.writemask = WRITEMASK_XY;
 502    emit(F16TO32(dst, tmp_src));
 503 }
 504
 505 void
 506 vec4_visitor::visit_instructions(const exec_list *list)
 507 {
 508    foreach_list(node, list) {
 509       ir_instruction *ir = (ir_instruction *)node;
 510
 511       base_ir = ir;
 512       ir->accept(this);
 513    }
 514 }
 515
 516
 517 static int
 518 type_size(const struct glsl_type *type)
 519 {
 520    unsigned int i;
 521    int size;
 522
 523    switch (type->base_type) {
 524    case GLSL_TYPE_UINT:
 525    case GLSL_TYPE_INT:
 526    case GLSL_TYPE_FLOAT:
 527    case GLSL_TYPE_BOOL:
 528       if (type->is_matrix()) {
 529          return type->matrix_columns;
 530       } else {
 531          /* Regardless of size of vector, it gets a vec4. This is bad
 532           * packing for things like floats, but otherwise arrays become a
 533           * mess.  Hopefully a later pass over the code can pack scalars
 534           * down if appropriate.
 535           */
 536          return 1;
 537       }
 538    case GLSL_TYPE_ARRAY:
 539       assert(type->length > 0);
 540       return type_size(type->fields.array) * type->length;
 541    case GLSL_TYPE_STRUCT:
 542       size = 0;
 543       for (i = 0; i < type->length; i++) {
 544          size += type_size(type->fields.structure[i].type);
 545       }
 546       return size;
 547    case GLSL_TYPE_SAMPLER:
 548       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 549        * at link time.
 550        */
 551       return 1;
 552    case GLSL_TYPE_VOID:
 553    case GLSL_TYPE_ERROR:
 554    case GLSL_TYPE_INTERFACE:
 555       assert(0);
 556       break;
 557    }
 558
 559    return 0;
 560 }
 561
 562 int
 563 vec4_visitor::virtual_grf_alloc(int size)
 564 {
 565    if (virtual_grf_array_size <= virtual_grf_count) {
 566       if (virtual_grf_array_size == 0)
 567          virtual_grf_array_size = 16;
 568       else
 569          virtual_grf_array_size *= 2;
 570       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 571                                    virtual_grf_array_size);
 572       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 573                                      virtual_grf_array_size);
 574    }
 575    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 576    virtual_grf_reg_count += size;
 577    virtual_grf_sizes[virtual_grf_count] = size;
 578    return virtual_grf_count++;
 579 }
 580
 581 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 582 {
 583    init();
 584
 585    this->file = GRF;
 586    this->reg = v->virtual_grf_alloc(type_size(type));
 587
 588    if (type->is_array() || type->is_record()) {
 589       this->swizzle = BRW_SWIZZLE_NOOP;
 590    } else {
 591       this->swizzle = swizzle_for_size(type->vector_elements);
 592    }
 593
 594    this->type = brw_type_for_base_type(type);
 595 }
 596
 597 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 598 {
 599    init();
 600
 601    this->file = GRF;
 602    this->reg = v->virtual_grf_alloc(type_size(type));
 603
 604    if (type->is_array() || type->is_record()) {
 605       this->writemask = WRITEMASK_XYZW;
 606    } else {
 607       this->writemask = (1 << type->vector_elements) - 1;
 608    }
 609
 610    this->type = brw_type_for_base_type(type);
 611 }
 612
 613 /* Our support for uniforms is piggy-backed on the struct
 614  * gl_fragment_program, because that's where the values actually
 615  * get stored, rather than in some global gl_shader_program uniform
 616  * store.
 617  */
 618 void
 619 vec4_visitor::setup_uniform_values(ir_variable *ir)
 620 {
 621    int namelen = strlen(ir->name);
 622
 623    /* The data for our (non-builtin) uniforms is stored in a series of
 624     * gl_uniform_driver_storage structs for each subcomponent that
 625     * glGetUniformLocation() could name.  We know it's been set up in the same
 626     * order we'd walk the type, so walk the list of storage and find anything
 627     * with our name, or the prefix of a component that starts with our name.
 628     */
 629    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 630       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 631
 632       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 633           (storage->name[namelen] != 0 &&
 634            storage->name[namelen] != '.' &&
 635            storage->name[namelen] != '[')) {
 636          continue;
 637       }
 638
 639       gl_constant_value *components = storage->storage;
 640       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 641                                storage->type->matrix_columns);
 642
 643       for (unsigned s = 0; s < vector_count; s++) {
 644          uniform_vector_size[uniforms] = storage->type->vector_elements;
 645
 646          int i;
 647          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 648             prog_data->param[uniforms * 4 + i] = &components->f;
 649             components++;
 650          }
 651          for (; i < 4; i++) {
 652             static float zero = 0;
 653             prog_data->param[uniforms * 4 + i] = &zero;
 654          }
 655
 656          uniforms++;
 657       }
 658    }
 659 }
 660
 661 void
 662 vec4_visitor::setup_uniform_clipplane_values()
 663 {
 664    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 665
 666    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
 667       this->uniform_vector_size[this->uniforms] = 4;
 668       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 669       this->userplane[i].type = BRW_REGISTER_TYPE_F;
 670       for (int j = 0; j < 4; ++j) {
 671          prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
 672       }
 673       ++this->uniforms;
 674    }
 675 }
 676
 677 /* Our support for builtin uniforms is even scarier than non-builtin.
 678  * It sits on top of the PROG_STATE_VAR parameters that are
 679  * automatically updated from GL context state.
 680  */
 681 void
 682 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 683 {
 684    const ir_state_slot *const slots = ir->state_slots;
 685    assert(ir->state_slots != NULL);
 686
 687    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 688       /* This state reference has already been setup by ir_to_mesa,
 689        * but we'll get the same index back here.  We can reference
 690        * ParameterValues directly, since unlike brw_fs.cpp, we never
 691        * add new state references during compile.
 692        */
 693       int index = _mesa_add_state_reference(this->prog->Parameters,
 694                                             (gl_state_index *)slots[i].tokens);
 695       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
 696
 697       this->uniform_vector_size[this->uniforms] = 0;
 698       /* Add each of the unique swizzled channels of the element.
 699        * This will end up matching the size of the glsl_type of this field.
 700        */
 701       int last_swiz = -1;
 702       for (unsigned int j = 0; j < 4; j++) {
 703          int swiz = GET_SWZ(slots[i].swizzle, j);
 704          last_swiz = swiz;
 705
 706          prog_data->param[this->uniforms * 4 + j] = &values[swiz];
 707          if (swiz <= last_swiz)
 708             this->uniform_vector_size[this->uniforms]++;
 709       }
 710       this->uniforms++;
 711    }
 712 }
 713
 714 dst_reg *
 715 vec4_visitor::variable_storage(ir_variable *var)
 716 {
 717    return (dst_reg *)hash_table_find(this->variable_ht, var);
 718 }
 719
 720 void
 721 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 722 {
 723    ir_expression *expr = ir->as_expression();
 724
 725    *predicate = BRW_PREDICATE_NORMAL;
 726
 727    if (expr) {
 728       src_reg op[2];
 729       vec4_instruction *inst;
 730
 731       assert(expr->get_num_operands() <= 2);
 732       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 733          expr->operands[i]->accept(this);
 734          op[i] = this->result;
 735
 736          resolve_ud_negate(&op[i]);
 737       }
 738
 739       switch (expr->operation) {
 740       case ir_unop_logic_not:
 741          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 742          inst->conditional_mod = BRW_CONDITIONAL_Z;
 743          break;
 744
 745       case ir_binop_logic_xor:
 746          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 747          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 748          break;
 749
 750       case ir_binop_logic_or:
 751          inst = emit(OR(dst_null_d(), op[0], op[1]));
 752          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 753          break;
 754
 755       case ir_binop_logic_and:
 756          inst = emit(AND(dst_null_d(), op[0], op[1]));
 757          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 758          break;
 759
 760       case ir_unop_f2b:
 761          if (brw->gen >= 6) {
 762             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 763          } else {
 764             inst = emit(MOV(dst_null_f(), op[0]));
 765             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 766          }
 767          break;
 768
 769       case ir_unop_i2b:
 770          if (brw->gen >= 6) {
 771             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 772          } else {
 773             inst = emit(MOV(dst_null_d(), op[0]));
 774             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 775          }
 776          break;
 777
 778       case ir_binop_all_equal:
 779          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 780          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 781          break;
 782
 783       case ir_binop_any_nequal:
 784          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 785          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 786          break;
 787
 788       case ir_unop_any:
 789          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 790          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 791          break;
 792
 793       case ir_binop_greater:
 794       case ir_binop_gequal:
 795       case ir_binop_less:
 796       case ir_binop_lequal:
 797       case ir_binop_equal:
 798       case ir_binop_nequal:
 799          emit(CMP(dst_null_d(), op[0], op[1],
 800                   brw_conditional_for_comparison(expr->operation)));
 801          break;
 802
 803       default:
 804          assert(!"not reached");
 805          break;
 806       }
 807       return;
 808    }
 809
 810    ir->accept(this);
 811
 812    resolve_ud_negate(&this->result);
 813
 814    if (brw->gen >= 6) {
 815       vec4_instruction *inst = emit(AND(dst_null_d(),
 816                                         this->result, src_reg(1)));
 817       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 818    } else {
 819       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 820       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 821    }
 822 }
 823
 824 /**
 825  * Emit a gen6 IF statement with the comparison folded into the IF
 826  * instruction.
 827  */
 828 void
 829 vec4_visitor::emit_if_gen6(ir_if *ir)
 830 {
 831    ir_expression *expr = ir->condition->as_expression();
 832
 833    if (expr) {
 834       src_reg op[2];
 835       dst_reg temp;
 836
 837       assert(expr->get_num_operands() <= 2);
 838       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 839          expr->operands[i]->accept(this);
 840          op[i] = this->result;
 841       }
 842
 843       switch (expr->operation) {
 844       case ir_unop_logic_not:
 845          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 846          return;
 847
 848       case ir_binop_logic_xor:
 849          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 850          return;
 851
 852       case ir_binop_logic_or:
 853          temp = dst_reg(this, glsl_type::bool_type);
 854          emit(OR(temp, op[0], op[1]));
 855          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 856          return;
 857
 858       case ir_binop_logic_and:
 859          temp = dst_reg(this, glsl_type::bool_type);
 860          emit(AND(temp, op[0], op[1]));
 861          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 862          return;
 863
 864       case ir_unop_f2b:
 865          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 866          return;
 867
 868       case ir_unop_i2b:
 869          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 870          return;
 871
 872       case ir_binop_greater:
 873       case ir_binop_gequal:
 874       case ir_binop_less:
 875       case ir_binop_lequal:
 876       case ir_binop_equal:
 877       case ir_binop_nequal:
 878          emit(IF(op[0], op[1],
 879                  brw_conditional_for_comparison(expr->operation)));
 880          return;
 881
 882       case ir_binop_all_equal:
 883          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 884          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 885          return;
 886
 887       case ir_binop_any_nequal:
 888          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 889          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 890          return;
 891
 892       case ir_unop_any:
 893          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 894          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 895          return;
 896
 897       default:
 898          assert(!"not reached");
 899          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 900          return;
 901       }
 902       return;
 903    }
 904
 905    ir->condition->accept(this);
 906
 907    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 908 }
 909
 910 static dst_reg
 911 with_writemask(dst_reg const & r, int mask)
 912 {
 913    dst_reg result = r;
 914    result.writemask = mask;
 915    return result;
 916 }
 917
 918 void
 919 vec4_vs_visitor::emit_prolog()
 920 {
 921    dst_reg sign_recovery_shift;
 922    dst_reg normalize_factor;
 923    dst_reg es3_normalize_factor;
 924
 925    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
 926       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
 927          uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
 928          dst_reg reg(ATTR, i);
 929          dst_reg reg_d = reg;
 930          reg_d.type = BRW_REGISTER_TYPE_D;
 931          dst_reg reg_ud = reg;
 932          reg_ud.type = BRW_REGISTER_TYPE_UD;
 933
 934          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 935           * come in as floating point conversions of the integer values.
 936           */
 937          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
 938             dst_reg dst = reg;
 939             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 940             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
 941             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 942          }
 943
 944          /* Do sign recovery for 2101010 formats if required. */
 945          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 946             if (sign_recovery_shift.file == BAD_FILE) {
 947                /* shift constant: <22,22,22,30> */
 948                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
 949                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
 950                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
 951             }
 952
 953             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
 954             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
 955          }
 956
 957          /* Apply BGRA swizzle if required. */
 958          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
 959             src_reg temp = src_reg(reg);
 960             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
 961             emit(MOV(reg, temp));
 962          }
 963
 964          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
 965             /* ES 3.0 has different rules for converting signed normalized
 966              * fixed-point numbers than desktop GL.
 967              */
 968             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
 969                /* According to equation 2.2 of the ES 3.0 specification,
 970                 * signed normalization conversion is done by:
 971                 *
 972                 * f = c / (2^(b-1)-1)
 973                 */
 974                if (es3_normalize_factor.file == BAD_FILE) {
 975                   /* mul constant: 1 / (2^(b-1) - 1) */
 976                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
 977                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
 978                            src_reg(1.0f / ((1<<9) - 1))));
 979                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
 980                            src_reg(1.0f / ((1<<1) - 1))));
 981                }
 982
 983                dst_reg dst = reg;
 984                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 985                emit(MOV(dst, src_reg(reg_d)));
 986                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
 987                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
 988             } else {
 989                /* The following equations are from the OpenGL 3.2 specification:
 990                 *
 991                 * 2.1 unsigned normalization
 992                 * f = c/(2^n-1)
 993                 *
 994                 * 2.2 signed normalization
 995                 * f = (2c+1)/(2^n-1)
 996                 *
 997                 * Both of these share a common divisor, which is represented by
 998                 * "normalize_factor" in the code below.
 999                 */
1000                if (normalize_factor.file == BAD_FILE) {
1001                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
1002                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
1003                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1004                            src_reg(1.0f / ((1<<10) - 1))));
1005                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1006                            src_reg(1.0f / ((1<<2) - 1))));
1007                }
1008
1009                dst_reg dst = reg;
1010                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1011                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1012
1013                /* For signed normalization, we want the numerator to be 2c+1. */
1014                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1015                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1016                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1017                }
1018
1019                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1020             }
1021          }
1022
1023          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1024             dst_reg dst = reg;
1025             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1026             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1027          }
1028       }
1029    }
1030 }
1031
1032
1033 dst_reg *
1034 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1035 {
1036    /* VertexID is stored by the VF as the last vertex element, but
1037     * we don't represent it with a flag in inputs_read, so we call
1038     * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1039     */
1040    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1041    vs_prog_data->uses_vertexid = true;
1042
1043    switch (ir->location) {
1044    case SYSTEM_VALUE_VERTEX_ID:
1045       reg->writemask = WRITEMASK_X;
1046       break;
1047    case SYSTEM_VALUE_INSTANCE_ID:
1048       reg->writemask = WRITEMASK_Y;
1049       break;
1050    default:
1051       assert(!"not reached");
1052       break;
1053    }
1054
1055    return reg;
1056 }
1057
1058
1059 void
1060 vec4_visitor::visit(ir_variable *ir)
1061 {
1062    dst_reg *reg = NULL;
1063
1064    if (variable_storage(ir))
1065       return;
1066
1067    switch (ir->mode) {
1068    case ir_var_shader_in:
1069       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1070       break;
1071
1072    case ir_var_shader_out:
1073       reg = new(mem_ctx) dst_reg(this, ir->type);
1074
1075       for (int i = 0; i < type_size(ir->type); i++) {
1076          output_reg[ir->location + i] = *reg;
1077          output_reg[ir->location + i].reg_offset = i;
1078          output_reg[ir->location + i].type =
1079             brw_type_for_base_type(ir->type->get_scalar_type());
1080          output_reg_annotation[ir->location + i] = ir->name;
1081       }
1082       break;
1083
1084    case ir_var_auto:
1085    case ir_var_temporary:
1086       reg = new(mem_ctx) dst_reg(this, ir->type);
1087       break;
1088
1089    case ir_var_uniform:
1090       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1091
1092       /* Thanks to the lower_ubo_reference pass, we will see only
1093        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1094        * variables, so no need for them to be in variable_ht.
1095        */
1096       if (ir->is_in_uniform_block())
1097          return;
1098
1099       /* Track how big the whole uniform variable is, in case we need to put a
1100        * copy of its data into pull constants for array access.
1101        */
1102       this->uniform_size[this->uniforms] = type_size(ir->type);
1103
1104       if (!strncmp(ir->name, "gl_", 3)) {
1105          setup_builtin_uniform_values(ir);
1106       } else {
1107          setup_uniform_values(ir);
1108       }
1109       break;
1110
1111    case ir_var_system_value:
1112       reg = make_reg_for_system_value(ir);
1113       break;
1114
1115    default:
1116       assert(!"not reached");
1117    }
1118
1119    reg->type = brw_type_for_base_type(ir->type);
1120    hash_table_insert(this->variable_ht, reg, ir);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop *ir)
1125 {
1126    dst_reg counter;
1127
1128    /* We don't want debugging output to print the whole body of the
1129     * loop as the annotation.
1130     */
1131    this->base_ir = NULL;
1132
1133    if (ir->counter != NULL) {
1134       this->base_ir = ir->counter;
1135       ir->counter->accept(this);
1136       counter = *(variable_storage(ir->counter));
1137
1138       if (ir->from != NULL) {
1139          this->base_ir = ir->from;
1140          ir->from->accept(this);
1141
1142          emit(MOV(counter, this->result));
1143       }
1144    }
1145
1146    emit(BRW_OPCODE_DO);
1147
1148    if (ir->to) {
1149       this->base_ir = ir->to;
1150       ir->to->accept(this);
1151
1152       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1153                brw_conditional_for_comparison(ir->cmp)));
1154
1155       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1156       inst->predicate = BRW_PREDICATE_NORMAL;
1157    }
1158
1159    visit_instructions(&ir->body_instructions);
1160
1161
1162    if (ir->increment) {
1163       this->base_ir = ir->increment;
1164       ir->increment->accept(this);
1165       emit(ADD(counter, src_reg(counter), this->result));
1166    }
1167
1168    emit(BRW_OPCODE_WHILE);
1169 }
1170
1171 void
1172 vec4_visitor::visit(ir_loop_jump *ir)
1173 {
1174    switch (ir->mode) {
1175    case ir_loop_jump::jump_break:
1176       emit(BRW_OPCODE_BREAK);
1177       break;
1178    case ir_loop_jump::jump_continue:
1179       emit(BRW_OPCODE_CONTINUE);
1180       break;
1181    }
1182 }
1183
1184
1185 void
1186 vec4_visitor::visit(ir_function_signature *ir)
1187 {
1188    assert(0);
1189    (void)ir;
1190 }
1191
1192 void
1193 vec4_visitor::visit(ir_function *ir)
1194 {
1195    /* Ignore function bodies other than main() -- we shouldn't see calls to
1196     * them since they should all be inlined.
1197     */
1198    if (strcmp(ir->name, "main") == 0) {
1199       const ir_function_signature *sig;
1200       exec_list empty;
1201
1202       sig = ir->matching_signature(&empty);
1203
1204       assert(sig);
1205
1206       visit_instructions(&sig->body);
1207    }
1208 }
1209
1210 bool
1211 vec4_visitor::try_emit_sat(ir_expression *ir)
1212 {
1213    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1214    if (!sat_src)
1215       return false;
1216
1217    sat_src->accept(this);
1218    src_reg src = this->result;
1219
1220    this->result = src_reg(this, ir->type);
1221    vec4_instruction *inst;
1222    inst = emit(MOV(dst_reg(this->result), src));
1223    inst->saturate = true;
1224
1225    return true;
1226 }
1227
1228 bool
1229 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1230 {
1231    /* 3-src instructions were introduced in gen6. */
1232    if (brw->gen < 6)
1233       return false;
1234
1235    /* MAD can only handle floating-point data. */
1236    if (ir->type->base_type != GLSL_TYPE_FLOAT)
1237       return false;
1238
1239    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1240    ir_expression *mul = ir->operands[mul_arg]->as_expression();
1241
1242    if (!mul || mul->operation != ir_binop_mul)
1243       return false;
1244
1245    nonmul->accept(this);
1246    src_reg src0 = fix_3src_operand(this->result);
1247
1248    mul->operands[0]->accept(this);
1249    src_reg src1 = fix_3src_operand(this->result);
1250
1251    mul->operands[1]->accept(this);
1252    src_reg src2 = fix_3src_operand(this->result);
1253
1254    this->result = src_reg(this, ir->type);
1255    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1256
1257    return true;
1258 }
1259
1260 void
1261 vec4_visitor::emit_bool_comparison(unsigned int op,
1262                                  dst_reg dst, src_reg src0, src_reg src1)
1263 {
1264    /* original gen4 does destination conversion before comparison. */
1265    if (brw->gen < 5)
1266       dst.type = src0.type;
1267
1268    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1269
1270    dst.type = BRW_REGISTER_TYPE_D;
1271    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1272 }
1273
1274 void
1275 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1276                           src_reg src0, src_reg src1)
1277 {
1278    vec4_instruction *inst;
1279
1280    if (brw->gen >= 6) {
1281       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282       inst->conditional_mod = conditionalmod;
1283    } else {
1284       emit(CMP(dst, src0, src1, conditionalmod));
1285
1286       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1287       inst->predicate = BRW_PREDICATE_NORMAL;
1288    }
1289 }
1290
1291 static bool
1292 is_16bit_constant(ir_rvalue *rvalue)
1293 {
1294    ir_constant *constant = rvalue->as_constant();
1295    if (!constant)
1296       return false;
1297
1298    if (constant->type != glsl_type::int_type &&
1299        constant->type != glsl_type::uint_type)
1300       return false;
1301
1302    return constant->value.u[0] < (1 << 16);
1303 }
1304
1305 void
1306 vec4_visitor::visit(ir_expression *ir)
1307 {
1308    unsigned int operand;
1309    src_reg op[Elements(ir->operands)];
1310    src_reg result_src;
1311    dst_reg result_dst;
1312    vec4_instruction *inst;
1313
1314    if (try_emit_sat(ir))
1315       return;
1316
1317    if (ir->operation == ir_binop_add) {
1318       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1319          return;
1320    }
1321
1322    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1323       this->result.file = BAD_FILE;
1324       ir->operands[operand]->accept(this);
1325       if (this->result.file == BAD_FILE) {
1326          printf("Failed to get tree for expression operand:\n");
1327          ir->operands[operand]->print();
1328          exit(1);
1329       }
1330       op[operand] = this->result;
1331
1332       /* Matrix expression operands should have been broken down to vector
1333        * operations already.
1334        */
1335       assert(!ir->operands[operand]->type->is_matrix());
1336    }
1337
1338    int vector_elements = ir->operands[0]->type->vector_elements;
1339    if (ir->operands[1]) {
1340       vector_elements = MAX2(vector_elements,
1341                              ir->operands[1]->type->vector_elements);
1342    }
1343
1344    this->result.file = BAD_FILE;
1345
1346    /* Storage for our result.  Ideally for an assignment we'd be using
1347     * the actual storage for the result here, instead.
1348     */
1349    result_src = src_reg(this, ir->type);
1350    /* convenience for the emit functions below. */
1351    result_dst = dst_reg(result_src);
1352    /* If nothing special happens, this is the result. */
1353    this->result = result_src;
1354    /* Limit writes to the channels that will be used by result_src later.
1355     * This does limit this temp's use as a temporary for multi-instruction
1356     * sequences.
1357     */
1358    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1359
1360    switch (ir->operation) {
1361    case ir_unop_logic_not:
1362       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1363        * ones complement of the whole register, not just bit 0.
1364        */
1365       emit(XOR(result_dst, op[0], src_reg(1)));
1366       break;
1367    case ir_unop_neg:
1368       op[0].negate = !op[0].negate;
1369       emit(MOV(result_dst, op[0]));
1370       break;
1371    case ir_unop_abs:
1372       op[0].abs = true;
1373       op[0].negate = false;
1374       emit(MOV(result_dst, op[0]));
1375       break;
1376
1377    case ir_unop_sign:
1378       emit(MOV(result_dst, src_reg(0.0f)));
1379
1380       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1381       inst = emit(MOV(result_dst, src_reg(1.0f)));
1382       inst->predicate = BRW_PREDICATE_NORMAL;
1383
1384       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1385       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1386       inst->predicate = BRW_PREDICATE_NORMAL;
1387
1388       break;
1389
1390    case ir_unop_rcp:
1391       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1392       break;
1393
1394    case ir_unop_exp2:
1395       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1396       break;
1397    case ir_unop_log2:
1398       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1399       break;
1400    case ir_unop_exp:
1401    case ir_unop_log:
1402       assert(!"not reached: should be handled by ir_explog_to_explog2");
1403       break;
1404    case ir_unop_sin:
1405    case ir_unop_sin_reduced:
1406       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1407       break;
1408    case ir_unop_cos:
1409    case ir_unop_cos_reduced:
1410       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1411       break;
1412
1413    case ir_unop_dFdx:
1414    case ir_unop_dFdy:
1415       assert(!"derivatives not valid in vertex shader");
1416       break;
1417
1418    case ir_unop_bitfield_reverse:
1419       emit(BFREV(result_dst, op[0]));
1420       break;
1421    case ir_unop_bit_count:
1422       emit(CBIT(result_dst, op[0]));
1423       break;
1424    case ir_unop_find_msb: {
1425       src_reg temp = src_reg(this, glsl_type::uint_type);
1426
1427       inst = emit(FBH(dst_reg(temp), op[0]));
1428       inst->dst.writemask = WRITEMASK_XYZW;
1429
1430       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1431        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1432        * subtract the result from 31 to convert the MSB count into an LSB count.
1433        */
1434
1435       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1436       temp.swizzle = BRW_SWIZZLE_NOOP;
1437       emit(MOV(result_dst, temp));
1438
1439       src_reg src_tmp = src_reg(result_dst);
1440       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1441
1442       src_tmp.negate = true;
1443       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1444       inst->predicate = BRW_PREDICATE_NORMAL;
1445       break;
1446    }
1447    case ir_unop_find_lsb:
1448       emit(FBL(result_dst, op[0]));
1449       break;
1450
1451    case ir_unop_noise:
1452       assert(!"not reached: should be handled by lower_noise");
1453       break;
1454
1455    case ir_binop_add:
1456       emit(ADD(result_dst, op[0], op[1]));
1457       break;
1458    case ir_binop_sub:
1459       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1460       break;
1461
1462    case ir_binop_mul:
1463       if (ir->type->is_integer()) {
1464          /* For integer multiplication, the MUL uses the low 16 bits of one of
1465           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
1466           * accumulates in the contribution of the upper 16 bits of that
1467           * operand.  If we can determine that one of the args is in the low
1468           * 16 bits, though, we can just emit a single MUL.
1469           */
1470          if (is_16bit_constant(ir->operands[0])) {
1471             if (brw->gen < 7)
1472                emit(MUL(result_dst, op[0], op[1]));
1473             else
1474                emit(MUL(result_dst, op[1], op[0]));
1475          } else if (is_16bit_constant(ir->operands[1])) {
1476             if (brw->gen < 7)
1477                emit(MUL(result_dst, op[1], op[0]));
1478             else
1479                emit(MUL(result_dst, op[0], op[1]));
1480          } else {
1481             struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1482
1483             emit(MUL(acc, op[0], op[1]));
1484             emit(MACH(dst_null_d(), op[0], op[1]));
1485             emit(MOV(result_dst, src_reg(acc)));
1486          }
1487       } else {
1488          emit(MUL(result_dst, op[0], op[1]));
1489       }
1490       break;
1491    case ir_binop_div:
1492       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1493       assert(ir->type->is_integer());
1494       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1495       break;
1496    case ir_binop_mod:
1497       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1498       assert(ir->type->is_integer());
1499       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1500       break;
1501
1502    case ir_binop_less:
1503    case ir_binop_greater:
1504    case ir_binop_lequal:
1505    case ir_binop_gequal:
1506    case ir_binop_equal:
1507    case ir_binop_nequal: {
1508       emit(CMP(result_dst, op[0], op[1],
1509                brw_conditional_for_comparison(ir->operation)));
1510       emit(AND(result_dst, result_src, src_reg(0x1)));
1511       break;
1512    }
1513
1514    case ir_binop_all_equal:
1515       /* "==" operator producing a scalar boolean. */
1516       if (ir->operands[0]->type->is_vector() ||
1517           ir->operands[1]->type->is_vector()) {
1518          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1519          emit(MOV(result_dst, src_reg(0)));
1520          inst = emit(MOV(result_dst, src_reg(1)));
1521          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1522       } else {
1523          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1524          emit(AND(result_dst, result_src, src_reg(0x1)));
1525       }
1526       break;
1527    case ir_binop_any_nequal:
1528       /* "!=" operator producing a scalar boolean. */
1529       if (ir->operands[0]->type->is_vector() ||
1530           ir->operands[1]->type->is_vector()) {
1531          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1532
1533          emit(MOV(result_dst, src_reg(0)));
1534          inst = emit(MOV(result_dst, src_reg(1)));
1535          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1536       } else {
1537          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1538          emit(AND(result_dst, result_src, src_reg(0x1)));
1539       }
1540       break;
1541
1542    case ir_unop_any:
1543       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1544       emit(MOV(result_dst, src_reg(0)));
1545
1546       inst = emit(MOV(result_dst, src_reg(1)));
1547       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1548       break;
1549
1550    case ir_binop_logic_xor:
1551       emit(XOR(result_dst, op[0], op[1]));
1552       break;
1553
1554    case ir_binop_logic_or:
1555       emit(OR(result_dst, op[0], op[1]));
1556       break;
1557
1558    case ir_binop_logic_and:
1559       emit(AND(result_dst, op[0], op[1]));
1560       break;
1561
1562    case ir_binop_dot:
1563       assert(ir->operands[0]->type->is_vector());
1564       assert(ir->operands[0]->type == ir->operands[1]->type);
1565       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1566       break;
1567
1568    case ir_unop_sqrt:
1569       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1570       break;
1571    case ir_unop_rsq:
1572       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1573       break;
1574
1575    case ir_unop_bitcast_i2f:
1576    case ir_unop_bitcast_u2f:
1577       this->result = op[0];
1578       this->result.type = BRW_REGISTER_TYPE_F;
1579       break;
1580
1581    case ir_unop_bitcast_f2i:
1582       this->result = op[0];
1583       this->result.type = BRW_REGISTER_TYPE_D;
1584       break;
1585
1586    case ir_unop_bitcast_f2u:
1587       this->result = op[0];
1588       this->result.type = BRW_REGISTER_TYPE_UD;
1589       break;
1590
1591    case ir_unop_i2f:
1592    case ir_unop_i2u:
1593    case ir_unop_u2i:
1594    case ir_unop_u2f:
1595    case ir_unop_b2f:
1596    case ir_unop_b2i:
1597    case ir_unop_f2i:
1598    case ir_unop_f2u:
1599       emit(MOV(result_dst, op[0]));
1600       break;
1601    case ir_unop_f2b:
1602    case ir_unop_i2b: {
1603       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1604       emit(AND(result_dst, result_src, src_reg(1)));
1605       break;
1606    }
1607
1608    case ir_unop_trunc:
1609       emit(RNDZ(result_dst, op[0]));
1610       break;
1611    case ir_unop_ceil:
1612       op[0].negate = !op[0].negate;
1613       inst = emit(RNDD(result_dst, op[0]));
1614       this->result.negate = true;
1615       break;
1616    case ir_unop_floor:
1617       inst = emit(RNDD(result_dst, op[0]));
1618       break;
1619    case ir_unop_fract:
1620       inst = emit(FRC(result_dst, op[0]));
1621       break;
1622    case ir_unop_round_even:
1623       emit(RNDE(result_dst, op[0]));
1624       break;
1625
1626    case ir_binop_min:
1627       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1628       break;
1629    case ir_binop_max:
1630       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1631       break;
1632
1633    case ir_binop_pow:
1634       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1635       break;
1636
1637    case ir_unop_bit_not:
1638       inst = emit(NOT(result_dst, op[0]));
1639       break;
1640    case ir_binop_bit_and:
1641       inst = emit(AND(result_dst, op[0], op[1]));
1642       break;
1643    case ir_binop_bit_xor:
1644       inst = emit(XOR(result_dst, op[0], op[1]));
1645       break;
1646    case ir_binop_bit_or:
1647       inst = emit(OR(result_dst, op[0], op[1]));
1648       break;
1649
1650    case ir_binop_lshift:
1651       inst = emit(SHL(result_dst, op[0], op[1]));
1652       break;
1653
1654    case ir_binop_rshift:
1655       if (ir->type->base_type == GLSL_TYPE_INT)
1656          inst = emit(ASR(result_dst, op[0], op[1]));
1657       else
1658          inst = emit(SHR(result_dst, op[0], op[1]));
1659       break;
1660
1661    case ir_binop_bfm:
1662       emit(BFI1(result_dst, op[0], op[1]));
1663       break;
1664
1665    case ir_binop_ubo_load: {
1666       ir_constant *uniform_block = ir->operands[0]->as_constant();
1667       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1668       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1669       src_reg offset = op[1];
1670
1671       /* Now, load the vector from that offset. */
1672       assert(ir->type->is_vector() || ir->type->is_scalar());
1673
1674       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1675       packed_consts.type = result.type;
1676       src_reg surf_index =
1677          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1678       if (const_offset_ir) {
1679          offset = src_reg(const_offset / 16);
1680       } else {
1681          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1682       }
1683
1684       vec4_instruction *pull =
1685          emit(new(mem_ctx) vec4_instruction(this,
1686                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1687                                             dst_reg(packed_consts),
1688                                             surf_index,
1689                                             offset));
1690       pull->base_mrf = 14;
1691       pull->mlen = 1;
1692
1693       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1694       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1695                                             const_offset % 16 / 4,
1696                                             const_offset % 16 / 4,
1697                                             const_offset % 16 / 4);
1698
1699       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1700       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1701          emit(CMP(result_dst, packed_consts, src_reg(0u),
1702                   BRW_CONDITIONAL_NZ));
1703          emit(AND(result_dst, result, src_reg(0x1)));
1704       } else {
1705          emit(MOV(result_dst, packed_consts));
1706       }
1707       break;
1708    }
1709
1710    case ir_binop_vector_extract:
1711       assert(!"should have been lowered by vec_index_to_cond_assign");
1712       break;
1713
1714    case ir_triop_lrp:
1715       op[0] = fix_3src_operand(op[0]);
1716       op[1] = fix_3src_operand(op[1]);
1717       op[2] = fix_3src_operand(op[2]);
1718       /* Note that the instruction's argument order is reversed from GLSL
1719        * and the IR.
1720        */
1721       emit(LRP(result_dst, op[2], op[1], op[0]));
1722       break;
1723
1724    case ir_triop_bfi:
1725       op[0] = fix_3src_operand(op[0]);
1726       op[1] = fix_3src_operand(op[1]);
1727       op[2] = fix_3src_operand(op[2]);
1728       emit(BFI2(result_dst, op[0], op[1], op[2]));
1729       break;
1730
1731    case ir_triop_bitfield_extract:
1732       op[0] = fix_3src_operand(op[0]);
1733       op[1] = fix_3src_operand(op[1]);
1734       op[2] = fix_3src_operand(op[2]);
1735       /* Note that the instruction's argument order is reversed from GLSL
1736        * and the IR.
1737        */
1738       emit(BFE(result_dst, op[2], op[1], op[0]));
1739       break;
1740
1741    case ir_triop_vector_insert:
1742       assert(!"should have been lowered by lower_vector_insert");
1743       break;
1744
1745    case ir_quadop_bitfield_insert:
1746       assert(!"not reached: should be handled by "
1747               "bitfield_insert_to_bfm_bfi\n");
1748       break;
1749
1750    case ir_quadop_vector:
1751       assert(!"not reached: should be handled by lower_quadop_vector");
1752       break;
1753
1754    case ir_unop_pack_half_2x16:
1755       emit_pack_half_2x16(result_dst, op[0]);
1756       break;
1757    case ir_unop_unpack_half_2x16:
1758       emit_unpack_half_2x16(result_dst, op[0]);
1759       break;
1760    case ir_unop_pack_snorm_2x16:
1761    case ir_unop_pack_snorm_4x8:
1762    case ir_unop_pack_unorm_2x16:
1763    case ir_unop_pack_unorm_4x8:
1764    case ir_unop_unpack_snorm_2x16:
1765    case ir_unop_unpack_snorm_4x8:
1766    case ir_unop_unpack_unorm_2x16:
1767    case ir_unop_unpack_unorm_4x8:
1768       assert(!"not reached: should be handled by lower_packing_builtins");
1769       break;
1770    case ir_unop_unpack_half_2x16_split_x:
1771    case ir_unop_unpack_half_2x16_split_y:
1772    case ir_binop_pack_half_2x16_split:
1773       assert(!"not reached: should not occur in vertex shader");
1774       break;
1775    }
1776 }
1777
1778
1779 void
1780 vec4_visitor::visit(ir_swizzle *ir)
1781 {
1782    src_reg src;
1783    int i = 0;
1784    int swizzle[4];
1785
1786    /* Note that this is only swizzles in expressions, not those on the left
1787     * hand side of an assignment, which do write masking.  See ir_assignment
1788     * for that.
1789     */
1790
1791    ir->val->accept(this);
1792    src = this->result;
1793    assert(src.file != BAD_FILE);
1794
1795    for (i = 0; i < ir->type->vector_elements; i++) {
1796       switch (i) {
1797       case 0:
1798          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1799          break;
1800       case 1:
1801          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1802          break;
1803       case 2:
1804          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1805          break;
1806       case 3:
1807          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1808             break;
1809       }
1810    }
1811    for (; i < 4; i++) {
1812       /* Replicate the last channel out. */
1813       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1814    }
1815
1816    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1817
1818    this->result = src;
1819 }
1820
1821 void
1822 vec4_visitor::visit(ir_dereference_variable *ir)
1823 {
1824    const struct glsl_type *type = ir->type;
1825    dst_reg *reg = variable_storage(ir->var);
1826
1827    if (!reg) {
1828       fail("Failed to find variable storage for %s\n", ir->var->name);
1829       this->result = src_reg(brw_null_reg());
1830       return;
1831    }
1832
1833    this->result = src_reg(*reg);
1834
1835    /* System values get their swizzle from the dst_reg writemask */
1836    if (ir->var->mode == ir_var_system_value)
1837       return;
1838
1839    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1840       this->result.swizzle = swizzle_for_size(type->vector_elements);
1841 }
1842
1843
1844 int
1845 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1846 {
1847    /* Under normal circumstances array elements are stored consecutively, so
1848     * the stride is equal to the size of the array element.
1849     */
1850    return type_size(ir->type);
1851 }
1852
1853
1854 void
1855 vec4_visitor::visit(ir_dereference_array *ir)
1856 {
1857    ir_constant *constant_index;
1858    src_reg src;
1859    int array_stride = compute_array_stride(ir);
1860
1861    constant_index = ir->array_index->constant_expression_value();
1862
1863    ir->array->accept(this);
1864    src = this->result;
1865
1866    if (constant_index) {
1867       src.reg_offset += constant_index->value.i[0] * array_stride;
1868    } else {
1869       /* Variable index array dereference.  It eats the "vec4" of the
1870        * base of the array and an index that offsets the Mesa register
1871        * index.
1872        */
1873       ir->array_index->accept(this);
1874
1875       src_reg index_reg;
1876
1877       if (array_stride == 1) {
1878          index_reg = this->result;
1879       } else {
1880          index_reg = src_reg(this, glsl_type::int_type);
1881
1882          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1883       }
1884
1885       if (src.reladdr) {
1886          src_reg temp = src_reg(this, glsl_type::int_type);
1887
1888          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1889
1890          index_reg = temp;
1891       }
1892
1893       src.reladdr = ralloc(mem_ctx, src_reg);
1894       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1895    }
1896
1897    /* If the type is smaller than a vec4, replicate the last channel out. */
1898    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1899       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1900    else
1901       src.swizzle = BRW_SWIZZLE_NOOP;
1902    src.type = brw_type_for_base_type(ir->type);
1903
1904    this->result = src;
1905 }
1906
1907 void
1908 vec4_visitor::visit(ir_dereference_record *ir)
1909 {
1910    unsigned int i;
1911    const glsl_type *struct_type = ir->record->type;
1912    int offset = 0;
1913
1914    ir->record->accept(this);
1915
1916    for (i = 0; i < struct_type->length; i++) {
1917       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1918          break;
1919       offset += type_size(struct_type->fields.structure[i].type);
1920    }
1921
1922    /* If the type is smaller than a vec4, replicate the last channel out. */
1923    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1924       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1925    else
1926       this->result.swizzle = BRW_SWIZZLE_NOOP;
1927    this->result.type = brw_type_for_base_type(ir->type);
1928
1929    this->result.reg_offset += offset;
1930 }
1931
1932 /**
1933  * We want to be careful in assignment setup to hit the actual storage
1934  * instead of potentially using a temporary like we might with the
1935  * ir_dereference handler.
1936  */
1937 static dst_reg
1938 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1939 {
1940    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1941     * access of a vector, it must be separated into a series conditional moves
1942     * before reaching this point (see ir_vec_index_to_cond_assign).
1943     */
1944    assert(ir->as_dereference());
1945    ir_dereference_array *deref_array = ir->as_dereference_array();
1946    if (deref_array) {
1947       assert(!deref_array->array->type->is_vector());
1948    }
1949
1950    /* Use the rvalue deref handler for the most part.  We'll ignore
1951     * swizzles in it and write swizzles using writemask, though.
1952     */
1953    ir->accept(v);
1954    return dst_reg(v->result);
1955 }
1956
1957 void
1958 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1959                               const struct glsl_type *type, uint32_t predicate)
1960 {
1961    if (type->base_type == GLSL_TYPE_STRUCT) {
1962       for (unsigned int i = 0; i < type->length; i++) {
1963          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1964       }
1965       return;
1966    }
1967
1968    if (type->is_array()) {
1969       for (unsigned int i = 0; i < type->length; i++) {
1970          emit_block_move(dst, src, type->fields.array, predicate);
1971       }
1972       return;
1973    }
1974
1975    if (type->is_matrix()) {
1976       const struct glsl_type *vec_type;
1977
1978       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1979                                          type->vector_elements, 1);
1980
1981       for (int i = 0; i < type->matrix_columns; i++) {
1982          emit_block_move(dst, src, vec_type, predicate);
1983       }
1984       return;
1985    }
1986
1987    assert(type->is_scalar() || type->is_vector());
1988
1989    dst->type = brw_type_for_base_type(type);
1990    src->type = dst->type;
1991
1992    dst->writemask = (1 << type->vector_elements) - 1;
1993
1994    src->swizzle = swizzle_for_size(type->vector_elements);
1995
1996    vec4_instruction *inst = emit(MOV(*dst, *src));
1997    inst->predicate = predicate;
1998
1999    dst->reg_offset++;
2000    src->reg_offset++;
2001 }
2002
2003
2004 /* If the RHS processing resulted in an instruction generating a
2005  * temporary value, and it would be easy to rewrite the instruction to
2006  * generate its result right into the LHS instead, do so.  This ends
2007  * up reliably removing instructions where it can be tricky to do so
2008  * later without real UD chain information.
2009  */
2010 bool
2011 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2012                                      dst_reg dst,
2013                                      src_reg src,
2014                                      vec4_instruction *pre_rhs_inst,
2015                                      vec4_instruction *last_rhs_inst)
2016 {
2017    /* This could be supported, but it would take more smarts. */
2018    if (ir->condition)
2019       return false;
2020
2021    if (pre_rhs_inst == last_rhs_inst)
2022       return false; /* No instructions generated to work with. */
2023
2024    /* Make sure the last instruction generated our source reg. */
2025    if (src.file != GRF ||
2026        src.file != last_rhs_inst->dst.file ||
2027        src.reg != last_rhs_inst->dst.reg ||
2028        src.reg_offset != last_rhs_inst->dst.reg_offset ||
2029        src.reladdr ||
2030        src.abs ||
2031        src.negate ||
2032        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2033       return false;
2034
2035    /* Check that that last instruction fully initialized the channels
2036     * we want to use, in the order we want to use them.  We could
2037     * potentially reswizzle the operands of many instructions so that
2038     * we could handle out of order channels, but don't yet.
2039     */
2040
2041    for (unsigned i = 0; i < 4; i++) {
2042       if (dst.writemask & (1 << i)) {
2043          if (!(last_rhs_inst->dst.writemask & (1 << i)))
2044             return false;
2045
2046          if (BRW_GET_SWZ(src.swizzle, i) != i)
2047             return false;
2048       }
2049    }
2050
2051    /* Success!  Rewrite the instruction. */
2052    last_rhs_inst->dst.file = dst.file;
2053    last_rhs_inst->dst.reg = dst.reg;
2054    last_rhs_inst->dst.reg_offset = dst.reg_offset;
2055    last_rhs_inst->dst.reladdr = dst.reladdr;
2056    last_rhs_inst->dst.writemask &= dst.writemask;
2057
2058    return true;
2059 }
2060
2061 void
2062 vec4_visitor::visit(ir_assignment *ir)
2063 {
2064    dst_reg dst = get_assignment_lhs(ir->lhs, this);
2065    uint32_t predicate = BRW_PREDICATE_NONE;
2066
2067    if (!ir->lhs->type->is_scalar() &&
2068        !ir->lhs->type->is_vector()) {
2069       ir->rhs->accept(this);
2070       src_reg src = this->result;
2071
2072       if (ir->condition) {
2073          emit_bool_to_cond_code(ir->condition, &predicate);
2074       }
2075
2076       /* emit_block_move doesn't account for swizzles in the source register.
2077        * This should be ok, since the source register is a structure or an
2078        * array, and those can't be swizzled.  But double-check to be sure.
2079        */
2080       assert(src.swizzle ==
2081              (ir->rhs->type->is_matrix()
2082               ? swizzle_for_size(ir->rhs->type->vector_elements)
2083               : BRW_SWIZZLE_NOOP));
2084
2085       emit_block_move(&dst, &src, ir->rhs->type, predicate);
2086       return;
2087    }
2088
2089    /* Now we're down to just a scalar/vector with writemasks. */
2090    int i;
2091
2092    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2093    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2094
2095    ir->rhs->accept(this);
2096
2097    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2098
2099    src_reg src = this->result;
2100
2101    int swizzles[4];
2102    int first_enabled_chan = 0;
2103    int src_chan = 0;
2104
2105    assert(ir->lhs->type->is_vector() ||
2106           ir->lhs->type->is_scalar());
2107    dst.writemask = ir->write_mask;
2108
2109    for (int i = 0; i < 4; i++) {
2110       if (dst.writemask & (1 << i)) {
2111          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2112          break;
2113       }
2114    }
2115
2116    /* Swizzle a small RHS vector into the channels being written.
2117     *
2118     * glsl ir treats write_mask as dictating how many channels are
2119     * present on the RHS while in our instructions we need to make
2120     * those channels appear in the slots of the vec4 they're written to.
2121     */
2122    for (int i = 0; i < 4; i++) {
2123       if (dst.writemask & (1 << i))
2124          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2125       else
2126          swizzles[i] = first_enabled_chan;
2127    }
2128    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2129                               swizzles[2], swizzles[3]);
2130
2131    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2132       return;
2133    }
2134
2135    if (ir->condition) {
2136       emit_bool_to_cond_code(ir->condition, &predicate);
2137    }
2138
2139    for (i = 0; i < type_size(ir->lhs->type); i++) {
2140       vec4_instruction *inst = emit(MOV(dst, src));
2141       inst->predicate = predicate;
2142
2143       dst.reg_offset++;
2144       src.reg_offset++;
2145    }
2146 }
2147
2148 void
2149 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2150 {
2151    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2152       foreach_list(node, &ir->components) {
2153          ir_constant *field_value = (ir_constant *)node;
2154
2155          emit_constant_values(dst, field_value);
2156       }
2157       return;
2158    }
2159
2160    if (ir->type->is_array()) {
2161       for (unsigned int i = 0; i < ir->type->length; i++) {
2162          emit_constant_values(dst, ir->array_elements[i]);
2163       }
2164       return;
2165    }
2166
2167    if (ir->type->is_matrix()) {
2168       for (int i = 0; i < ir->type->matrix_columns; i++) {
2169          float *vec = &ir->value.f[i * ir->type->vector_elements];
2170
2171          for (int j = 0; j < ir->type->vector_elements; j++) {
2172             dst->writemask = 1 << j;
2173             dst->type = BRW_REGISTER_TYPE_F;
2174
2175             emit(MOV(*dst, src_reg(vec[j])));
2176          }
2177          dst->reg_offset++;
2178       }
2179       return;
2180    }
2181
2182    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2183
2184    for (int i = 0; i < ir->type->vector_elements; i++) {
2185       if (!(remaining_writemask & (1 << i)))
2186          continue;
2187
2188       dst->writemask = 1 << i;
2189       dst->type = brw_type_for_base_type(ir->type);
2190
2191       /* Find other components that match the one we're about to
2192        * write.  Emits fewer instructions for things like vec4(0.5,
2193        * 1.5, 1.5, 1.5).
2194        */
2195       for (int j = i + 1; j < ir->type->vector_elements; j++) {
2196          if (ir->type->base_type == GLSL_TYPE_BOOL) {
2197             if (ir->value.b[i] == ir->value.b[j])
2198                dst->writemask |= (1 << j);
2199          } else {
2200             /* u, i, and f storage all line up, so no need for a
2201              * switch case for comparing each type.
2202              */
2203             if (ir->value.u[i] == ir->value.u[j])
2204                dst->writemask |= (1 << j);
2205          }
2206       }
2207
2208       switch (ir->type->base_type) {
2209       case GLSL_TYPE_FLOAT:
2210          emit(MOV(*dst, src_reg(ir->value.f[i])));
2211          break;
2212       case GLSL_TYPE_INT:
2213          emit(MOV(*dst, src_reg(ir->value.i[i])));
2214          break;
2215       case GLSL_TYPE_UINT:
2216          emit(MOV(*dst, src_reg(ir->value.u[i])));
2217          break;
2218       case GLSL_TYPE_BOOL:
2219          emit(MOV(*dst, src_reg(ir->value.b[i])));
2220          break;
2221       default:
2222          assert(!"Non-float/uint/int/bool constant");
2223          break;
2224       }
2225
2226       remaining_writemask &= ~dst->writemask;
2227    }
2228    dst->reg_offset++;
2229 }
2230
2231 void
2232 vec4_visitor::visit(ir_constant *ir)
2233 {
2234    dst_reg dst = dst_reg(this, ir->type);
2235    this->result = src_reg(dst);
2236
2237    emit_constant_values(&dst, ir);
2238 }
2239
2240 void
2241 vec4_visitor::visit(ir_call *ir)
2242 {
2243    assert(!"not reached");
2244 }
2245
2246 void
2247 vec4_visitor::visit(ir_texture *ir)
2248 {
2249    int sampler =
2250       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2251
2252    /* Should be lowered by do_lower_texture_projection */
2253    assert(!ir->projector);
2254
2255    /* Generate code to compute all the subexpression trees.  This has to be
2256     * done before loading any values into MRFs for the sampler message since
2257     * generating these values may involve SEND messages that need the MRFs.
2258     */
2259    src_reg coordinate;
2260    if (ir->coordinate) {
2261       ir->coordinate->accept(this);
2262       coordinate = this->result;
2263    }
2264
2265    src_reg shadow_comparitor;
2266    if (ir->shadow_comparitor) {
2267       ir->shadow_comparitor->accept(this);
2268       shadow_comparitor = this->result;
2269    }
2270
2271    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2272    src_reg lod, dPdx, dPdy, sample_index;
2273    switch (ir->op) {
2274    case ir_tex:
2275       lod = src_reg(0.0f);
2276       lod_type = glsl_type::float_type;
2277       break;
2278    case ir_txf:
2279    case ir_txl:
2280    case ir_txs:
2281       ir->lod_info.lod->accept(this);
2282       lod = this->result;
2283       lod_type = ir->lod_info.lod->type;
2284       break;
2285    case ir_txf_ms:
2286       ir->lod_info.sample_index->accept(this);
2287       sample_index = this->result;
2288       sample_index_type = ir->lod_info.sample_index->type;
2289       break;
2290    case ir_txd:
2291       ir->lod_info.grad.dPdx->accept(this);
2292       dPdx = this->result;
2293
2294       ir->lod_info.grad.dPdy->accept(this);
2295       dPdy = this->result;
2296
2297       lod_type = ir->lod_info.grad.dPdx->type;
2298       break;
2299    case ir_txb:
2300    case ir_lod:
2301       break;
2302    }
2303
2304    vec4_instruction *inst = NULL;
2305    switch (ir->op) {
2306    case ir_tex:
2307    case ir_txl:
2308       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2309       break;
2310    case ir_txd:
2311       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2312       break;
2313    case ir_txf:
2314       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2315       break;
2316    case ir_txf_ms:
2317       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2318       break;
2319    case ir_txs:
2320       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2321       break;
2322    case ir_txb:
2323       assert(!"TXB is not valid for vertex shaders.");
2324       break;
2325    case ir_lod:
2326       assert(!"LOD is not valid for vertex shaders.");
2327       break;
2328    }
2329
2330    bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2331
2332    /* Texel offsets go in the message header; Gen4 also requires headers. */
2333    inst->header_present = use_texture_offset || brw->gen < 5;
2334    inst->base_mrf = 2;
2335    inst->mlen = inst->header_present + 1; /* always at least one */
2336    inst->sampler = sampler;
2337    inst->dst = dst_reg(this, ir->type);
2338    inst->dst.writemask = WRITEMASK_XYZW;
2339    inst->shadow_compare = ir->shadow_comparitor != NULL;
2340
2341    if (use_texture_offset)
2342       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2343
2344    /* MRF for the first parameter */
2345    int param_base = inst->base_mrf + inst->header_present;
2346
2347    if (ir->op == ir_txs) {
2348       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2349       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2350    } else {
2351       int i, coord_mask = 0, zero_mask = 0;
2352       /* Load the coordinate */
2353       /* FINISHME: gl_clamp_mask and saturate */
2354       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2355          coord_mask |= (1 << i);
2356       for (; i < 4; i++)
2357          zero_mask |= (1 << i);
2358
2359       if (ir->offset && ir->op == ir_txf) {
2360          /* It appears that the ld instruction used for txf does its
2361           * address bounds check before adding in the offset.  To work
2362           * around this, just add the integer offset to the integer
2363           * texel coordinate, and don't put the offset in the header.
2364           */
2365          ir_constant *offset = ir->offset->as_constant();
2366          assert(offset);
2367
2368          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2369             src_reg src = coordinate;
2370             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2371                                        BRW_GET_SWZ(src.swizzle, j),
2372                                        BRW_GET_SWZ(src.swizzle, j),
2373                                        BRW_GET_SWZ(src.swizzle, j));
2374             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2375                      src, offset->value.i[j]));
2376          }
2377       } else {
2378          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2379                   coordinate));
2380       }
2381       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2382                src_reg(0)));
2383       /* Load the shadow comparitor */
2384       if (ir->shadow_comparitor && ir->op != ir_txd) {
2385          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2386                           WRITEMASK_X),
2387                   shadow_comparitor));
2388          inst->mlen++;
2389       }
2390
2391       /* Load the LOD info */
2392       if (ir->op == ir_tex || ir->op == ir_txl) {
2393          int mrf, writemask;
2394          if (brw->gen >= 5) {
2395             mrf = param_base + 1;
2396             if (ir->shadow_comparitor) {
2397                writemask = WRITEMASK_Y;
2398                /* mlen already incremented */
2399             } else {
2400                writemask = WRITEMASK_X;
2401                inst->mlen++;
2402             }
2403          } else /* brw->gen == 4 */ {
2404             mrf = param_base;
2405             writemask = WRITEMASK_W;
2406          }
2407          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2408       } else if (ir->op == ir_txf) {
2409          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2410       } else if (ir->op == ir_txf_ms) {
2411          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2412                   sample_index));
2413          inst->mlen++;
2414
2415          /* on Gen7, there is an additional MCS parameter here after SI,
2416           * but we don't bother to emit it since it's always zero. If
2417           * we start supporting texturing from CMS surfaces, this will have
2418           * to change
2419           */
2420       } else if (ir->op == ir_txd) {
2421          const glsl_type *type = lod_type;
2422
2423          if (brw->gen >= 5) {
2424             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2425             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2426             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2427             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2428             inst->mlen++;
2429
2430             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2431                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2432                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2433                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2434                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2435                inst->mlen++;
2436
2437                if (ir->shadow_comparitor) {
2438                   emit(MOV(dst_reg(MRF, param_base + 2,
2439                                    ir->shadow_comparitor->type, WRITEMASK_Z),
2440                            shadow_comparitor));
2441                }
2442             }
2443          } else /* brw->gen == 4 */ {
2444             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2445             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2446             inst->mlen += 2;
2447          }
2448       }
2449    }
2450
2451    emit(inst);
2452
2453    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2454     * spec requires layers.
2455     */
2456    if (ir->op == ir_txs) {
2457       glsl_type const *type = ir->sampler->type;
2458       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2459           type->sampler_array) {
2460          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2461                    with_writemask(inst->dst, WRITEMASK_Z),
2462                    src_reg(inst->dst), src_reg(6));
2463       }
2464    }
2465
2466    swizzle_result(ir, src_reg(inst->dst), sampler);
2467 }
2468
2469 void
2470 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2471 {
2472    int s = key->tex.swizzles[sampler];
2473
2474    this->result = src_reg(this, ir->type);
2475    dst_reg swizzled_result(this->result);
2476
2477    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2478                         || s == SWIZZLE_NOOP) {
2479       emit(MOV(swizzled_result, orig_val));
2480       return;
2481    }
2482
2483    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2484    int swizzle[4] = {0};
2485
2486    for (int i = 0; i < 4; i++) {
2487       switch (GET_SWZ(s, i)) {
2488       case SWIZZLE_ZERO:
2489          zero_mask |= (1 << i);
2490          break;
2491       case SWIZZLE_ONE:
2492          one_mask |= (1 << i);
2493          break;
2494       default:
2495          copy_mask |= (1 << i);
2496          swizzle[i] = GET_SWZ(s, i);
2497          break;
2498       }
2499    }
2500
2501    if (copy_mask) {
2502       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2503       swizzled_result.writemask = copy_mask;
2504       emit(MOV(swizzled_result, orig_val));
2505    }
2506
2507    if (zero_mask) {
2508       swizzled_result.writemask = zero_mask;
2509       emit(MOV(swizzled_result, src_reg(0.0f)));
2510    }
2511
2512    if (one_mask) {
2513       swizzled_result.writemask = one_mask;
2514       emit(MOV(swizzled_result, src_reg(1.0f)));
2515    }
2516 }
2517
2518 void
2519 vec4_visitor::visit(ir_return *ir)
2520 {
2521    assert(!"not reached");
2522 }
2523
2524 void
2525 vec4_visitor::visit(ir_discard *ir)
2526 {
2527    assert(!"not reached");
2528 }
2529
2530 void
2531 vec4_visitor::visit(ir_if *ir)
2532 {
2533    /* Don't point the annotation at the if statement, because then it plus
2534     * the then and else blocks get printed.
2535     */
2536    this->base_ir = ir->condition;
2537
2538    if (brw->gen == 6) {
2539       emit_if_gen6(ir);
2540    } else {
2541       uint32_t predicate;
2542       emit_bool_to_cond_code(ir->condition, &predicate);
2543       emit(IF(predicate));
2544    }
2545
2546    visit_instructions(&ir->then_instructions);
2547
2548    if (!ir->else_instructions.is_empty()) {
2549       this->base_ir = ir->condition;
2550       emit(BRW_OPCODE_ELSE);
2551
2552       visit_instructions(&ir->else_instructions);
2553    }
2554
2555    this->base_ir = ir->condition;
2556    emit(BRW_OPCODE_ENDIF);
2557 }
2558
2559 void
2560 vec4_visitor::visit(ir_emit_vertex *)
2561 {
2562    assert(!"not reached");
2563 }
2564
2565 void
2566 vec4_visitor::visit(ir_end_primitive *)
2567 {
2568    assert(!"not reached");
2569 }
2570
2571 void
2572 vec4_visitor::emit_ndc_computation()
2573 {
2574    /* Get the position */
2575    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2576
2577    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2578    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2579    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2580
2581    current_annotation = "NDC";
2582    dst_reg ndc_w = ndc;
2583    ndc_w.writemask = WRITEMASK_W;
2584    src_reg pos_w = pos;
2585    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2586    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2587
2588    dst_reg ndc_xyz = ndc;
2589    ndc_xyz.writemask = WRITEMASK_XYZ;
2590
2591    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2592 }
2593
2594 void
2595 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2596 {
2597    if (brw->gen < 6 &&
2598        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2599         key->userclip_active || brw->has_negative_rhw_bug)) {
2600       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2601       dst_reg header1_w = header1;
2602       header1_w.writemask = WRITEMASK_W;
2603
2604       emit(MOV(header1, 0u));
2605
2606       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2607          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2608
2609          current_annotation = "Point size";
2610          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2611          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2612       }
2613
2614       if (key->userclip_active) {
2615          current_annotation = "Clipping flags";
2616          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2617          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2618
2619          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2620          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2621          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2622
2623          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2624          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2625          emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2626          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2627       }
2628
2629       /* i965 clipping workaround:
2630        * 1) Test for -ve rhw
2631        * 2) If set,
2632        *      set ndc = (0,0,0,0)
2633        *      set ucp[6] = 1
2634        *
2635        * Later, clipping will detect ucp[6] and ensure the primitive is
2636        * clipped against all fixed planes.
2637        */
2638       if (brw->has_negative_rhw_bug) {
2639          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2640          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2641          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2642          vec4_instruction *inst;
2643          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2644          inst->predicate = BRW_PREDICATE_NORMAL;
2645          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2646          inst->predicate = BRW_PREDICATE_NORMAL;
2647       }
2648
2649       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2650    } else if (brw->gen < 6) {
2651       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2652    } else {
2653       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2654       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2655          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2656                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
2657       }
2658       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2659          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2660                   src_reg(output_reg[VARYING_SLOT_LAYER])));
2661       }
2662    }
2663 }
2664
2665 void
2666 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2667 {
2668    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2669     *
2670     *     "If a linked set of shaders forming the vertex stage contains no
2671     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2672     *     application has requested clipping against user clip planes through
2673     *     the API, then the coordinate written to gl_Position is used for
2674     *     comparison against the user clip planes."
2675     *
2676     * This function is only called if the shader didn't write to
2677     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2678     * if the user wrote to it; otherwise we use gl_Position.
2679     */
2680    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2681    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2682       clip_vertex = VARYING_SLOT_POS;
2683    }
2684
2685    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2686         ++i) {
2687       reg.writemask = 1 << i;
2688       emit(DP4(reg,
2689                src_reg(output_reg[clip_vertex]),
2690                src_reg(this->userplane[i + offset])));
2691    }
2692 }
2693
2694 void
2695 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2696 {
2697    assert (varying < VARYING_SLOT_MAX);
2698    reg.type = output_reg[varying].type;
2699    current_annotation = output_reg_annotation[varying];
2700    /* Copy the register, saturating if necessary */
2701    vec4_instruction *inst = emit(MOV(reg,
2702                                      src_reg(output_reg[varying])));
2703    if ((varying == VARYING_SLOT_COL0 ||
2704         varying == VARYING_SLOT_COL1 ||
2705         varying == VARYING_SLOT_BFC0 ||
2706         varying == VARYING_SLOT_BFC1) &&
2707        key->clamp_vertex_color) {
2708       inst->saturate = true;
2709    }
2710 }
2711
2712 void
2713 vec4_visitor::emit_urb_slot(int mrf, int varying)
2714 {
2715    struct brw_reg hw_reg = brw_message_reg(mrf);
2716    dst_reg reg = dst_reg(MRF, mrf);
2717    reg.type = BRW_REGISTER_TYPE_F;
2718
2719    switch (varying) {
2720    case VARYING_SLOT_PSIZ:
2721       /* PSIZ is always in slot 0, and is coupled with other flags. */
2722       current_annotation = "indices, point width, clip flags";
2723       emit_psiz_and_flags(hw_reg);
2724       break;
2725    case BRW_VARYING_SLOT_NDC:
2726       current_annotation = "NDC";
2727       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2728       break;
2729    case VARYING_SLOT_POS:
2730       current_annotation = "gl_Position";
2731       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2732       break;
2733    case VARYING_SLOT_EDGE:
2734       /* This is present when doing unfilled polygons.  We're supposed to copy
2735        * the edge flag from the user-provided vertex array
2736        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2737        * of that attribute (starts as 1.0f).  This is then used in clipping to
2738        * determine which edges should be drawn as wireframe.
2739        */
2740       current_annotation = "edge flag";
2741       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2742                                     glsl_type::float_type, WRITEMASK_XYZW))));
2743       break;
2744    case BRW_VARYING_SLOT_PAD:
2745       /* No need to write to this slot */
2746       break;
2747    default:
2748       emit_generic_urb_slot(reg, varying);
2749       break;
2750    }
2751 }
2752
2753 static int
2754 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2755 {
2756    if (brw->gen >= 6) {
2757       /* URB data written (does not include the message header reg) must
2758        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2759        * section 5.4.3.2.2: URB_INTERLEAVED.
2760        *
2761        * URB entries are allocated on a multiple of 1024 bits, so an
2762        * extra 128 bits written here to make the end align to 256 is
2763        * no problem.
2764        */
2765       if ((mlen % 2) != 1)
2766          mlen++;
2767    }
2768
2769    return mlen;
2770 }
2771
2772 void
2773 vec4_vs_visitor::emit_urb_write_header(int mrf)
2774 {
2775    /* No need to do anything for VS; an implied write to this MRF will be
2776     * performed by VS_OPCODE_URB_WRITE.
2777     */
2778    (void) mrf;
2779 }
2780
2781 vec4_instruction *
2782 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2783 {
2784    /* For VS, the URB writes end the thread. */
2785    if (complete) {
2786       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2787          emit_shader_time_end();
2788    }
2789
2790    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2791    inst->urb_write_flags = complete ?
2792       BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
2793
2794    return inst;
2795 }
2796
2797 /**
2798  * Generates the VUE payload plus the necessary URB write instructions to
2799  * output it.
2800  *
2801  * The VUE layout is documented in Volume 2a.
2802  */
2803 void
2804 vec4_visitor::emit_vertex()
2805 {
2806    /* MRF 0 is reserved for the debugger, so start with message header
2807     * in MRF 1.
2808     */
2809    int base_mrf = 1;
2810    int mrf = base_mrf;
2811    /* In the process of generating our URB write message contents, we
2812     * may need to unspill a register or load from an array.  Those
2813     * reads would use MRFs 14-15.
2814     */
2815    int max_usable_mrf = 13;
2816
2817    /* The following assertion verifies that max_usable_mrf causes an
2818     * even-numbered amount of URB write data, which will meet gen6's
2819     * requirements for length alignment.
2820     */
2821    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2822
2823    /* First mrf is the g0-based message header containing URB handles and
2824     * such.
2825     */
2826    emit_urb_write_header(mrf++);
2827
2828    if (brw->gen < 6) {
2829       emit_ndc_computation();
2830    }
2831
2832    /* Lower legacy ff and ClipVertex clipping to clip distances */
2833    if (key->userclip_active && !key->uses_clip_distance) {
2834       current_annotation = "user clip distances";
2835
2836       output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2837       output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2838
2839       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2840       emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2841    }
2842
2843    /* Set up the VUE data for the first URB write */
2844    int slot;
2845    for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2846       emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2847
2848       /* If this was max_usable_mrf, we can't fit anything more into this URB
2849        * WRITE.
2850        */
2851       if (mrf > max_usable_mrf) {
2852          slot++;
2853          break;
2854       }
2855    }
2856
2857    bool complete = slot >= prog_data->vue_map.num_slots;
2858    current_annotation = "URB write";
2859    vec4_instruction *inst = emit_urb_write_opcode(complete);
2860    inst->base_mrf = base_mrf;
2861    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2862
2863    /* Optional second URB write */
2864    if (!complete) {
2865       mrf = base_mrf + 1;
2866
2867       for (; slot < prog_data->vue_map.num_slots; ++slot) {
2868          assert(mrf < max_usable_mrf);
2869
2870          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2871       }
2872
2873       current_annotation = "URB write";
2874       inst = emit_urb_write_opcode(true /* complete */);
2875       inst->base_mrf = base_mrf;
2876       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2877       /* URB destination offset.  In the previous write, we got MRFs
2878        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2879        * URB row increments, and each of our MRFs is half of one of
2880        * those, since we're doing interleaved writes.
2881        */
2882       inst->offset = (max_usable_mrf - base_mrf) / 2;
2883    }
2884 }
2885
2886 void
2887 vec4_vs_visitor::emit_thread_end()
2888 {
2889    /* For VS, we always end the thread by emitting a single vertex.
2890     * emit_urb_write_opcode() will take care of setting the eot flag on the
2891     * SEND instruction.
2892     */
2893    emit_vertex();
2894 }
2895
2896 src_reg
2897 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2898                                  src_reg *reladdr, int reg_offset)
2899 {
2900    /* Because we store the values to scratch interleaved like our
2901     * vertex data, we need to scale the vec4 index by 2.
2902     */
2903    int message_header_scale = 2;
2904
2905    /* Pre-gen6, the message header uses byte offsets instead of vec4
2906     * (16-byte) offset units.
2907     */
2908    if (brw->gen < 6)
2909       message_header_scale *= 16;
2910
2911    if (reladdr) {
2912       src_reg index = src_reg(this, glsl_type::int_type);
2913
2914       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2915       emit_before(inst, MUL(dst_reg(index),
2916                             index, src_reg(message_header_scale)));
2917
2918       return index;
2919    } else {
2920       return src_reg(reg_offset * message_header_scale);
2921    }
2922 }
2923
2924 src_reg
2925 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2926                                        src_reg *reladdr, int reg_offset)
2927 {
2928    if (reladdr) {
2929       src_reg index = src_reg(this, glsl_type::int_type);
2930
2931       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2932
2933       /* Pre-gen6, the message header uses byte offsets instead of vec4
2934        * (16-byte) offset units.
2935        */
2936       if (brw->gen < 6) {
2937          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2938       }
2939
2940       return index;
2941    } else {
2942       int message_header_scale = brw->gen < 6 ? 16 : 1;
2943       return src_reg(reg_offset * message_header_scale);
2944    }
2945 }
2946
2947 /**
2948  * Emits an instruction before @inst to load the value named by @orig_src
2949  * from scratch space at @base_offset to @temp.
2950  *
2951  * @base_offset is measured in 32-byte units (the size of a register).
2952  */
2953 void
2954 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2955                                 dst_reg temp, src_reg orig_src,
2956                                 int base_offset)
2957 {
2958    int reg_offset = base_offset + orig_src.reg_offset;
2959    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2960
2961    emit_before(inst, SCRATCH_READ(temp, index));
2962 }
2963
2964 /**
2965  * Emits an instruction after @inst to store the value to be written
2966  * to @orig_dst to scratch space at @base_offset, from @temp.
2967  *
2968  * @base_offset is measured in 32-byte units (the size of a register).
2969  */
2970 void
2971 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2972 {
2973    int reg_offset = base_offset + inst->dst.reg_offset;
2974    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2975
2976    /* Create a temporary register to store *inst's result in.
2977     *
2978     * We have to be careful in MOVing from our temporary result register in
2979     * the scratch write.  If we swizzle from channels of the temporary that
2980     * weren't initialized, it will confuse live interval analysis, which will
2981     * make spilling fail to make progress.
2982     */
2983    src_reg temp = src_reg(this, glsl_type::vec4_type);
2984    temp.type = inst->dst.type;
2985    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2986    int swizzles[4];
2987    for (int i = 0; i < 4; i++)
2988       if (inst->dst.writemask & (1 << i))
2989          swizzles[i] = i;
2990       else
2991          swizzles[i] = first_writemask_chan;
2992    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2993                                swizzles[2], swizzles[3]);
2994
2995    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2996                                        inst->dst.writemask));
2997    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2998    write->predicate = inst->predicate;
2999    write->ir = inst->ir;
3000    write->annotation = inst->annotation;
3001    inst->insert_after(write);
3002
3003    inst->dst.file = temp.file;
3004    inst->dst.reg = temp.reg;
3005    inst->dst.reg_offset = temp.reg_offset;
3006    inst->dst.reladdr = NULL;
3007 }
3008
3009 /**
3010  * We can't generally support array access in GRF space, because a
3011  * single instruction's destination can only span 2 contiguous
3012  * registers.  So, we send all GRF arrays that get variable index
3013  * access to scratch space.
3014  */
3015 void
3016 vec4_visitor::move_grf_array_access_to_scratch()
3017 {
3018    int scratch_loc[this->virtual_grf_count];
3019
3020    for (int i = 0; i < this->virtual_grf_count; i++) {
3021       scratch_loc[i] = -1;
3022    }
3023
3024    /* First, calculate the set of virtual GRFs that need to be punted
3025     * to scratch due to having any array access on them, and where in
3026     * scratch.
3027     */
3028    foreach_list(node, &this->instructions) {
3029       vec4_instruction *inst = (vec4_instruction *)node;
3030
3031       if (inst->dst.file == GRF && inst->dst.reladdr &&
3032           scratch_loc[inst->dst.reg] == -1) {
3033          scratch_loc[inst->dst.reg] = c->last_scratch;
3034          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3035       }
3036
3037       for (int i = 0 ; i < 3; i++) {
3038          src_reg *src = &inst->src[i];
3039
3040          if (src->file == GRF && src->reladdr &&
3041              scratch_loc[src->reg] == -1) {
3042             scratch_loc[src->reg] = c->last_scratch;
3043             c->last_scratch += this->virtual_grf_sizes[src->reg];
3044          }
3045       }
3046    }
3047
3048    /* Now, for anything that will be accessed through scratch, rewrite
3049     * it to load/store.  Note that this is a _safe list walk, because
3050     * we may generate a new scratch_write instruction after the one
3051     * we're processing.
3052     */
3053    foreach_list_safe(node, &this->instructions) {
3054       vec4_instruction *inst = (vec4_instruction *)node;
3055
3056       /* Set up the annotation tracking for new generated instructions. */
3057       base_ir = inst->ir;
3058       current_annotation = inst->annotation;
3059
3060       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3061          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3062       }
3063
3064       for (int i = 0 ; i < 3; i++) {
3065          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3066             continue;
3067
3068          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3069
3070          emit_scratch_read(inst, temp, inst->src[i],
3071                            scratch_loc[inst->src[i].reg]);
3072
3073          inst->src[i].file = temp.file;
3074          inst->src[i].reg = temp.reg;
3075          inst->src[i].reg_offset = temp.reg_offset;
3076          inst->src[i].reladdr = NULL;
3077       }
3078    }
3079 }
3080
3081 /**
3082  * Emits an instruction before @inst to load the value named by @orig_src
3083  * from the pull constant buffer (surface) at @base_offset to @temp.
3084  */
3085 void
3086 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3087                                       dst_reg temp, src_reg orig_src,
3088                                       int base_offset)
3089 {
3090    int reg_offset = base_offset + orig_src.reg_offset;
3091    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
3092    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3093    vec4_instruction *load;
3094
3095    if (brw->gen >= 7) {
3096       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3097       grf_offset.type = offset.type;
3098       emit_before(inst, MOV(grf_offset, offset));
3099
3100       load = new(mem_ctx) vec4_instruction(this,
3101                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3102                                            temp, index, src_reg(grf_offset));
3103    } else {
3104       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3105                                            temp, index, offset);
3106       load->base_mrf = 14;
3107       load->mlen = 1;
3108    }
3109    emit_before(inst, load);
3110 }
3111
3112 /**
3113  * Implements array access of uniforms by inserting a
3114  * PULL_CONSTANT_LOAD instruction.
3115  *
3116  * Unlike temporary GRF array access (where we don't support it due to
3117  * the difficulty of doing relative addressing on instruction
3118  * destinations), we could potentially do array access of uniforms
3119  * that were loaded in GRF space as push constants.  In real-world
3120  * usage we've seen, though, the arrays being used are always larger
3121  * than we could load as push constants, so just always move all
3122  * uniform array access out to a pull constant buffer.
3123  */
3124 void
3125 vec4_visitor::move_uniform_array_access_to_pull_constants()
3126 {
3127    int pull_constant_loc[this->uniforms];
3128
3129    for (int i = 0; i < this->uniforms; i++) {
3130       pull_constant_loc[i] = -1;
3131    }
3132
3133    /* Walk through and find array access of uniforms.  Put a copy of that
3134     * uniform in the pull constant buffer.
3135     *
3136     * Note that we don't move constant-indexed accesses to arrays.  No
3137     * testing has been done of the performance impact of this choice.
3138     */
3139    foreach_list_safe(node, &this->instructions) {
3140       vec4_instruction *inst = (vec4_instruction *)node;
3141
3142       for (int i = 0 ; i < 3; i++) {
3143          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3144             continue;
3145
3146          int uniform = inst->src[i].reg;
3147
3148          /* If this array isn't already present in the pull constant buffer,
3149           * add it.
3150           */
3151          if (pull_constant_loc[uniform] == -1) {
3152             const float **values = &prog_data->param[uniform * 4];
3153
3154             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3155
3156             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3157                prog_data->pull_param[prog_data->nr_pull_params++]
3158                   = values[j];
3159             }
3160          }
3161
3162          /* Set up the annotation tracking for new generated instructions. */
3163          base_ir = inst->ir;
3164          current_annotation = inst->annotation;
3165
3166          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3167
3168          emit_pull_constant_load(inst, temp, inst->src[i],
3169                                  pull_constant_loc[uniform]);
3170
3171          inst->src[i].file = temp.file;
3172          inst->src[i].reg = temp.reg;
3173          inst->src[i].reg_offset = temp.reg_offset;
3174          inst->src[i].reladdr = NULL;
3175       }
3176    }
3177
3178    /* Now there are no accesses of the UNIFORM file with a reladdr, so
3179     * no need to track them as larger-than-vec4 objects.  This will be
3180     * relied on in cutting out unused uniform vectors from push
3181     * constants.
3182     */
3183    split_uniform_registers();
3184 }
3185
3186 void
3187 vec4_visitor::resolve_ud_negate(src_reg *reg)
3188 {
3189    if (reg->type != BRW_REGISTER_TYPE_UD ||
3190        !reg->negate)
3191       return;
3192
3193    src_reg temp = src_reg(this, glsl_type::uvec4_type);
3194    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3195    *reg = temp;
3196 }
3197
3198 vec4_visitor::vec4_visitor(struct brw_context *brw,
3199                            struct brw_vec4_compile *c,
3200                            struct gl_program *prog,
3201                            const struct brw_vec4_prog_key *key,
3202                            struct brw_vec4_prog_data *prog_data,
3203                            struct gl_shader_program *shader_prog,
3204                            struct brw_shader *shader,
3205                            void *mem_ctx,
3206                            bool debug_flag)
3207    : debug_flag(debug_flag)
3208 {
3209    this->brw = brw;
3210    this->ctx = &brw->ctx;
3211    this->shader_prog = shader_prog;
3212    this->shader = shader;
3213
3214    this->mem_ctx = mem_ctx;
3215    this->failed = false;
3216
3217    this->base_ir = NULL;
3218    this->current_annotation = NULL;
3219    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3220
3221    this->c = c;
3222    this->prog = prog;
3223    this->key = key;
3224    this->prog_data = prog_data;
3225
3226    this->variable_ht = hash_table_ctor(0,
3227                                        hash_table_pointer_hash,
3228                                        hash_table_pointer_compare);
3229
3230    this->virtual_grf_start = NULL;
3231    this->virtual_grf_end = NULL;
3232    this->virtual_grf_sizes = NULL;
3233    this->virtual_grf_count = 0;
3234    this->virtual_grf_reg_map = NULL;
3235    this->virtual_grf_reg_count = 0;
3236    this->virtual_grf_array_size = 0;
3237    this->live_intervals_valid = false;
3238
3239    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3240
3241    this->uniforms = 0;
3242 }
3243
3244 vec4_visitor::~vec4_visitor()
3245 {
3246    hash_table_dtor(this->variable_ht);
3247 }
3248
3249
3250 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3251                                  struct brw_vs_compile *vs_compile,
3252                                  struct brw_vs_prog_data *vs_prog_data,
3253                                  struct gl_shader_program *prog,
3254                                  struct brw_shader *shader,
3255                                  void *mem_ctx)
3256    : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3257                   &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3258                   mem_ctx, INTEL_DEBUG & DEBUG_VS),
3259      vs_compile(vs_compile),
3260      vs_prog_data(vs_prog_data)
3261 {
3262 }
3263
3264
3265 void
3266 vec4_visitor::fail(const char *format, ...)
3267 {
3268    va_list va;
3269    char *msg;
3270
3271    if (failed)
3272       return;
3273
3274    failed = true;
3275
3276    va_start(va, format);
3277    msg = ralloc_vasprintf(mem_ctx, format, va);
3278    va_end(va);
3279    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3280
3281    this->fail_msg = msg;
3282
3283    if (debug_flag) {
3284       fprintf(stderr, "%s",  msg);
3285    }
3286 }
3287
3288 } /* namespace brw */